Commit 633b079f authored by Alka Nixon

Merge branch qos-scheduler:main into main

parents aff242e9 6a1c98f8
Branches main
Tags v1.1.1
Showing 1696 additions and 599 deletions
# SPDX-FileCopyrightText: 2024 Siemens AG
# SPDX-License-Identifier: Apache-2.0
*.sh text eol=lf
qos-scheduler/.devcontainer/Dockerfile eol=lf
......@@ -4,13 +4,15 @@
# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/go/.devcontainer/base.Dockerfile
# [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.19, 1.18, 1-bullseye, 1.19-bullseye, 1.18-bullseye, 1-buster, 1.19-buster, 1.18-buster
ARG VARIANT="1.19-bullseye"
ARG VARIANT="1.21-bullseye"
ARG GOOS="linux"
ARG GOARCH="amd64"
FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT}
ARG ENVTEST_K8S_VERSION="1.26.1"
FROM mcr.microsoft.com/vscode/devcontainers/go:${VARIANT}
ENV GOOS=$GOOS
ENV GOARCH=$GOARCH
ENV ENVTEST_K8S_VERSION=$ENVTEST_K8S_VERSION
# [Choice] Node.js version: none, lts/*, 18, 16, 14
ARG NODE_VERSION="none"
......@@ -18,7 +20,7 @@ RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/
# [Optional] Uncomment this section to install additional OS packages.
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends cmake
&& apt-get -y install --no-install-recommends cmake yq
# Install kubebuilder
RUN curl -sL -o kubebuilder https://go.kubebuilder.io/dl/latest/$(go env GOOS)/$(go env GOARCH) \
......@@ -40,6 +42,14 @@ chmod 755 -R /usr/local/include
rm -f protoc.zip
EOF
# install kind
RUN <<EOF
#!/bin/bash
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-$(go env GOARCH)
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
EOF
# [Optional] Uncomment the next lines to use go get to install anything else you need
USER vscode
# RUN go get -x <your-dependency-or-tool>
......@@ -48,7 +58,18 @@ RUN go install github.com/onsi/ginkgo/v2/ginkgo@v2.9.2
RUN go install github.com/onsi/gomega/...
RUN go install github.com/golang/mock/mockgen@v1.6.0
RUN go install sigs.k8s.io/kustomize/kustomize/v4@latest
RUN go install github.com/jstemmer/go-junit-report/v2@latest
# [Optional] Uncomment this line to install global node packages.
# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g <your-package-here>" 2>&1
# install envtest
WORKDIR /envtest
RUN v="0.18.5" ; export GOBIN=/envtest ;\
path=controller-runtime/archive/refs/tags/v${v}.tar.gz ; \
curl -sL https://github.com/kubernetes-sigs/${path} | tar -xz ;\
cd controller-runtime-${v}/tools/setup-envtest ;\
go install . ;\
cd - ; rm -rf controller-runtime-${v} ;\
strip setup-envtest ;\
./setup-envtest use ${ENVTEST_K8S_VERSION}
......@@ -8,14 +8,14 @@
// Update the VARIANT arg to pick a version of Go: 1, 1.19, 1.18
// Append -bullseye or -buster to pin to an OS version.
// Use -bullseye variants on local arm64/Apple Silicon.
"VARIANT": "1.20-bullseye",
"VARIANT": "1.22-bookworm",
// Options
"NODE_VERSION": "lts/*",
"GOOS": "linux",
"GOARCH": "arm64"
"ENVTEST_K8S_VERSION": "1.26.1",
"GOOS": "linux"
}
},
"runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--network=host" ],
"runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--network=host", "--privileged" ],
// Configure tool-specific properties.
"customizations": {
......@@ -27,20 +27,22 @@
"go.useLanguageServer": true,
"go.gopath": "/go",
"go.toolsEnvVars": {
"GOOS": "linux",
"GOARCH": "arm64"
"GOOS": "linux"
}
},
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"golang.Go"
"golang.Go",
"ms-azuretools.vscode-docker",
"ms-kubernetes-tools.vscode-kubernetes-tools",
"ms-kubernetes-tools.kind-vscode"
]
}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
"forwardPorts": [6443],
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "go version",
......@@ -50,10 +52,10 @@
// activate docker host access
"features": {
"ghcr.io/devcontainers/features/docker-from-docker:1": {
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {
"version": "latest",
"moby": true,
"dockerDashComposeVersion": "v1"
"dockerDashComposeVersion": "v2"
},
"ghcr.io/devcontainers/features/kubectl-helm-minikube:1": {
"version": "latest",
......@@ -64,7 +66,8 @@
// additional mounts
"mounts": [
"source=${localEnv:HOME}${localEnv:USERPROFILE}/.kube,target=/home/vscode/.kube,type=bind,consistency=cached"
"source=${localEnv:HOME}${localEnv:USERPROFILE}/.kube,target=/home/vscode/.kube,type=bind,consistency=cached",
"source=${localEnv:HOME}${localEnv:USERPROFILE}/.ssh,target=/home/vscode/.ssh,type=bind,consistency=cached,readonly"
],
"containerEnv": {
......
# SPDX-FileCopyrightText: 2024 Siemens AG
# SPDX-License-Identifier: Apache-2.0
*.sh text eol=lf
qos-scheduler/.devcontainer/Dockerfile eol=lf
......@@ -22,9 +22,21 @@ reuse-report.txt
deliv/**
tmp/**
z???????.mk
/coverage
vscode.mk
tmp/**
deliv/**
*~
./*.mk
coverage
coverage-out
build/ci/script/iid.txt
# Intellij Idea
.idea
.run
scheduler/testdata/optrequest.json
scheduler/testdata/qosmodel.json
scheduler/testdata/qosmodel.save
scheduler/testdata/qosmodel.txt
scheduler/testdata/semiotics.json
scheduler/testdata/semiotics.save
This diff is collapsed.
......@@ -26,6 +26,7 @@ License: Apache-2.0
## Generated code for Protobuf
Files:
scheduler/assignment/optimizer*.pb.go
scheduler/api/v1/groupversion_info.go
scheduler/api/v1alpha1/groupversion_info.go
Copyright: 2023 Siemens AG
License: Apache-2.0
......@@ -33,8 +34,13 @@ License: Apache-2.0
## Files with version numbers
Files:
scheduler/.go-version
container/goenv/version
VERSION
build/ci/script/VERSION
build/ci/goenv/VERSION
build/ci/reuse/VERSION
build/ci/helm/VERSION
build/ci/manifest/VERSION
build/ci/shellcheck/VERSION
Copyright: 2023 Siemens AG
License: Apache-2.0
......
<!---
SPDX-FileCopyrightText: 2023 Siemens AG
SPDX-License-Identifier: CC-BY-SA-4.0
-->
# Quick Start
Steps for getting a demo to run on a local Linux Helm environment and a local
KiND cluster.
## Preconditions
- Docker
- kubectl
- Helm
- KinD
- make
## Ensure access to container registry
- Enable access to Docker hub
```bash
docker login (=> enter username and password or access token)
```
- Test registry access (optional)
```bash
REGISTRY_PREFIX=hecodeco/swm-
REPO_HOST=gitlab.eclipse.org
REPO_GROUP=/eclipse-research-labs/codeco-project/scheduling-and-workload-migration-swm/
REPO_URL=https://${REPO_HOST}${REPO_GROUP}
docker pull ${REGISTRY_PREFIX}controller:1.2.0
```
## Copy required scripts to local environment
- One way is to clone the SWM repository
```bash
cd <qos-scheduler-directory>
git clone ${REPO_URL}qos-scheduler.git
```
**Attention:** If this is done on Windows (using Visual Studio Code - VSC),
depending on the VSC settings, text files may be checked out with CRLF
(Carriage Return Line Feed) line endings (the "Windows way"). Shell scripts
with CRLF endings will not execute or will throw errors. To avoid this:
- Make sure the default VSC setting is to use LF instead of CRLF, or
- For each shell script, open it in VSC and change the End of Line Sequence
to "LF" (shown in the blue bar in the lower right corner of the VSC window
when the file is selected); a normalization sketch from the shell follows
below.
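If some scripts were already checked out with CRLF endings, a minimal
normalization sketch from a Linux shell (assuming `find` and `sed` are
available; `dos2unix` works just as well if installed):
```bash
# Strip the carriage return at the end of each line of every shell script.
find . -name "*.sh" -type f -exec sed -i 's/\r$//' {} +
```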
# Install K8s cluster using KinD
- In local shell:
```bash
cd <qos-scheduler-directory>
export SED="sed"
export STAT="stat"
./start_cluster.sh
```
- This will install a KinD cluster with one master and two worker nodes. The
K8s config to access the cluster will be appended to ~/.kube/config and in
case there are multiple clusters in the config file, the context (of
kubectl) will be switched to the new KinD cluster
- Check whether cluster is working and whether your K8s config is pointing to
the right cluster
```bash
kubectl get nodes
```
This should output something like this:
```text
NAME STATUS ROLES AGE VERSION
c1 Ready <none> 118s v1.23.1
c2 Ready <none> 118s v1.23.1
kind-control-plane Ready control-plane,master 2m31s v1.23.1
```
# Install QoS scheduler
## Install QoS Scheduler and Solver
- In local shell:
```bash
make chart
helm install qostest --namespace=he-codeco-swm --create-namespace tmp/helm
```
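Before continuing, it is worth checking that the release came up (a sketch;
the exact pod names depend on the chart, but the controller and optimizer
pods should appear in the namespace used above):
```bash
helm list --namespace=he-codeco-swm
kubectl get pods --namespace=he-codeco-swm
```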
- Show network topology/ network links (Custom Resources)
```bash
kubectl get networklinks -A
```
This should show you links in the network-k8s-namespace namespace.
- Show network paths (Custom Resources)
```bash
kubectl get networkpaths -A
```
This should show you paths in the same namespace.
# Deploy sample ApplicationGroup and Application
- In local shell:
```bash
kubectl apply -f config/demo/sample-topology.yaml
```
This will create a sample network topology. If you are running a topology
operator in the network-demo-namespace namespace (true if you are using the
latest Helm chart without modifications), you should see the network links
and paths in this namespace soon.
```bash
kubectl get networklinks -A
```
```bash
kubectl get networkpaths -A
```
When the network links and paths you need are there:
```bash
kubectl apply -f config/demo/applicationgroup.yaml
kubectl apply -f config/demo/app-besteffort.yaml
```
- This will create an *ApplicationGroup* that requires a minimum of one
*Application*, and the Application app-besteffort, which consists of
- Two *Workloads* w1 and w2
- each Workload has a container wbitt/network-multitool (which has a
couple of networking tools to test and demonstrate communication)
- *Channel* "ernie" from w1 to w2 (5Mbit/s, 150µs)
- *Channel* with a generated name (app-besteffort-w2-to-app-besteffort-w1)
from w2 to w1 (2Mbit/s, 200µs)
*Channel* "ernie" requests the *BESTEFFORT* network service class, which the
default Helm chart maps to the *k8s* network (which gets created
automatically).
The other channel requests the *BESTEFFORT* network service class, which the
default Helm chart implements using the *k8s* network.
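To see where the two workloads end up, watch the pods come up together with
their node assignments (a sketch; the demo manifests are assumed to create
the pods in your current namespace):
```bash
# -o wide includes the node each pod was assigned to.
kubectl get pods -o wide -w
```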
## Show optimizer call and output
- Once the *ApplicationGroup* is complete, the QoS-Scheduler controller
collects all inputs related to *Workloads* and *Channels* and calls the
WorkloadPlacementSolver (Pod name: optimizer) with this information,
together with information about the infrastructure (compute and network)
- The WorkloadPlacementSolver then calculates and returns a placement for all
Pods of the *ApplicationGroup*
- Look into the logs of the Optimizer Pod (e.g. via Lens, or with kubectl as
sketched below)
- The output of the WorkloadPlacementSolver (assignment of Pods to Nodes) is
written to a custom resource *AssignmentPlan* (Lens: Custom
Resources/qos-scheduler.siemens.com/AssignmentPlan)
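Without Lens, the same information can be inspected with kubectl (a sketch;
`assignmentplans` and the `optimizer` deployment name are assumptions based
on the resource and Pod names mentioned above):
```bash
# Placement calculated by the WorkloadPlacementSolver.
kubectl get assignmentplans -A
# Logs of the optimizer (deployment name assumed to be "optimizer").
kubectl logs --namespace=he-codeco-swm deployment/optimizer
```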
## Show Channels
- Show *Channels* (Custom Resource)
```bash
kubectl get channels
```
- or use Lens: Custom Resources/qos-scheduler.siemens.com/Channels
- One custom resource Channel is created per *Channel* that connects
*Workloads* placed on different nodes, and it corresponds to the channel
being set up
- Services (svc): Each *Channel* also gets a K8s Service with the name
"svc-\<channel-name>-\<namespace>". This can be used to address the Pods
behind this channel using a DNS name.
```bash
kubectl get svc
```
If no name was specified in the deployment file, a name is generated:
"svc-\<src-applicationname>-\<src-workloadname>\-\<tgt-applicationname>-\<tgt-workloadname>-\<namespace>"
(a connectivity check through such a Service is sketched after this list).
- Endpoint (ep): IP address behind a *Channel*. This is what the corresponding
Service resolves to.
```bash
kubectl get ep
```
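Because the workload containers are wbitt/network-multitool images, you can
verify connectivity through a channel's Service from inside one of the pods.
A sketch with illustrative names (look up the actual pod name with
`kubectl get pods` and the Service name with `kubectl get svc` first):
```bash
# Substitute the w1 pod name; svc-ernie-default follows the
# svc-<channel-name>-<namespace> pattern described above.
W1_POD=<w1-pod-name>
kubectl exec "$W1_POD" -- nslookup svc-ernie-default
kubectl exec "$W1_POD" -- ping -c 3 svc-ernie-default
```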
# Uninstall ApplicationGroup and Applications
- Either, in Lens, click on the *ApplicationGroup* -> Delete, or
```bash
kubectl delete -f config/demo/applicationgroup.yaml
```
- This will delete all *Applications* in the *ApplicationGroup* and all
dependent resources
# Uninstall QoS Scheduler and Optimizer
- In Lens:
- Helm/Releases/qostest => delete
- In a local shell
```bash
helm list --namespace=he-codeco-swm
helm uninstall --namespace=he-codeco-swm qostest
```
- Delete the QoS-Scheduler CRDs (not uninstalled by the helm chart).
_Attention:_ the command below deletes all CRDs in the cluster, including
ones not installed by this chart; see the sketch after this block for a more
targeted alternative
```bash
kubectl delete crds --all
```
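`kubectl delete crds --all` removes every CRD in the cluster, not only the
ones from this chart. A more targeted sketch, assuming all QoS-Scheduler CRDs
belong to the `qos-scheduler.siemens.com` API group referenced above:
```bash
# Delete only the CRDs in the qos-scheduler.siemens.com group.
kubectl get crds -o name | grep 'qos-scheduler.siemens.com' | xargs -r kubectl delete
```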
# Uninstall KinD cluster
- In a local shell
```bash
kind delete cluster
```
<!---
SPDX-FileCopyrightText: 2024 Siemens AG
SPDX-License-Identifier: CC-BY-SA-4.0
-->
# ADR000 - Title
Creation date: YYYY-MM-DD
## Motivation/ Problem Statement
...
## Alternatives and Evaluation
### A. Alternative A
Description
#### Advantages
- ...
#### Disadvantages
- ...
### B. Alternative B
Description
#### Advantages
- ...
#### Disadvantages
- ...
## Decision
...
## Consequences
...
<!---
SPDX-FileCopyrightText: 2024 Siemens AG
SPDX-License-Identifier: CC-BY-SA-4.0
-->
# ADR100 - Rescheduling result
Creation date: 2024-08-15
## Motivation/ Problem Statement
- Situation: The QoS-Scheduler calls the Solver/Optimizer to perform a re-scheduling, i.e. there is already an existing placement. The Solver/Optimizer calculates a new placement, which may be different from the existing one.
- Problem: Shall the Solver/Optimizer return the complete new placement or only the delta to the existing placement?
## Alternatives and Evaluation
### A. Return complete new placement
The Solver/Optimizer always returns the complete new placement.
#### Advantages
- No deviation is possible; the Solver/Optimizer response explicitly contains the intended state
#### Disadvantages
- Not compact; the complete placement must always be returned, even if there are few or no changes
### B. Return only delta of new placement to existing placement
The Solver/Optimizer only returns the changed placements:
- For workloads with changed assignment: workload, new node (empty if no longer assigned), old node
- For channels with changed assignment: channel, new path (empty if no longer assigned), old path
#### Advantages
- Compact return value (only deltas)
- The QoS-Scheduler does not need to compare the new placement with the existing placement to detect changes; it can work directly with the returned delta placement
#### Disadvantages
- Transmitting only the delta could lead to error accumulation. This is not critical, however: if there are errors, the actual state (including the error) is transmitted to the Solver/Optimizer (which is stateless) with the next placement call, so errors cannot accumulate.
## Decision
Alternative B. Return only delta
## Consequences
- Implement accordingly in Solver, Optimizer, and QoS-Scheduler
- The gRPC interface is already fit for this solution (but update the documentation/comments)
......@@ -74,9 +74,9 @@ list should look as follows:
isPhysical: true
namespace: TSN
image:
repository: networkoperator
repository: network-topology
tag: network-namespaces
command: /bin/topology-nw-operator
command: /bin/network-topology
```
Each operator specifies the network implementations it will process, and
......
......@@ -12,7 +12,7 @@ gitlab pipeline, among them:
2. `optimizer` contains the optimizer client and a basic server stub in Go.
3. `network` is base classes and sample network operator implementations
4. `controller` is the controller-manager for our custom resources.
5. `nodedaemon` gets deployed to the node daemon set, for information about compute nodes.
5. `node-daemon` gets deployed to the node daemon set, for information about compute nodes.
When the Gitlab pipeline builds a new version, it runs tests, builds images, labels them
with the name of the current branch, and publishes them to the project’s container registry.
......
......@@ -13,64 +13,59 @@ participant "\nChannelController\n" as cc
participant "\nSolver/Optimizer\n" as opt
participant "\nK8s\n" as k8s
== Waiting ==
hnote over ag #LightGreen: Waiting
?o-> ac : create Application
activate ac
note right of ac #LightBlue
Application in state Waiting
end note
hnote over ac #LightGreen: Waiting
ac -> ag : receive Application event
deactivate ac
activate ag
note right of ag #LightYellow
enough Applications Waiting?
end note
note over ag : enough Applications Waiting?
ag -> ac : update owner reference
deactivate ag
hnote over ag #LightGreen: Optimizing
== Optimizing ==
rnote over ag : Collect QoSModel \n(infrastructure and application models)
loop while returned AssignmentPlan contains retryable error
ag -> pc : create unresolved AssignmentPlan
note right of ag #LightYellow
Collect QoSModel: infrastructure and application models
end note
ag -> pc : create unresolved AssignmentPlan
pc -> opt : find placement (gRPC)
activate opt
rnote over opt : Calculate placement
return placement result (gRPC)
activate pc
pc -> opt : calculates assignments and paths
deactivate pc
rnote over pc : Record Solver/Optimizer reply in AssignmentPlan
pc -> ag : AssignmentPlan with Solver/Optimizer reply exists
activate opt
opt -> pc : record reply in an assignment plan
deactivate opt
note over ag : is the reply from Solver/Optimizer valid?
end
activate pc
pc -> ag :looks for AssignmentPlan with solver reply
deactivate pc
hnote over ag #LightGreen: Scheduling
activate ag
note right of ag #LightYellow
Will repeat if the AssignmentPlan contains a retryable error.
Will Set Application Group to status "Scheduling" on valid reply from Solver / Optimizer
end note
ag -> cc: Initialize Channels based on channels in AssignmentPlan
activate cc
ag -> ac : Set Application to status "Scheduling"
ac -> k8s : Create Pods
ac -> ac: Set Application to status "Pending"
ag -> ag : Set ApplicationGroup to status "Waiting"
deactivate ag
ag -> cc: Create Channels based on channels in AssignmentPlan
activate cc
note right of cc #LightYellow
Will trigger a cascade of events
Further specified in Sequence_Orchestrator-ChannelController.plantuml
end note
cc -> ac : Pods are running
deactivate cc
activate ac
ac -> ac: set Application status to Running
ac -> ac : create service and endpoints
ag -> ac : Set Applications to status "Scheduling"
hnote over ac #LightGreen: Scheduling
deactivate ag
ac -> k8s : Create Pods
hnote over ac #LightGreen: Pending
deactivate ag
k8s -> ac : Pods are running
hnote over ac #LightGreen: Running
ac -> k8s : create services and endpoints
ac -> ag : All Applications in status "Running"
hnote over ag #LightGreen: Waiting
deactivate cc
@enduml
......@@ -20,7 +20,7 @@ end note
note right of Scheduling
for applications that have a plan entry:
set application state to "Scheduling"
send stream requests
send channel requests
end note
note left of Optimizing
......
......@@ -5,41 +5,29 @@ SPDX-License-Identifier: CC-BY-SA-4.0
# Getting Started
Steps for getting a demo to run. This is for deploying the QoS Scheduler and the Workload Placement Solver to a K8s cluster.
Steps for getting a demo to run. This is for the TSN version of the QoS Scheduler and the corresponding environment (K8s cluster with TSN network and Harbor registry).
## Preconditions
### You need a Kubernetes cluster
- Kubernetes server version >= 1.20, preferably >= 1.23 (but currently <= 1.26)
- you must be able to obtain docker images from whichever registry you're using (Docker hub)
- you must be able to obtain docker images from whichever registry you're using
- you need to install a few things into the kube-system namespace and create one daemonset with network privileges, so you need the right access to your cluster
- some pods get installed with root privileges (e.g. node daemonset, cni plugin), your cluster access needs to make that possible.
- some pods get installed with root privileges (e.g. node daemonset), your cluster access needs to make that possible.
It does not matter which CNI plugin you start the cluster with.
#### Container registry
Docker images have to come from a container registry. You can have a local registry, or a harbor/artifactory/etc
instance running in your network, or you can get images from the internet.
instance running in your network, or you can get images from the internet. For the siemens-built images, you either
need access so you can fetch them from cr.siemens.com or you need to transfer them to a registry that you can access.
Your Kubernetes nodes must be able to reach all the registries that they need to fetch images from.
## Setup steps
### Prepare the nodes in your cluster
In order to address specific nodes for workloads using the SWM CRs, you have to use specific labels on the nodes. When using the default environment (kind cluster), the labeling of the nodes is done in the start_cluster.sh script.
If you are using another k8s cluster, you have to label the nodes as follows
```bash
# label the nodes with their Kubernetes node names for easier use with
# WorkloadPlacementSolver compatible labels
export NODE_LABEL_PREFIX="siemens.com.qosscheduler"
export NODES=$(kubectl get no -o jsonpath="{.items[*].metadata.name}")
for n in $NODES; do
kubectl label --overwrite nodes $n $NODE_LABEL_PREFIX.$n=
done
```
### How to run the Helm chart
All the files for the Helm chart are in the `helm/qos-scheduler` directory. You need to install a Helm client (v3 is good) and run Helm from the `helm/qos-scheduler` directory.
......@@ -50,26 +38,25 @@ Here are the parameters:
- `qosNamespace`: the namespace where the controllers will run.
- `image.repositoryPrefix`: where we should be getting the images for qos-scheduler, controllers etc from.
- `image.tag`: the docker image tag to use for all images except for the `solver`.
- `image.tag`: the docker image tag to use for all images except for the `optimizer`.
- `image.pullSecrets`: the chart will create image pull secrets for you using `image.credentials`. This is so the Kubernetes pods can get images from your registry.
- `image.pullPolicy`: the default Kubernetes image pull policy. `Always` is good when you're developing, otherwise `IfNotPresent` is more efficient. You can override this for each subchart.
- `image.credentials`: this is the login to the docker registry. For `password`, I recommend getting a token with registry read access. Do not put your actual password in here. These credentials will be stored in the Kubernetes cluster using secrets; they will not be encrypted.
Other things you may want to check or modify in the Helm chart:
- you may need to edit `charts/optimizer/values.yaml` and specify an image tag that works for you.,
- you probably need to edit `charts/optimizer/values.yaml` and specify an image that works for you. The system will use a grpc client, so you should run a service implementing the service proto.
- take a look at `charts/network/values.yaml`. This determines the network controllers that the Helm chart will start for you. You can add your own network sections to the values file here or you can start your own network controllers outside the chart.
- `templates/_helpers.tpl` contains the `serviceClassMap`. This is how the system decides which network to use for your channels' service classes. Each channel service class needs to have a key in the map, and the values need to be networks for which a channel operator exists.
```bash
make chart
helm install qostest --namespace=controllers-system --create-namespace tmp/helm
helm install qostest --namespace=controllers-system --create-namespace --set global.image.credentials.username=your.email@siemens.com --set global.image.credentials.password=$YOURTOKEN --set global.image.credentials.email=your.email@siemens.com --set global.image.credentials.registry=cr.siemens.com .
```
If you make changes to the helm chart and just want to apply those:
```bash
helm upgrade qostest --namespace=controllers-system tmp/helm
helm upgrade qostest --namespace=controllers-system --set global.image.credentials.username=your.email@siemens.com --set global.image.credentials.password=$YOURTOKEN --set global.image.credentials.email=your.email@siemens.com --set global.image.credentials.registry=cr.siemens.com .
```
To uninstall the release:
......@@ -202,7 +189,7 @@ Example:
25s Normal phase change applicationgroup/applicationgroup-demo changed phase from Optimizing to Failed
```
This tells you that the application group entered a failed state because it tried to
This tells me that the application group entered a failed state because it tried to
call the optimizer and the optimizer returned a non-retryable error.
Other reasons for failure include applications failing because their pods failed or because
......
......@@ -15,7 +15,7 @@ described in more detail below.
Some more complex steps have subdirectories in `container`, others
are implemented completely in the `.gitlab-ci.yml` file.
1. dev-container
1. ci-images
2. lint
3. test
4. build
......@@ -34,13 +34,13 @@ the `latest` image is broken, you may want to pin goenv_version to a different t
`K8SVERSION` is the Kubernetes version your code is tied to. Switching
this will not create code for the given version automatically; you need to run the `hack/maybe-update-deps.sh` script first, then check in the changes it makes.
## dev-container
## ci-images
This step builds the `goenv` container, which has a go compiler,
all the packages required by the code, as well as some tools for running tests.
This step is not hermetic, so it runs in a kaniko container. It
is also fairly complex, so it has its own subdirectory (`container/goenv`).
is also fairly complex, so it has its own subdirectory (`build/ci/goenv`).
The `pipeline-jobs.yml` file provides the steps before running kaniko. Mainly, this part copies `go.mod` files from the packages we
build so they can get processed in the kaniko/Docker build.
......@@ -70,7 +70,7 @@ big, and this step makes it even bigger. Of course
the image produced here is never deployed to production. The deploy
step creates a much smaller image.
Similar to the dev-container step, there is a `force-build` file here which is really just for forcing a rebuild.
Similar to the ci-images step, there is a `force-build` file here which is really just for forcing a rebuild.
The most common reason for needing to force a rebuild is using the wrong
value of `OPT_TAG`. This is the tag of the optimizer container you want to build on. This needs to be an optimizer container built on top of ubuntu:focal, not ubuntu:18.04. Currently this means you need the `build-experiments` tag. Once that has been merged, you can use the `develop` tag (or maybe, one day, the `latest` tag).
......@@ -89,8 +89,9 @@ It runs in the `goenv` container and is hermetic.
## build
This builds all go code. It runs in the `goenv` container and is hermetic.
It produces the `qosScheduler` (custom scheduler), `manager` (controller-manager service),
and `networkoperator` binaries and exports them to CI as artifacts.
It produces the `qosScheduler` (custom scheduler), `controller`
(controller-manager service), and binaries `network-k8s`, `network-l2sm`, `network-tsn`
and `network-topology` and exports them to CI as artifacts.
## collect-configs
......@@ -115,7 +116,9 @@ Here is the list of docker images you should have at the end:
1. goenv (environment for building and testing go code)
2. controller (the controller-manager binary)
3. custom-scheduler (the custom scheduler binary)
4. networkoperator (maintains network information)
5. configs (the package with all the yaml files and scripts)
4. network-k8s (maintains Kubernetes network information)
5. network-l2sm (maintains L2S-M network information)
6. network-topology (maintains network information)
7. configs (the package with all the yaml files and scripts)
Not all pipeline runs will rebuild all images. There are gitlab ci/cd settings in the yaml files that specify which file changes trigger which step.
......@@ -5,7 +5,8 @@ SPDX-License-Identifier: CC-BY-SA-4.0
# Quick Start
Steps for getting a demo to run on a local Linux Helm environment and a local KiND cluster.
Steps for getting a demo to run on a local Linux Helm environment and a local
KiND cluster.
## Preconditions
......@@ -42,13 +43,18 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
git clone ${REPO_URL}qos-scheduler.git
```
- **Attention:** If this is done in Windows (using Visual Studio Code - VSC), depending on the settings of VSC, it may happen that text files use a CRLF (Carriage Return Line Feed) as line separation (the "Windows way"). Scripts will not execute with this or throw errors. The solution to this problem:
**Attention:** If this is done in Windows (using Visual Studio Code - VSC),
depending on the settings of VSC, it may happen that text files use a CRLF
(Carriage Return Line Feed) as line separation (the "Windows way"). Scripts
will not execute with this or throw errors. The solution to this problem:
- Make sure the standard setting of VSC is to use LF instead of CRLF, or
- For each shell script, open it in VSC and adjust the End of Line Sequence to "LF" (this can be seen and changed in the blue bar on the lower right corner of the VSC window, when the file is selected)
- For each shell script, open it in VSC and adjust the End of Line Sequence
to "LF" (this can be seen and changed in the blue bar on the lower right
corner of the VSC window, when the file is selected)
# Install K8s cluster using KinD
- In local shell (Linux):
- In local shell:
```bash
cd <qos-scheduler-directory>
......@@ -57,9 +63,13 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
./start_cluster.sh
```
- This will install a KinD cluster with one master and two worker nodes
- The K8s config to access the cluster will be appended to ~/.kube/config and in case there are multiple clusters in the config file, the context (of kubectl) will be switched to the new KinD cluster
- Check whether cluster is working and whether your K8s config is pointing to the right cluster
- This will install a KinD cluster with one master and two worker nodes. The
K8s config to access the cluster will be appended to ~/.kube/config and in
case there are multiple clusters in the config file, the context (of
kubectl) will be switched to the new KinD cluster.
- Check whether cluster is working and whether your K8s config is pointing to
the right cluster
```bash
kubectl get nodes
......@@ -92,6 +102,7 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
```
This should show you links in the network-k8s-namespace namespace.
- Show network paths (Custom Resources)
```bash
......@@ -105,11 +116,13 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
- In local shell:
```bash
kubectl apply -f config/demo/sample-topology.yaml
kubectl apply -f config/demo/sample-topology-vlan.yaml
```
This will create a sample network topology. If you are running a topology operator in the
network-demo-namespace namespace (true if you are using the latest Helm chart without modifications), you should see the network links and paths in this namespace soon.
This will create a sample network topology. If you are running a topology
operator in the network-demo-namespace namespace (true if you are using the
latest Helm chart without modifications), you should see the network links
and paths in this namespace soon.
```bash
kubectl get networklinks -A
......@@ -126,24 +139,36 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
kubectl apply -f config/demo/app-besteffort.yaml
```
- This will create an *ApplicationGroup* with minimum 1 Applications and the Application app-besteffort, which consists of
- This will create an *ApplicationGroup* that requires a minimum of one
*Application*, and the Application app-besteffort, which consists of
- Two *Workloads* w1 and w2
- each Workload has a container wbitt/network-multitool (which has a couple of networking tools to test and demonstrate communication)
- each Workload has a container wbitt/network-multitool (which has a
couple of networking tools to test and demonstrate communication)
- *Channel* "ernie" from w1 to w2 (5Mbit/s, 150µs)
- *Channel* with a generated name (app-besteffort-w2-to-app-besteffort-w1) from w2 to w1 (2Mbit/s, 200µs)
- *Channel* with a generated name (app-besteffort-w2-to-app-besteffort-w1)
from w2 to w1 (2Mbit/s, 200µs)
*Channel* "ernie" requests the *BESTEFFORT* network service class, which the default Helm chart
maps to the *k8s* network (which gets created automatically).
*Channel* "ernie" requests the *BESTEFFORT* network service class, which the
default Helm chart maps to the *k8s* network (which gets created
automatically).
The other channel requests the *BESTEFFORT* network service class, which the default Helm chart
implements using the *k8s* network.
The other channel requests the *BESTEFFORT* network service class, which the
default Helm chart implements using the *k8s* network.
## Show optimizer call and output
- First thing that happens is that the QoS-Scheduler Controller, once the *ApplicationGroup* is complete, collects all inputs related to *Workloads* and *Channels*, and calls the WorkloadPlacementSolver (Pod name: solver) with this information, together with information related to the infrastructure (compute and network)
- The WorkloadPlacementSolver then calculates and returns a placement for all Pods of the *ApplicationGroup*
- Look into logs of Solver Pod (e.g. via Lens)
- The output of the WorkloadPlacementSolver (assignment of Pods to Nodes) is written to a custom resource *AssignmentPlan* (Lens: Custom Resources/qos-scheduler.siemens.com/AssignmentPlan)
- First thing that happens is that the QoS-Scheduler Controller, once the
*ApplicationGroup* is complete, collects all inputs related to *Workloads*
and *Channels*, and calls the WorkloadPlacementSolver (Pod name: optimizer)
with this information, together with information related to the
infrastructure (compute and network)
- The WorkloadPlacementSolver then calculates and returns a placement for all
Pods of the *ApplicationGroup*
- Look into logs of Optimizer Pod (e.g. via Lens)
- The output of the WorkloadPlacementSolver (assignment of Pods to Nodes) is
written to a custom resource *AssignmentPlan* (Lens: Custom
Resources/qos-scheduler.siemens.com/AssignmentPlan)
## Show Channels
......@@ -154,15 +179,23 @@ implements using the *k8s* network.
```
- or use Lens: Custom Resources/qos-scheduler.siemens.com/Channels
- One custom resource Channel is created per *Channel* that connects *Workloads* placed on different nodes, and it corresponds to the channel being set up
- Services (svc): Each *Channel* also gets a K8s Service with the name "svc-\<channel-name>-\<namespace>". This can be used to address the Pods behind this channel using a DNS name.
- One custom resource Channel is created per *Channel* that connects
*Workloads* placed on different nodes, and it corresponds to the channel
being set up
- Services (svc): Each *Channel* also gets a K8s Service with the name
"svc-\<channel-name>-\<namespace>". This can be used to address the Pods
behind this channel using a DNS name.
```bash
kubectl get svc
```
If there was no name specified in the deployment file, a name is generated: "svc-\<src-applicationname>-\<src-workloadname>\-\<tgt-applicationname>-\<tgt-workloadname>-\<namespace>
- Endpoint (ep): IP address behind a *Channel*. This is what the corresponding Service resolves to.
If there was no name specified in the deployment file, a name is generated:
“svc-\<src-applicationname>-\<src-workloadname>\-\<tgt-applicationname>-\<tgt-workloadname>-\<namespace>”
- Endpoint (ep): IP address behind a *Channel*. This is what the corresponding
Service resolves to.
```bash
kubectl get ep
......@@ -176,7 +209,8 @@ implements using the *k8s* network.
kubectl delete -f config/demo/applicationgroup.yaml
```
- This will delete all *Applications* in the *ApplicationGroup* and all dependent resources
- This will delete all *Applications* in the *ApplicationGroup* and all
dependent resources
# Uninstall QoS Scheduler and Optimizer
......@@ -188,6 +222,7 @@ implements using the *k8s* network.
helm list --namespace=controllers-system
helm uninstall --namespace=controllers-system qostest
```
- Delete the QoS-Scheduler CRDs (not uninstalled by the helm chart)
```bash
......
......@@ -9,10 +9,14 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
## Preconditions
The following tools need to be installed:
- Docker
- kubectl
- Helm
- KinD
- make
- rsync
- yq
## Ensure access to container registry
......@@ -24,18 +28,18 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
```bash
DOCKERUSER=<your_docker_username>
ACCESSTOKEN=<your_access_token>
REGISTRYHOST=colab-repo.intracom-telecom.com
REGISTRYNAME=${REGISTRYHOST}:5050
REGISTRYHOST=cr.siemens.com
REGISTRYNAME=${REGISTRYHOST}
REGISTRYURL=https://${REGISTRYNAME}
REPOHOST=colab-repo.intracom-telecom.com
REPOGROUP=/colab-projects/he-codeco/swm/
REPOHOST=code.siemens.com
REPOGROUP=/itp_cloud_research/qos-scheduler
docker login ${REGISTRYURL} -u ${DOCKERUSER} -p ${ACCESSTOKEN}
```
- Test registry access (optional)
```bash
docker pull ${REGISTRYNAME}${REPOGROUP}qos-scheduler/custom-scheduler:main
docker pull ${REGISTRYNAME}${REPOGROUP}/scheduler:latest
```
## Copy required scripts to local environment
......@@ -87,7 +91,7 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
```bash
make chart
helm install qostest --namespace=controllers-system --create-namespace --set global.image.credentials.username=${DOCKERUSER} --set global.image.credentials.password=${ACCESSTOKEN} --set global.image.credentials.email=${DOCKERUSER} --set global.image.credentials.registry=${REGISTRYURL} tmp/helm
helm install qosscheduler --namespace=controllers-system --create-namespace --set global.image.credentials.username=${DOCKERUSER} --set global.image.credentials.password=${ACCESSTOKEN} --set global.image.credentials.email=${DOCKERUSER} --set global.image.credentials.registry=${REGISTRYURL} tmp/helm
```
- Show network topology/ network links (Custom Resources)
......@@ -110,7 +114,7 @@ Steps for getting a demo to run on a local Linux Helm environment and a local Ki
- In local shell:
```bash
kubectl apply -f config/demo/sample-topology.yaml
kubectl apply -f config/demo/sample-topology-vlan.yaml
```
This will create a sample network topology. If you are running a topology operator in the
......@@ -191,9 +195,11 @@ implements using the *k8s* network.
```bash
helm list --namespace=controllers-system
helm uninstall --namespace=controllers-system qostest
helm uninstall --namespace=controllers-system qosscheduler
```
- Delete the QoS-Scheduler CRDs (not uninstalled by the helm chart)
- Delete the QoS-Scheduler CRDs (not uninstalled by the helm chart).
_Attention:_ This deletes all CRDs, including ones that were not installed by our helm chart
```bash
kubectl delete crds --all
......
This diff is collapsed.
......@@ -5,17 +5,32 @@ SPDX-License-Identifier: CC-BY-SA-4.0
# SWM QoS Scheduler
The SWM QoS scheduler is an extension of the Kubernetes scheduler that implements an extended model for describing enhanced application requirements and infrastructure capabilities, according to the Seamless Computing concept.
The SWM QoS Scheduler is an extension of the Kubernetes scheduler that
implements an extended model for describing enhanced application requirements
and infrastructure capabilities, according to the Seamless Computing concept.
“QoS” is short for “quality of service”.
The SWM QoS scheduler considers the extended application (QoS) requirements when placing application components (further called workloads) to compute nodes in the K8s cluster. The main enhancements beyond the existing K8s scheduler are:
The SWM QoS Scheduler considers the extended application (QoS) requirements
when placing application components (further called workloads) to compute
nodes in the Kubernetes cluster. The main enhancements beyond the existing
Kubernetes scheduler are:
- schedule multiple workloads (K8s Pods) at once, considering dependencies between these workloads
- network aware scheduling: make K8s aware of the network topology, connectivity, and resource availability between worker nodes, and consider this in the scheduling decision (according to the communication requirements of the application)
- schedule multiple workloads (Kubernetes pods) at once, considering
dependencies between these workloads
- network-aware scheduling: make Kubernetes aware of the network topology,
connectivity, and resource availability between worker nodes, and consider
this in the scheduling decision (according to the communication requirements
of the applications)
## Getting Started
For installing the SWM QoS Scheduler (including all CRDs and required containers) and to deploy a small demo application, see [quick start](Documentation/quick-start-oss.md).
For installing the SWM QoS Scheduler (including all CRDs and required
containers) and to deploy a small demo application, see [quick start][1].
[1]: Documentation/quick-start-oss.md
## License
All code files are licensed under Apache license version 2.0.
All documentation is licensed under Creative Commons Attribution-ShareAlike 4.0 International.
All code files are licensed under Apache license version 2.0. All
documentation is licensed under Creative Commons Attribution-ShareAlike 4.0
International.