Solver ignores nodeSelector and resource requirements
Our use case requires the pods to be scheduled only on worker nodes that have a GPU (`nvidia.com/gpu`) and the arm64/v8 architecture. Therefore, we included `nodeSelector` and `resources.limits` in our CodecoApp CR.
However, we noticed that the solver will sometimes still place the 3 pods on the master initially, where they cannot be scheduled by Kubernetes due to the `nodeSelector` and `resources.limits` fields. As a result, the CODECO components effectively deadlock, since PDLC-RL only starts after the first successful placement and the solver only attempts the initial placement once.
Ideally, the solver should filter out, for each pod, all nodes on which the pod cannot be placed due to its `nodeSelector` and `resources.limits`, and only consider nodes that fulfil these requirements.
CodecoApp CR
---
# CodecoApp CR for the smart-city use case: one data-migration service feeding
# two GPU-based pointpillars processing services. The processing containers
# request nvidia.com/gpu and all pods carry an arm64 nodeSelector, so they can
# only run on the GPU-equipped arm64 worker (k8s-edge-1), never on the master.
# NOTE(review): indentation below was reconstructed from a flattened paste —
# confirm against the live object (kubectl get codecoapp -o yaml).
apiVersion: codeco.he-codeco.eu/v1alpha1
kind: CodecoApp
metadata:
  # generation/resourceVersion/uid are server-populated fields captured by the
  # dump; they are not part of the applied spec.
  generation: 1
  name: he-codeco-uc-p1-smartcity-codecoapp
  namespace: he-codeco-acm
  resourceVersion: "1456"
  uid: 5c948d7e-43d6-425b-b0b2-76402b606e07
spec:
  appName: acm-swm-app
  appEnergyLimit: "20"
  appFailureTolerance: ""
  complianceClass: High
  qosClass: Gold
  securityClass: Good
  codecoapp-msspec:
    ###################### Data Migration 1 #################################
    - serviceName: data-migration-1
      nwbandwidth: "1.2"
      nwlatency: "3"
      metadata:
        labels:
          app: data-migration-1
          prometheus: "true"
      podspec:
        containers:
          - name: roscore
            image: hecodeco/uc-p1-smartcity-roscore:latest
            ports:
              - containerPort: 11311
            env:
              - name: ROS_MASTER_URI
                value: "http://0.0.0.0:11311"
              - name: ROSCONSOLE_STDOUT_LINE_BUFFERED
                value: "1"
          - name: rslidar-sdk
            image: hecodeco/uc-p1-smartcity-rslidar-sdk:latest
            ports:
              - name: rslidar
                containerPort: 6699
                protocol: UDP
                hostPort: 6699
            env:
              - name: ROS_MASTER_URI
                value: "http://localhost:11311"
          - name: data-migration
            image: hecodeco/uc-p1-smartcity-data-migration:latest
            ports:
              - containerPort: 9002
              - containerPort: 9091
            env:
              - name: WEBSOCKET_PORT
                value: "9002"
              - name: ROS_MASTER_URI
                value: "http://localhost:11311"
              - name: PROMETHEUS_URI
                value: "0.0.0.0:9091"
              - name: TAGS
                value: "codecotest"
              - name: TOPIC_NAME
                value: "/rslidar_points"
            volumeMounts:
              - name: machine-id
                mountPath: /etc/machine-id
          ############### OPTIONAL ##################
          - name: publisher
            image: hecodeco/uc-p1-smartcity-publisher:latest
            env:
              - name: ROS_MASTER_URI
                value: "http://localhost:11311"
              - name: ROSCONSOLE_STDOUT_LINE_BUFFERED
                value: "1"
              - name: PUBLISHER_DIRECTORY
                value: "/data/"
              - name: TOPIC_NAME
                value: "/rslidar_points"
              - name: FRAME_ID
                value: "rslidar"
              - name: PUBLISHER_RATE
                value: "0.1"
              - name: STOP_AFTER_PUBLISHING
                value: "0"
              - name: PUBLISH_ORIGINAL_TIME
                value: "0"
            volumeMounts:
              - name: data
                mountPath: /data/
        volumes:
          # "data" backs the optional publisher container above.
          - name: data
            hostPath:
              path: /media/networks/XavierSSD512/codeco-pointpillars/CUDA-PointPillars/data/a9_dataset_r02_s01/point_clouds/s110_lidar_ouster_south/
          ############### OPTIONAL ##################
          - name: machine-id
            hostPath:
              path: /etc/machine-id
        # Pin to the arm64 edge worker; the master does not match this selector.
        nodeSelector:
          kubernetes.io/arch: arm64
          kubernetes.io/hostname: k8s-edge-1
      serviceChannels:
        - channelName: data-migration-1-channel-processing-pod-1-source-1
          advancedChannelSettings:
            minBandwidth: "5"
            frameSize: "100"
            maxDelay: "1"
            sendInterval: "10"
          otherService:
            appName: acm-swm-app
            port: 9002
            serviceName: processing-pod-1-source-1
        - channelName: data-migration-1-channel-processing-pod-2-source-1
          advancedChannelSettings:
            minBandwidth: "5"
            frameSize: "100"
            maxDelay: "1"
            sendInterval: "10"
          otherService:
            appName: acm-swm-app
            port: 9002
            serviceName: processing-pod-2-source-1
    ###################### Processing Pod 1 (source 1) ######################
    - serviceName: processing-pod-1-source-1
      nwbandwidth: "1.2"
      nwlatency: "3"
      metadata:
        labels:
          app: pointpillars
          prometheus: "true"
      podspec:
        containers:
          - name: pointpillars
            image: hecodeco/uc-p1-smartcity-pointpillars:latest
            imagePullPolicy: Always
            ports:
              - containerPort: 8080
              - containerPort: 9091
            # GPU limit makes this pod unschedulable on nodes without
            # allocatable nvidia.com/gpu (e.g. the master).
            resources:
              limits:
                cpu: "1"
                memory: 1Gi
                nvidia.com/gpu: 1
            env:
              - name: CLIENT_PORT
                value: "8080"
              - name: WEBSOCKET_URI
                value: "ws://data-migration-1.he-codeco-acm.svc.cluster.local:9002"
              - name: PROMETHEUS_URI
                value: "0.0.0.0:9091"
              - name: MAX_QUEUE_SIZE
                value: "100"
              - name: MIN_QUEUE_SIZE
                value: "10"
              - name: TAGS
                value: "codecotest"
              - name: LOG_LEVEL
                value: "info"
              - name: COLLECTOR_URI
                value: "http://collector.he-codeco-acm.svc.cluster.local/submit"
            volumeMounts:
              - name: machine-id
                mountPath: /etc/machine-id
        volumes:
          - name: machine-id
            hostPath:
              path: /etc/machine-id
        nodeSelector:
          kubernetes.io/arch: arm64
      serviceChannels:
        - channelName: processing-pod-1-source-1-channel
          advancedChannelSettings:
            minBandwidth: "5"
            frameSize: "100"
            maxDelay: "1"
            sendInterval: "10"
          otherService:
            appName: acm-swm-app
            port: 8080
            serviceName: data-migration-1
    ###################### Processing Pod 2 (source 1) ######################
    - serviceName: processing-pod-2-source-1
      nwbandwidth: "1.2"
      nwlatency: "3"
      metadata:
        labels:
          app: pointpillars
          prometheus: "true"
      podspec:
        containers:
          - name: pointpillars
            image: hecodeco/uc-p1-smartcity-pointpillars:latest
            imagePullPolicy: Always
            ports:
              - containerPort: 8080
              - containerPort: 9091
            resources:
              limits:
                cpu: "1"
                memory: 1Gi
                nvidia.com/gpu: 1
            env:
              - name: CLIENT_PORT
                value: "8080"
              - name: WEBSOCKET_URI
                value: "ws://data-migration-1.he-codeco-acm.svc.cluster.local:9002"
              - name: PROMETHEUS_URI
                value: "0.0.0.0:9091"
              - name: MAX_QUEUE_SIZE
                value: "100"
              - name: MIN_QUEUE_SIZE
                value: "10"
              - name: TAGS
                value: "codecotest"
              - name: LOG_LEVEL
                value: "info"
              - name: COLLECTOR_URI
                value: "http://collector.he-codeco-acm.svc.cluster.local/submit"
            volumeMounts:
              - name: machine-id
                mountPath: /etc/machine-id
        volumes:
          - name: machine-id
            hostPath:
              path: /etc/machine-id
        nodeSelector:
          kubernetes.io/arch: arm64
      serviceChannels:
        - channelName: processing-pod-2-source-1-channel
          advancedChannelSettings:
            minBandwidth: "5"
            frameSize: "100"
            maxDelay: "1"
            sendInterval: "10"
          otherService:
            appName: acm-swm-app
            port: 8080
            serviceName: data-migration-1
Solver pod logs
solved: options: verbose:true, host:0.0.0.0, port:5000, max-requests:—
model:{infrastructure:{nodes:{id:"k8s-edge-1" CPU:{capacity:8000} memory:{capacity:32513966080}} nodes:{id:"k8s-master" CPU:{capacity:4000} memory:{capacity:16766124032} labels:"siemens.com.qosscheduler.master"} networks:{id:"k8s" links:{id:"k8s-edge-1-k8s-edge-1" source:"k8s-edge-1" target:"k8s-edge-1" medium:"k8s-edge-1-k8s-edge-1" latency:100000} links:{id:"k8s-edge-1-k8s-master" source:"k8s-edge-1" target:"k8s-master" medium:"k8s-edge-1-k8s-master" latency:100000} links:{id:"k8s-master-k8s-edge-1" source:"k8s-master" target:"k8s-edge-1" medium:"k8s-master-k8s-edge-1" latency:100000} links:{id:"k8s-master-k8s-master" source:"k8s-master" target:"k8s-master" medium:"k8s-master-k8s-master" latency:100000} paths:{id:"k8s-edge-1-k8s-edge-1-2297793460" links:"k8s-edge-1-k8s-edge-1"} paths:{id:"k8s-edge-1-k8s-master-2072150989" links:"k8s-edge-1-k8s-master"} paths:{id:"k8s-master-k8s-edge-1-2793655007" links:"k8s-master-k8s-edge-1"} paths:{id:"k8s-master-k8s-master-3995146986" links:"k8s-master-k8s-master"} media:{id:"k8s-edge-1-k8s-edge-1" bandwidth:{capacity:1000000000}} media:{id:"k8s-edge-1-k8s-master" bandwidth:{capacity:1000000000}} media:{id:"k8s-master-k8s-edge-1" bandwidth:{capacity:1000000000}} media:{id:"k8s-master-k8s-master" bandwidth:{capacity:1000000000}}}} app_group:{applications:{id:"acm-swm-app" workloads:{id:"acm-swm-app-data-migration-1"} workloads:{id:"acm-swm-app-processing-pod-1-source-1"} workloads:{id:"acm-swm-app-processing-pod-2-source-1"}} channels:{id:"data-migration-1-channel-processing-pod-1-source-1" source_application:"acm-swm-app" source_workload:"acm-swm-app-data-migration-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-processing-pod-1-source-1" latency:1000000000 bandwidth:5 frame_size:100 12:9002} channels:{id:"data-migration-1-channel-processing-pod-2-source-1" source_application:"acm-swm-app" source_workload:"acm-swm-app-data-migration-1" target_application:"acm-swm-app" 
target_workload:"acm-swm-app-processing-pod-2-source-1" latency:1000000000 bandwidth:5 frame_size:100 12:9002} channels:{id:"processing-pod-1-source-1-channel" source_application:"acm-swm-app" source_workload:"acm-swm-app-processing-pod-1-source-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-data-migration-1" latency:1000000000 bandwidth:5 frame_size:100 12:8080} channels:{id:"processing-pod-2-source-1-channel" source_application:"acm-swm-app" source_workload:"acm-swm-app-processing-pod-2-source-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-data-migration-1" latency:1000000000 bandwidth:5 frame_size:100 12:8080}} costs:{} recommendations:{workloads:{application:"acm-swm-app" workload:"acm-swm-app-data-migration-1"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-1-source-1"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-2-source-1"}}} options:"{}"
workloads:{application:"acm-swm-app" workload:"acm-swm-app-data-migration-1" node:"k8s-master"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-1-source-1" node:"k8s-master"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-2-source-1" node:"k8s-master"} channels:{channel:"data-migration-1-channel-processing-pod-1-source-1" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"data-migration-1-channel-processing-pod-2-source-1" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"processing-pod-1-source-1-channel" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"processing-pod-2-source-1-channel" network:"k8s" path:"k8s-master-k8s-master-2297793460"}