Solver ignores nodeSelector and resource requirements

Our use case requires the pods to be scheduled only on worker nodes that have a GPU (nvidia.com/gpu) and the arm64/v8 architecture. Therefore, we included nodeSelector and resources.limits in our CodecoApp CR.

However, we noticed that the solver will sometimes still place all 3 pods on the master node initially, where k8s cannot schedule them because of the nodeSelector and resources.limits fields. As a result, the CODECO components effectively deadlock: PDLC-RL only starts after the first successful placement, and the solver only attempts the initial placement once.

Ideally, the solver should, for each pod, filter out all nodes on which the pod cannot be placed due to its nodeSelector and resources.limits, and only consider nodes that fulfil these requirements.
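
To illustrate, below is a minimal, self-contained Go sketch of the feasibility filter we have in mind. The Node and Workload types here are our own simplification, not the solver's actual model, and a real implementation would also need to subtract resources already allocated to other pods on the node:

package main

import "fmt"

// Hypothetical, simplified views of the solver's inputs (names are ours).
type Node struct {
	ID          string
	Labels      map[string]string
	Allocatable map[string]int64 // e.g. "cpu" (millicores), "memory" (bytes), "nvidia.com/gpu"
}

type Workload struct {
	ID           string
	NodeSelector map[string]string
	Limits       map[string]int64
}

// feasible reports whether a workload may be placed on a node:
// every nodeSelector label must match exactly, and every requested
// resource must fit into the node's allocatable capacity.
func feasible(w Workload, n Node) bool {
	for k, v := range w.NodeSelector {
		if n.Labels[k] != v {
			return false
		}
	}
	for res, req := range w.Limits {
		if n.Allocatable[res] < req {
			return false
		}
	}
	return true
}

// candidates filters the node list down to the nodes a workload can
// actually run on; the solver would then only optimize over this subset.
func candidates(w Workload, nodes []Node) []Node {
	var out []Node
	for _, n := range nodes {
		if feasible(w, n) {
			out = append(out, n)
		}
	}
	return out
}

func main() {
	// Capacities taken from the solver log below; labels are what we
	// would expect the nodes to carry in our cluster.
	nodes := []Node{
		{ID: "k8s-master",
			Labels:      map[string]string{"kubernetes.io/arch": "amd64"},
			Allocatable: map[string]int64{"cpu": 4000, "memory": 16766124032}},
		{ID: "k8s-edge-1",
			Labels:      map[string]string{"kubernetes.io/arch": "arm64", "kubernetes.io/hostname": "k8s-edge-1"},
			Allocatable: map[string]int64{"cpu": 8000, "memory": 32513966080, "nvidia.com/gpu": 1}},
	}
	w := Workload{
		ID:           "acm-swm-app-processing-pod-1-source-1",
		NodeSelector: map[string]string{"kubernetes.io/arch": "arm64"},
		Limits:       map[string]int64{"cpu": 1000, "memory": 1 << 30, "nvidia.com/gpu": 1},
	}
	for _, n := range candidates(w, nodes) {
		fmt.Println("feasible node:", n.ID) // only k8s-edge-1 survives the filter
	}
}

With this filter in place, the master would never be a candidate for the processing pods, and the initial placement could not end up unschedulable.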

CodecoApp CR
apiVersion: codeco.he-codeco.eu/v1alpha1
kind: CodecoApp
metadata:
  generation: 1
  name: he-codeco-uc-p1-smartcity-codecoapp
  namespace: he-codeco-acm
  resourceVersion: "1456"
  uid: 5c948d7e-43d6-425b-b0b2-76402b606e07
spec:
  appName: acm-swm-app
  appEnergyLimit: "20"
  appFailureTolerance: ""
  complianceClass: High
  qosClass: Gold
  securityClass: Good
  codecoapp-msspec:
###################### Data Migration 1 #################################
  - serviceName: data-migration-1
    nwbandwidth: "1.2"
    nwlatency: "3"
    metadata:
      labels:
        app: data-migration-1
        prometheus: "true"
    podspec:
      containers:
      - name: roscore
        image: hecodeco/uc-p1-smartcity-roscore:latest
        ports:
        - containerPort: 11311
        env:
        - name: ROS_MASTER_URI
          value: "http://0.0.0.0:11311"
        - name: ROSCONSOLE_STDOUT_LINE_BUFFERED
          value: "1"
      - name: rslidar-sdk
        image: hecodeco/uc-p1-smartcity-rslidar-sdk:latest
        ports:
        - name: rslidar
          containerPort: 6699
          protocol: UDP
          hostPort: 6699
        env:
        - name: ROS_MASTER_URI
          value: "http://localhost:11311"
      - name: data-migration
        image: hecodeco/uc-p1-smartcity-data-migration:latest
        ports:
        - containerPort: 9002
        - containerPort: 9091
        env:
        - name: WEBSOCKET_PORT
          value: "9002"
        - name: ROS_MASTER_URI
          value: "http://localhost:11311"
        - name: PROMETHEUS_URI
          value: "0.0.0.0:9091"
        - name: TAGS
          value: "codecotest"
        - name: TOPIC_NAME
          value: "/rslidar_points"
        volumeMounts:
        - name: machine-id
          mountPath: /etc/machine-id
      ############### OPTIONAL ##################
      - name: publisher
        image: hecodeco/uc-p1-smartcity-publisher:latest
        env:
        - name: ROS_MASTER_URI
          value: "http://localhost:11311"
        - name: ROSCONSOLE_STDOUT_LINE_BUFFERED
          value: "1"
        - name: PUBLISHER_DIRECTORY
          value: "/data/"
        - name: TOPIC_NAME
          value: "/rslidar_points"
        - name: FRAME_ID
          value: "rslidar"
        - name: PUBLISHER_RATE
          value: "0.1"
        - name: STOP_AFTER_PUBLISHING
          value: "0"
        - name: PUBLISH_ORIGINAL_TIME
          value: "0"
        volumeMounts:
        - name: data
          mountPath: /data/
      volumes:
      - name: data
        hostPath:
          path: /media/networks/XavierSSD512/codeco-pointpillars/CUDA-PointPillars/data/a9_dataset_r02_s01/point_clouds/s110_lidar_ouster_south/
      ############### OPTIONAL ##################
      - name: machine-id
        hostPath:
          path: /etc/machine-id
      nodeSelector:  
        kubernetes.io/arch: arm64
        kubernetes.io/hostname: k8s-edge-1
    serviceChannels:
    - channelName: data-migration-1-channel-processing-pod-1-source-1
      advancedChannelSettings:
        minBandwidth: "5"
        frameSize: "100"
        maxDelay: "1"
        sendInterval: "10"
      otherService:
        appName: acm-swm-app
        port: 9002
        serviceName: processing-pod-1-source-1
    - channelName: data-migration-1-channel-processing-pod-2-source-1
      advancedChannelSettings:
        minBandwidth: "5"
        frameSize: "100"
        maxDelay: "1"
        sendInterval: "10"
      otherService:
        appName: acm-swm-app
        port: 9002
        serviceName: processing-pod-2-source-1
###################### Processing Pod 1 (source 1) ######################
  - serviceName: processing-pod-1-source-1
    nwbandwidth: "1.2"
    nwlatency: "3"
    metadata:
      labels:
        app: pointpillars
        prometheus: "true"
    podspec:
      containers:
      - name: pointpillars
        image: hecodeco/uc-p1-smartcity-pointpillars:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        - containerPort: 9091
        resources:
          limits:
            cpu: "1"
            memory: 1Gi
            nvidia.com/gpu: 1
        env:
        - name: CLIENT_PORT
          value: "8080"
        - name: WEBSOCKET_URI
          value: "ws://data-migration-1.he-codeco-acm.svc.cluster.local:9002"
        - name: PROMETHEUS_URI
          value: "0.0.0.0:9091"
        - name: MAX_QUEUE_SIZE
          value: "100"
        - name: MIN_QUEUE_SIZE
          value: "10"
        - name: TAGS
          value: "codecotest"
        - name: LOG_LEVEL
          value: "info"
        - name: COLLECTOR_URI
          value: "http://collector.he-codeco-acm.svc.cluster.local/submit"
        volumeMounts:
        - name: machine-id
          mountPath: /etc/machine-id
      volumes:
      - name: machine-id
        hostPath:
          path: /etc/machine-id
      nodeSelector:  
        kubernetes.io/arch: arm64
    serviceChannels:
    - channelName: processing-pod-1-source-1-channel
      advancedChannelSettings:
        minBandwidth: "5"
        frameSize: "100"
        maxDelay: "1"
        sendInterval: "10"
      otherService:
        appName: acm-swm-app
        port: 8080
        serviceName: data-migration-1
###################### Processing Pod 2 (source 1) ######################
  - serviceName: processing-pod-2-source-1
    nwbandwidth: "1.2"
    nwlatency: "3"
    metadata:
      labels:
        app: pointpillars
        prometheus: "true"
    podspec:
      containers:
      - name: pointpillars
        image: hecodeco/uc-p1-smartcity-pointpillars:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        - containerPort: 9091
        resources:
          limits:
            cpu: "1"
            memory: 1Gi
            nvidia.com/gpu: 1
        env:
        - name: CLIENT_PORT
          value: "8080"
        - name: WEBSOCKET_URI
          value: "ws://data-migration-1.he-codeco-acm.svc.cluster.local:9002"
        - name: PROMETHEUS_URI
          value: "0.0.0.0:9091"
        - name: MAX_QUEUE_SIZE
          value: "100"
        - name: MIN_QUEUE_SIZE
          value: "10"
        - name: TAGS
          value: "codecotest"
        - name: LOG_LEVEL
          value: "info"
        - name: COLLECTOR_URI
          value: "http://collector.he-codeco-acm.svc.cluster.local/submit"
        volumeMounts:
        - name: machine-id
          mountPath: /etc/machine-id
      volumes:
      - name: machine-id
        hostPath:
          path: /etc/machine-id
      nodeSelector:  
        kubernetes.io/arch: arm64
    serviceChannels:
    - channelName: processing-pod-2-source-1-channel
      advancedChannelSettings:
        minBandwidth: "5"
        frameSize: "100"
        maxDelay: "1"
        sendInterval: "10"
      otherService:
        appName: acm-swm-app
        port: 8080
        serviceName: data-migration-1
Solver pod logs
solved: options: verbose:true, host:0.0.0.0, port:5000, max-requests:—
model:{infrastructure:{nodes:{id:"k8s-edge-1" CPU:{capacity:8000} memory:{capacity:32513966080}} nodes:{id:"k8s-master" CPU:{capacity:4000} memory:{capacity:16766124032} labels:"siemens.com.qosscheduler.master"} networks:{id:"k8s" links:{id:"k8s-edge-1-k8s-edge-1" source:"k8s-edge-1" target:"k8s-edge-1" medium:"k8s-edge-1-k8s-edge-1" latency:100000} links:{id:"k8s-edge-1-k8s-master" source:"k8s-edge-1" target:"k8s-master" medium:"k8s-edge-1-k8s-master" latency:100000} links:{id:"k8s-master-k8s-edge-1" source:"k8s-master" target:"k8s-edge-1" medium:"k8s-master-k8s-edge-1" latency:100000} links:{id:"k8s-master-k8s-master" source:"k8s-master" target:"k8s-master" medium:"k8s-master-k8s-master" latency:100000} paths:{id:"k8s-edge-1-k8s-edge-1-2297793460" links:"k8s-edge-1-k8s-edge-1"} paths:{id:"k8s-edge-1-k8s-master-2072150989" links:"k8s-edge-1-k8s-master"} paths:{id:"k8s-master-k8s-edge-1-2793655007" links:"k8s-master-k8s-edge-1"} paths:{id:"k8s-master-k8s-master-3995146986" links:"k8s-master-k8s-master"} media:{id:"k8s-edge-1-k8s-edge-1" bandwidth:{capacity:1000000000}} media:{id:"k8s-edge-1-k8s-master" bandwidth:{capacity:1000000000}} media:{id:"k8s-master-k8s-edge-1" bandwidth:{capacity:1000000000}} media:{id:"k8s-master-k8s-master" bandwidth:{capacity:1000000000}}}} app_group:{applications:{id:"acm-swm-app" workloads:{id:"acm-swm-app-data-migration-1"} workloads:{id:"acm-swm-app-processing-pod-1-source-1"} workloads:{id:"acm-swm-app-processing-pod-2-source-1"}} channels:{id:"data-migration-1-channel-processing-pod-1-source-1" source_application:"acm-swm-app" source_workload:"acm-swm-app-data-migration-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-processing-pod-1-source-1" latency:1000000000 bandwidth:5 frame_size:100 12:9002} channels:{id:"data-migration-1-channel-processing-pod-2-source-1" source_application:"acm-swm-app" source_workload:"acm-swm-app-data-migration-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-processing-pod-2-source-1" latency:1000000000 bandwidth:5 frame_size:100 12:9002} channels:{id:"processing-pod-1-source-1-channel" source_application:"acm-swm-app" source_workload:"acm-swm-app-processing-pod-1-source-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-data-migration-1" latency:1000000000 bandwidth:5 frame_size:100 12:8080} channels:{id:"processing-pod-2-source-1-channel" source_application:"acm-swm-app" source_workload:"acm-swm-app-processing-pod-2-source-1" target_application:"acm-swm-app" target_workload:"acm-swm-app-data-migration-1" latency:1000000000 bandwidth:5 frame_size:100 12:8080}} costs:{} recommendations:{workloads:{application:"acm-swm-app" workload:"acm-swm-app-data-migration-1"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-1-source-1"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-2-source-1"}}} options:"{}"
workloads:{application:"acm-swm-app" workload:"acm-swm-app-data-migration-1" node:"k8s-master"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-1-source-1" node:"k8s-master"} workloads:{application:"acm-swm-app" workload:"acm-swm-app-processing-pod-2-source-1" node:"k8s-master"} channels:{channel:"data-migration-1-channel-processing-pod-1-source-1" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"data-migration-1-channel-processing-pod-2-source-1" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"processing-pod-1-source-1-channel" network:"k8s" path:"k8s-master-k8s-master-2297793460"} channels:{channel:"processing-pod-2-source-1-channel" network:"k8s" path:"k8s-master-k8s-master-2297793460"}
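
Judging by the model dump above, the infrastructure handed to the solver only carries CPU and memory capacities per node (plus the siemens.com.qosscheduler.master label); neither the kubernetes.io/arch node labels nor the nvidia.com/gpu allocatable capacity appear in the model, so the solver has nothing to filter on even if it tried. We don't know where the model is assembled, but the missing data is readily available from the API server. A minimal client-go sketch of reading it (the API calls are standard client-go; the program itself is only an illustration, not the solver's code):

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	// In-cluster config; the solver's model builder presumably already
	// has API access, since it reads node CPU/memory capacities.
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, n := range nodes.Items {
		// Labels would let the solver honour nodeSelector; Allocatable
		// would let it honour extended resources like nvidia.com/gpu.
		gpu := n.Status.Allocatable["nvidia.com/gpu"]
		fmt.Printf("%s arch=%s gpu=%s\n",
			n.Name, n.Labels["kubernetes.io/arch"], gpu.String())
	}
}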