diff --git a/playbookconfig/src/playbooks/host_vars/bootstrap/default.yml b/playbookconfig/src/playbooks/host_vars/bootstrap/default.yml
index 05a30e0ac..32b7a5aa0 100644
--- a/playbookconfig/src/playbooks/host_vars/bootstrap/default.yml
+++ b/playbookconfig/src/playbooks/host_vars/bootstrap/default.yml
@@ -291,3 +291,18 @@ override_files_dir: "{{ lookup('env', 'HOME') }}"
 # When set to true, disk partitions that were previously used for Ceph data are
 # not wiped. Otherwise, all disks are wiped as part of the bootstrap.
 wipe_ceph_osds: false
+
+# K8S_PLUGINS
+# ===========
+#
+# The following parameters are used for enabling Kubernetes device
+# plugins implemented by hardware/software vendors. By default,
+# plugins are disabled.
+#
+# "k8s_plugins" maps each vendor device plugin to be enabled as part
+# of the bootstrap to its node label. When a device plugin is enabled,
+# its pods will be deployed when the plugin label (e.g. intelgpu=enabled)
+# is assigned to the hosts in the cluster as part of system bring-up.
+
+k8s_plugins:
+#  intel-gpu-plugin: intelgpu=enabled
diff --git a/playbookconfig/src/playbooks/roles/bootstrap/bringup-essential-services/tasks/bringup_kubemaster.yml b/playbookconfig/src/playbooks/roles/bootstrap/bringup-essential-services/tasks/bringup_kubemaster.yml
index b65e28521..a975a3db5 100644
--- a/playbookconfig/src/playbooks/roles/bootstrap/bringup-essential-services/tasks/bringup_kubemaster.yml
+++ b/playbookconfig/src/playbooks/roles/bootstrap/bringup-essential-services/tasks/bringup_kubemaster.yml
@@ -239,6 +239,26 @@
 - name: Remove taint from master node
   shell: "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node controller-0 node-role.kubernetes.io/master- || true"
 
+# Deploy the selected vendor device plugins and record the selection.
+# The whole block is skipped while k8s_plugins is empty or null (the default).
+- block:
+  # Each item is expected to be a plugin name (e.g. intel-gpu-plugin) that
+  # selects the task file of the same name in the bootstrap/plugins role.
+  - name: Applying kubernetes plugins
+    include_role:
+      name: bootstrap/plugins
+      tasks_from: "{{ item }}"
+    with_items: "{{ k8s_plugins }}"
+
+  - name: Create kube plugin list file
+    copy:
+      content: "{{ k8s_plugins }}"
+      dest: /etc/platform/enabled_kube_plugins
+      # Quoted so the mode is read as an octal string, not a YAML integer.
+      mode: "0640"
+      group: sys_protected
+  when: k8s_plugins
+
 - name: Add kubelet service override
   copy:
     src: "{{ kubelet_override_template }}"
diff --git a/playbookconfig/src/playbooks/roles/bootstrap/plugins/tasks/intel-gpu-plugin.yml b/playbookconfig/src/playbooks/roles/bootstrap/plugins/tasks/intel-gpu-plugin.yml
new file mode 100644
index 000000000..72d89d502
--- /dev/null
+++ b/playbookconfig/src/playbooks/roles/bootstrap/plugins/tasks/intel-gpu-plugin.yml
@@ -0,0 +1,8 @@
+---
+- name: Create Intel GPU device plugin config file
+  template:
+    src: "intel-gpu-plugin.yaml.j2"
+    dest: /etc/kubernetes/intel-gpu-daemonset.yaml
+
+- name: Activate Intel GPU device plugin
+  command: "kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f /etc/kubernetes/intel-gpu-daemonset.yaml"
diff --git a/playbookconfig/src/playbooks/roles/bootstrap/plugins/templates/intel-gpu-plugin.yaml.j2 b/playbookconfig/src/playbooks/roles/bootstrap/plugins/templates/intel-gpu-plugin.yaml.j2
new file mode 100644
index 000000000..62acadd6d
--- /dev/null
+++ b/playbookconfig/src/playbooks/roles/bootstrap/plugins/templates/intel-gpu-plugin.yaml.j2
@@ -0,0 +1,74 @@
+# Intel GPU device plugin
+# Based on:
+# https://github.com/intel/intel-device-plugins-for-kubernetes/blob/master/deployments/gpu_plugin/gpu_plugin.yaml
+#
+# The following modifications have been made:
+# - A nodeSelector of 'intelgpu' has been added to ensure the gpu device plugin
+#   pods only run on appropriately labelled nodes.
+# - The daemonset is modified to tolerate all NoSchedule taints
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: intel-device-plugin
+  namespace: kube-system
+
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: intel-gpu-plugin
+  namespace: kube-system
+  labels:
+    app: intel-gpu-plugin
+spec:
+  selector:
+    matchLabels:
+      app: intel-gpu-plugin
+  template:
+    metadata:
+      labels:
+        app: intel-gpu-plugin
+    spec:
+      nodeSelector:
+        intelgpu: enabled
+        # When system_mode is not simplex, openstack-compute-node=enabled is also required.
+        {% if system_mode != "simplex" -%}
+        openstack-compute-node: enabled
+        {%- endif %}
+
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: intel-device-plugin
+      imagePullSecrets:
+      - name: docker-registry-secret
+      containers:
+      - name: intel-gpu-plugin
+        env:
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+        image: "{{ docker_registry.url }}/starlingx/intel-gpu-plugin:master-distroless-stable-latest"
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          readOnlyRootFilesystem: true
+        volumeMounts:
+        - name: devfs
+          mountPath: /dev
+        - name: sysfs
+          mountPath: /sys
+        - name: kubeletsockets
+          mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+      - name: devfs
+        hostPath:
+          path: /dev
+      - name: sysfs
+        hostPath:
+          path: /sys
+      - name: kubeletsockets
+        hostPath:
+          path: /var/lib/kubelet/device-plugins