diff --git a/debian_pkg_dirs b/debian_pkg_dirs
index 449fb94de..de44351d9 100644
--- a/debian_pkg_dirs
+++ b/debian_pkg_dirs
@@ -45,6 +45,7 @@ golang-github-dev/golang-github-cilium-ebpf-dev
 golang-github-dev/golang-github-coreos-go-systemd-dev
 golang-github-dev/golang-github-opencontainers-specs-dev
 golang-github-dev/golang-github-vishvananda-netlink
+gpu/gpu-operator
 grub/grub2
 grub/grubby
 kubernetes/armada
diff --git a/gpu/gpu-operator/debian/deb_folder/changelog b/gpu/gpu-operator/debian/deb_folder/changelog
new file mode 100644
index 000000000..0fa199f69
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/changelog
@@ -0,0 +1,5 @@
+gpu-operator (1.8.1) unstable; urgency=medium
+
+  * Initial release.
+
+ -- Andre Kantek  Thu, 27 Jul 2022 14:00:42 +0000
diff --git a/gpu/gpu-operator/debian/deb_folder/control b/gpu/gpu-operator/debian/deb_folder/control
new file mode 100644
index 000000000..5172d093e
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/control
@@ -0,0 +1,14 @@
+Source: gpu-operator
+Section: admin
+Priority: optional
+Maintainer: StarlingX Developers
+Build-Depends: debhelper-compat (= 13), helm
+Standards-Version: 4.5.1
+Homepage: https://www.starlingx.io
+
+Package: gpu-operator
+Architecture: any
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: The NVIDIA GPU Operator uses the operator framework within
+ Kubernetes to automate the management of all NVIDIA software components
+ needed to provision GPU
diff --git a/gpu/gpu-operator/debian/deb_folder/copyright b/gpu/gpu-operator/debian/deb_folder/copyright
new file mode 100644
index 000000000..7db1ad9a0
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/copyright
@@ -0,0 +1,29 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+
+Upstream-Contact: https://github.com/NVIDIA/gpu-operator/
+Source: https://github.com/NVIDIA/gpu-operator/
+Files: *
+Copyright: (C) 2018-2022 https://github.com/NVIDIA/gpu-operator/
+License: Apache-2
+
+Upstream-Name: gpu-operator
+Upstream-Contact: StarlingX Developers
+Source: https://opendev.org/starlingx/integ/src/branch/master/gpu/gpu-operator/
+Files: debian/*
+Copyright: (c) 2022 Wind River Systems, Inc.
+License: Apache-2
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+ http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian-based systems the full text of the Apache version 2.0 license
+ can be found in `/usr/share/common-licenses/Apache-2.0'.
+
diff --git a/gpu/gpu-operator/debian/deb_folder/gpu-operator.install b/gpu/gpu-operator/debian/deb_folder/gpu-operator.install
new file mode 100644
index 000000000..6fb208476
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/gpu-operator.install
@@ -0,0 +1 @@
+opt/extracharts/gpu-operator-v3-1.8.1.tgz
\ No newline at end of file
diff --git a/gpu/gpu-operator/debian/deb_folder/rules b/gpu/gpu-operator/debian/deb_folder/rules
new file mode 100644
index 000000000..ea8fba3d2
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/rules
@@ -0,0 +1,23 @@
+#!/usr/bin/make -f
+
+export HELM_VER = v3
+export PKG_VERSION = 1.8.1
+export DEBIAN_DESTDIR := $(CURDIR)/debian/tmp
+
+%:
+	dh $@
+
+override_dh_auto_build:
+	mkdir -p deployments/gpu-operator/assets/state-driver/
+	mkdir -p deployments/gpu-operator/assets/state-operator-validation/
+	cp assets/state-driver/0500_daemonset.yaml deployments/gpu-operator/assets/state-driver/0500_daemonset.yaml
+	cp assets/state-operator-validation/0500_daemonset.yaml deployments/gpu-operator/assets/state-operator-validation/0500_daemonset.yaml
+	helm lint deployments/gpu-operator
+	mkdir build_results
+	helm package --version ${HELM_VER}-${PKG_VERSION} --app-version v${PKG_VERSION} -d build_results deployments/gpu-operator
+
+override_dh_auto_install:
+	# Install the app tar file.
+	install -d -m 755 ${DEBIAN_DESTDIR}/opt/extracharts
+	install -p -D -m 644 build_results/gpu-operator-${HELM_VER}-${PKG_VERSION}.tgz ${DEBIAN_DESTDIR}/opt/extracharts
+	dh_install
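For context on the rules file above: helm names a packaged chart <name>-<chart-version>.tgz, which is how the single entry in gpu-operator.install lines up with the helm package invocation. A rough sketch of the same steps run by hand, assuming a helm v3 binary on PATH and an unpacked gpu-operator 1.8.1 source tree:

    # Sketch only: reproduce override_dh_auto_build outside of dpkg.
    helm lint deployments/gpu-operator
    mkdir -p build_results
    helm package --version v3-1.8.1 --app-version v1.8.1 \
        -d build_results deployments/gpu-operator
    # helm emits <chart-name>-<chart-version>.tgz, hence:
    ls build_results/gpu-operator-v3-1.8.1.tgz
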
diff --git a/gpu/gpu-operator/debian/meta_data.yaml b/gpu/gpu-operator/debian/meta_data.yaml
new file mode 100644
index 000000000..a6974e191
--- /dev/null
+++ b/gpu/gpu-operator/debian/meta_data.yaml
@@ -0,0 +1,11 @@
+---
+debname: gpu-operator
+debver: 1.8.1
+dl_path:
+  name: gpu-operator-v1.8.1.tar.gz
+  url: https://github.com/NVIDIA/gpu-operator/archive/refs/tags/v1.8.1.tar.gz
+  md5sum: 03c7346c724774ecd63d33ba7d8e110a
+  sha256sum: 42e08c95ce5b558a296cb31c98a6beeef3b551d47d236fa082db7fa5c44ad471
+revision:
+  dist: $STX_DIST
+  PKG_GITREVCOUNT: true
diff --git a/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch b/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch
new file mode 100644
index 000000000..2215c093c
--- /dev/null
+++ b/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch
@@ -0,0 +1,136 @@
+From 1094b6f1593ec454b3a6313ecf9fae53f8c66899 Mon Sep 17 00:00:00 2001
+From: Babak Sarashki
+Date: Sat, 6 Mar 2021 00:22:40 +0000
+Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts
+
+This feature allows inclusion of assets/ in the helm chart and their
+export to the gpu-operator pod through configmap volumeMounts.
+
+Signed-off-by: Babak Sarashki
+---
+ .../gpu-operator/templates/operator.yaml      | 44 +++++++++++++++++++
+ .../templates/operator_configmap.yaml         | 36 +++++++++++++++
+ deployments/gpu-operator/values.yaml          |  2 +
+ 3 files changed, 82 insertions(+)
+ create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
+
+diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
+index 1d81f74..c97b4b1 100644
+--- a/deployments/gpu-operator/templates/operator.yaml
++++ b/deployments/gpu-operator/templates/operator.yaml
+@@ -49,6 +49,44 @@ spec:
+         - name: host-os-release
+           mountPath: "/host-etc/os-release"
+           readOnly: true
++
++          {{- if eq .Values.operator.include_assets "include_assets" }}
++          {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
++            subPath: {{ printf "gfd_%s" (base $path) }}
++          {{- end }}
++
++          {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
++            subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
++          {{- end }}
++
++          {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
++            subPath: {{ printf "state_device_%s" (base $path) }}
++          {{- end }}
++
++          {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
++            subPath: {{ printf "state_device_validation_%s" (base $path) }}
++          {{- end }}
++
++          {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
++            subPath: {{ printf "state_driver_%s" (base $path) }}
++          {{- end }}
++
++          {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
++          - name: assets
++            mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
++            subPath: {{ printf "state_monitor_%s" (base $path) }}
++          {{- end }}
++          {{- end }}
+         livenessProbe:
+           httpGet:
+             path: /healthz
+@@ -72,6 +110,12 @@ spec:
+       - name: host-os-release
+         hostPath:
+           path: "/etc/os-release"
++      {{- if eq .Values.operator.include_assets "include_assets" }}
++      - name: assets
++        configMap:
++          name: operator-configmap
++      {{- end }}
++
+       {{- with .Values.operator.nodeSelector }}
+       nodeSelector:
+         {{- toYaml . | nindent 8 }}
+diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
+new file mode 100644
+index 0000000..61f366e
+--- /dev/null
++++ b/deployments/gpu-operator/templates/operator_configmap.yaml
+@@ -0,0 +1,36 @@
++{{- if eq .Values.operator.include_assets "include_assets" }}
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: operator-configmap
++data:
++{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
++{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
++{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
++{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
++{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
++{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
++{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++{{- end }}
+diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
+index 78a4757..6689636 100644
+--- a/deployments/gpu-operator/values.yaml
++++ b/deployments/gpu-operator/values.yaml
+@@ -70,6 +70,8 @@ operator:
+           values: [""]
+   logging:
+     timeEncoding: epoch
++  # Set "include_assets" true to include assets/gpu-operator with the helm chart
++  include_assets: ""
+   resources:
+     limits:
+       cpu: 500m
+-- 
+2.17.1
+
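As a sanity check of the asset-configmap scheme introduced by this patch, the chart can be rendered locally with the gate value set. The sketch below assumes the state-driver asset has already been staged into the chart by debian/rules; output shape is inferred from the templates above, not captured from a live run:

    # Sketch: render with assets included and inspect one generated volumeMount.
    helm template deployments/gpu-operator \
        --set operator.include_assets=include_assets \
        | grep -A 2 'mountPath: /opt/gpu-operator/state-driver/'
    # Expected shape, per the .Files.Glob loops above:
    #   - name: assets
    #     mountPath: /opt/gpu-operator/state-driver/0500_daemonset.yaml
    #     subPath: state_driver_0500_daemonset.yaml
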
diff --git a/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch b/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch
new file mode 100644
index 000000000..1c8b93809
--- /dev/null
+++ b/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch
@@ -0,0 +1,867 @@
+From 65ac63ca1bc8517f3f0c3560498de758149a3800 Mon Sep 17 00:00:00 2001
+From: Babak Sarashki
+Date: Sun, 7 Mar 2021 17:19:08 +0000
+Subject: [PATCH] enablement: support on starlingx cloud platform
+
+StarlingX is a cloud infrastructure software stack for the edge.
+It has an immutable file system and system configuration. For
+instance, changes made by the gpu-operator to set the containerd
+runtime would be overridden and must be avoided.
+
+This commit enables the gpu-operator on StarlingX (starlingx.io).
+The changes to the gpu-operator include bundling modified assets
+and a modified version of the nvidia-driver build script with the
+helm charts.
+
+The modifications include host-mounting the kernel headers and
+kernel build directory onto the respective mount points inside
+the driver pod namespace; modifying the nvidia-driver script to
+account for pre-installed kernel packages; and pre-installing the
+nvidia-toolkit version 1.7.1-ubi8. The defaultRuntime is expected
+to be containerd.
+
+To load the operator on starlingx:
+
+$ source /etc/platform/openrc
+[...(keystone_admin)]$ system service-parameter-add \
+    platform container_runtime \
+    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
+
+[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
+
+Signed-off-by: Babak Sarashki
+---
+ assets/state-driver/0500_daemonset.yaml       | 47 ++-
+ .../0500_daemonset.yaml                       | 18 ++
+ deployments/gpu-operator/Chart.yaml           |  3 +
+ .../charts/stx-toolkit-installer/.helmignore  | 23 ++
+ .../charts/stx-toolkit-installer/Chart.yaml   |  6 +
+ .../templates/_helpers.tpl                    |  6 +
+ .../templates/toolkit.yaml                    | 71 +++++
+ .../charts/stx-toolkit-installer/values.yaml  |  8 +
+ .../templates/build_configmap.yaml            | 291 ++++++++++++++++++
+ .../gpu-operator/templates/clusterpolicy.yaml |  4 +-
+ .../gpu-operator/templates/operator.yaml      | 52 +++-
+ .../templates/operator_confimap.yaml          | 61 ++++
+ deployments/gpu-operator/values.yaml          | 15 +-
+ 13 files changed, 583 insertions(+), 22 deletions(-)
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+ create mode 100644 deployments/gpu-operator/templates/build_configmap.yaml
+ create mode 100644 deployments/gpu-operator/templates/operator_confimap.yaml
+
+diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml
+index 4cd1617..c8aefd2 100644
+--- a/assets/state-driver/0500_daemonset.yaml
++++ b/assets/state-driver/0500_daemonset.yaml
+@@ -35,7 +35,6 @@ spec:
+             valueFrom:
+               fieldRef:
+                 fieldPath: spec.nodeName
+-          # always use runc for driver containers
+           - name: NVIDIA_VISIBLE_DEVICES
+             value: void
+       securityContext:
+@@ -72,8 +71,14 @@ spec:
+         - image: "FILLED BY THE OPERATOR"
+           imagePullPolicy: IfNotPresent
+           name: nvidia-driver-ctr
+-          command: ["nvidia-driver"]
+-          args: ["init"]
++          command: ["/bin/bash"]
++          args:
++            - "-c"
++            - "--"
++            - >
++              tar -C /usr/host-include -c . -f - | tar -C /usr/include -xvf -;
++              ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so;
++              /usr/local/bin/nvidia-driver init;
+           securityContext:
+             privileged: true
+             seLinuxOptions:
+@@ -94,6 +99,22 @@
+             - name: run-mellanox-drivers
+               mountPath: /run/mellanox/drivers
+               mountPropagation: HostToContainer
++            - name: host-modules
++              mountPath: /lib/modules
++              readOnly: false
++            - name: host-include
++              mountPath: /usr/host-include
++              readOnly: false
++            - name: host-kernel-devel
++              mountPath: /usr/src/kernels
++              readOnly: true
++            - name: host-usr-src
++              mountPath: /usr/host-src
++              readOnly: false
++            - name: vol11
++              mountPath: /usr/local/bin/nvidia-driver
++              subPath: nvidia-driver-build-script
++              readOnly: true
+         - image: "FILLED BY THE OPERATOR"
+           imagePullPolicy: IfNotPresent
+           name: nvidia-peermem-ctr
+@@ -157,4 +178,22 @@
+           hostPath:
+             path: /run/nvidia/validations
+             type: DirectoryOrCreate
+-
++        - name: host-modules
++          hostPath:
++            path: /lib/modules
++        - name: host-kernel-devel
++          hostPath:
++            path: /usr/src/kernels/
++        - name: host-include
++          hostPath:
++            path: /usr/include
++        - name: host-usr-src
++          hostPath:
++            path: /usr/src
++        - name: vol11
++          configMap:
++            name: nvidia-driver
++            defaultMode: 0777
++            items:
++              - key: nvidia-driver-build-script
++                path: nvidia-driver-build-script
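The hostPath volumes added above assume the StarlingX host already carries the kernel headers, module tree, and source directories the driver pod will bind-mount. A quick pre-check on a worker node (sketch only; paths taken from the volumes just added):

    # Sketch: verify the host paths the driver pod expects to bind-mount.
    ls -d /lib/modules/$(uname -r)   # host-modules
    ls -d /usr/src/kernels           # host-kernel-devel
    ls -d /usr/include               # host-include
    ls -d /usr/src                   # host-usr-src
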
+diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml
+index 266c9d6..ce226fa 100644
+--- a/assets/state-operator-validation/0500_daemonset.yaml
++++ b/assets/state-operator-validation/0500_daemonset.yaml
+@@ -75,6 +75,10 @@
+         - name: run-nvidia-validations
+           mountPath: /run/nvidia/validations
+           mountPropagation: Bidirectional
++        - name: vol12
++          mountPath: /var/nvidia/manifests/cuda-workload-validation.yaml
++          subPath: cuda-workload-validation.yaml
++          readOnly: true
+       - name: plugin-validation
+         image: "FILLED_BY_OPERATOR"
+         command: ['sh', '-c']
+@@ -98,6 +102,10 @@
+         - name: run-nvidia-validations
+           mountPath: /run/nvidia/validations
+           mountPropagation: Bidirectional
++        - name: vol12
++          mountPath: /var/nvidia/manifests/plugin-workload-validation.yaml
++          subPath: plugin-workload-validation.yaml
++          readOnly: true
+       containers:
+         - image: "FILLED_BY_OPERATOR"
+           name: nvidia-operator-validator
+@@ -113,6 +121,7 @@
+         - name: run-nvidia-validations
+           mountPath: "/run/nvidia/validations"
+           mountPropagation: Bidirectional
++      terminationGracePeriodSeconds: 60
+       volumes:
+         - name: run-nvidia-validations
+           hostPath:
+@@ -121,3 +130,12 @@
+         - name: driver-install-path
+           hostPath:
+             path: /run/nvidia/driver
++        - name: vol12
++          configMap:
++            name: nvidia-validator
++            defaultMode: 0444
++            items:
++              - key: cuda-workload-validation.yaml
++                path: cuda-workload-validation.yaml
++              - key: plugin-workload-validation.yaml
++                path: plugin-workload-validation.yaml
+diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml
+index 0b379a3..7b743e4 100644
+--- a/deployments/gpu-operator/Chart.yaml
++++ b/deployments/gpu-operator/Chart.yaml
+@@ -22,3 +22,6 @@ dependencies:
+     version: 0.8.2
+     repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
+     condition: nfd.enabled
++  - name: stx-toolkit-installer
++    version: 0.1.0
++    condition: toolkit-installer.enabled
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+new file mode 100644
+index 0000000..0e8a0eb
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+@@ -0,0 +1,23 @@
++# Patterns to ignore when building packages.
++# This supports shell glob matching, relative path matching, and
++# negation (prefixed with !). Only one pattern per line.
++.DS_Store
++# Common VCS dirs
++.git/
++.gitignore
++.bzr/
++.bzrignore
++.hg/
++.hgignore
++.svn/
++# Common backup files
++*.swp
++*.bak
++*.tmp
++*.orig
++*~
++# Various IDEs
++.project
++.idea/
++*.tmproj
++.vscode/
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+new file mode 100644
+index 0000000..c195c58
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+@@ -0,0 +1,6 @@
++apiVersion: v2
++appVersion: v0.1.0
++name: stx-toolkit-installer
++description: "Standalone nvidia toolkit installer for starlingx"
++type: application
++version: 1.7.1-ubi8
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+new file mode 100644
+index 0000000..b6f6274
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+@@ -0,0 +1,6 @@
++{{/*
++Full image name with tag
++*/}}
++{{- define "toolkit-installer.fullimage" -}}
++{{- .Values.toolkit.repository -}}/{{- .Values.toolkit.image -}}:{{- .Values.toolkit.version | default .Chart.AppVersion -}}
++{{- end }}
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+new file mode 100644
+index 0000000..3cbec11
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+@@ -0,0 +1,71 @@
++apiVersion: apps/v1
++kind: DaemonSet
++metadata:
++  name: toolkit-installer
++  namespace: kube-system
++  labels:
++    app.kubernetes.io/component: "toolkit-installer"
++    {{ $.Release.labels }}
++spec:
++  selector:
++    matchLabels:
++      {{ $.Release.labels }}
++      app.kubernetes.io/component: "toolkit-installer"
++      app: "toolkit-installer"
++  template:
++    metadata:
++      labels:
++        {{ $.Release.labels }}
++        app.kubernetes.io/component: "toolkit-installer"
++        app: "toolkit-installer"
++    spec:
++      containers:
++      - name: toolkit-daemon
++        image: {{ include "toolkit-installer.fullimage" . }}
++        lifecycle:
++          preStop:
++            exec:
++              command:
++              - "/bin/sh"
++              - "-c"
++              - "--"
++              - >
++                if [ $toolkit_force_clean == "true" ] ; then
++                  while [[ -f /var/run/nvidia/validations/cuda-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/driver-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/plugin-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/toolkit-ready ]] ;
++                  do
++                    echo "waiting for gpu pods to exit"
++                    sleep 10;
++                  done;
++                  sleep 60;
++                  rm -rf /usr/local/nvidia/toolkit;
++                fi;
++        command: ["/bin/bash"]
++        args:
++        - "-c"
++        - "--"
++        - >
++          ./toolkit install /usr/local/nvidia/toolkit;
++          sleep infinity;
++        env:
++        - name: toolkit_force_clean
++          value: {{ quote .Values.global.toolkit_force_clean }}
++        volumeMounts:
++        - name: toolkitdest
++          mountPath: /usr/local/nvidia
++          readOnly: false
++        - name: varrunnvidia
++          mountPath: /var/run/nvidia
++          readOnly: true
++      {{- if and .Values.global.toolkit_force_clean (eq .Values.global.toolkit_force_clean "true") }}
++      terminationGracePeriodSeconds: 120
++      {{- end }}
++      volumes:
++      - name: toolkitdest
++        hostPath:
++          path: /usr/local/nvidia
++      - name: varrunnvidia
++        hostPath:
++          path: /var/run/nvidia
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+new file mode 100644
+index 0000000..b898dc2
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+@@ -0,0 +1,8 @@
++toolkit:
++  repository: nvidia
++  image: container-toolkit
++  version: 1.7.1-ubi8
++  imagePullPolicy: IfNotPresent
++  imagePullSecrets: []
++  priorityClassName: system-node-critical
++  defaultRuntime: containerd
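Per the _helpers.tpl and values above, the toolkit-installer.fullimage helper should render to nvidia/container-toolkit:1.7.1-ubi8, and the subchart is gated by the toolkit-installer.enabled condition declared in the parent Chart.yaml. A sketch of a local render to confirm both (command and value names are assumptions based on the templates above, not a verified run):

    # Sketch: render the patched chart with the subchart condition enabled.
    helm template deployments/gpu-operator \
        --set toolkit-installer.enabled=true \
        --set global.toolkit_force_clean=false \
        | grep 'image: nvidia/container-toolkit:1.7.1-ubi8'
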
+diff --git a/deployments/gpu-operator/templates/build_configmap.yaml b/deployments/gpu-operator/templates/build_configmap.yaml
+new file mode 100644
+index 0000000..a7453a4
+--- /dev/null
++++ b/deployments/gpu-operator/templates/build_configmap.yaml
+@@ -0,0 +1,291 @@
++{{- if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
++apiVersion: v1
++kind: Namespace
++metadata:
++  name: "gpu-operator-resources"
++---
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: nvidia-driver
++  namespace: gpu-operator-resources
++data:
++  nvidia-driver-build-script: |
++    #! /bin/bash
++    # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
++    # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
++    # Apache-2.0.
++    # This script is from: https://gitlab.com/nvidia/container-images/driver.
++    # It is modified and included under configmap for platforms that require
++    # pre-installed packages. Such platforms have the option to modify the
++    # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
++    # further customizations.
++
++    set -eu
++
++    RUN_DIR=/run/nvidia
++    PID_FILE=${RUN_DIR}/${0##*/}.pid
++    DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
++    KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
++    KERNEL_VERSION="$(uname -r)"
++
++    _install_tools() {
++        yum clean all
++        yum install -y centos-release-scl
++        yum install -y epel-release
++        yum install -y devtoolset-8-build devtoolset-8-binutils devtoolset-8-gcc devtoolset-8-make
++    }
++
++    # Load the kernel modules and start persistenced.
++    _load_driver() {
++        echo "Loading IPMI kernel module..."
++        modprobe ipmi_msghandler
++
++        echo "Loading NVIDIA driver kernel modules..."
++        modprobe -a nvidia nvidia-uvm nvidia-modeset
++
++        echo "Starting NVIDIA persistence daemon..."
++        nvidia-persistenced --persistence-mode
++    }
++
++    # Stop persistenced and unload the kernel modules if they are currently loaded.
++    _unload_driver() {
++        local rmmod_args=()
++        local nvidia_deps=0
++        local nvidia_refs=0
++        local nvidia_uvm_refs=0
++        local nvidia_modeset_refs=0
++
++        echo "Stopping NVIDIA persistence daemon..."
++        if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
++            local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
++
++            kill -SIGTERM "${pid}"
++            for i in $(seq 1 10); do
++                kill -0 "${pid}" 2> /dev/null || break
++                sleep 0.1
++            done
++            if [ $i -eq 10 ]; then
++                echo "Could not stop NVIDIA persistence daemon" >&2
++                return 1
++            fi
++        fi
++
++        echo "Unloading NVIDIA driver kernel modules..."
++        if [ -f /sys/module/nvidia_modeset/refcnt ]; then
++            nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
++            rmmod_args+=("nvidia-modeset")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia_uvm/refcnt ]; then
++            nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
++            rmmod_args+=("nvidia-uvm")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia/refcnt ]; then
++            nvidia_refs=$(< /sys/module/nvidia/refcnt)
++            rmmod_args+=("nvidia")
++        fi
++        if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
++            echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
++            return 1
++        fi
++
++        if [ ${#rmmod_args[@]} -gt 0 ]; then
++            rmmod ${rmmod_args[@]}
++        fi
++        return 0
++    }
++
++    # Link and install the kernel modules from a precompiled package using the nvidia-installer.
++    _install_driver() {
++        local install_args=()
++
++        # Default is standard kernel.
++        if [ ! -z ${IGNORE_PREEMPT_RT_PRESENCE+x} ] ; then
++            echo "WARN: IGNORE_PREEMPT_RT_PRESENCE set"
++            echo "Build Target PREEMPT_RT best effort"
++        fi;
++
++        _install_tools
++        export PATH=/opt/rh/devtoolset-8/root/usr/bin${PATH:+:${PATH}}
++        export PCP_DIR=/opt/rh/devtoolset-8/root
++
++        echo "Installing NVIDIA driver kernel modules..."
++        cd /usr/src/nvidia-${DRIVER_VERSION}
++        # rm -rf /lib/modules/${KERNEL_VERSION}/video
++
++        if [ "${ACCEPT_LICENSE}" = "yes" ]; then
++            install_args+=("--accept-license")
++        fi
++        nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
++    }
++
++    # Mount the driver rootfs into the run directory with the exception of sysfs.
++    _mount_rootfs() {
++        echo "Mounting NVIDIA driver rootfs..."
++        mount --make-runbindable /sys
++        mount --make-private /sys
++        mkdir -p ${RUN_DIR}/driver
++        mount --rbind / ${RUN_DIR}/driver
++    }
++
++    # Unmount the driver rootfs from the run directory.
++    _unmount_rootfs() {
++        echo "Unmounting NVIDIA driver rootfs..."
++        if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
++            umount -l -R ${RUN_DIR}/driver
++        fi
++    }
++
++    init() {
++        echo -e "\n========== NVIDIA Software Installer ==========\n"
++        echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
++
++        exec 3> ${PID_FILE}
++        if ! flock -n 3; then
++            echo "An instance of the NVIDIA driver is already running, aborting"
++            exit 1
++        fi
++        echo $$ >&3
++
++        trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
++        trap "_shutdown" EXIT
++
++        _unload_driver || exit 1
++        _unmount_rootfs
++
++        (
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia.ko ] ||
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-uvm.ko ] ||
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-modeset.ko ]
++        ) && _install_driver
++
++        _load_driver
++        _mount_rootfs
++
++        echo "Done, now waiting for signal"
++        sleep infinity &
++        trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
++        trap - EXIT
++        while true; do wait $! || continue; done
++        exit 0
++    }
++
++    usage() {
++        cat >&2 <