From 0d6690b9bc73669bd61620efb73afaabd4e3dfc2 Mon Sep 17 00:00:00 2001
From: Elisamara Aoki Goncalves
Date: Mon, 26 Sep 2022 16:31:01 -0300
Subject: [PATCH] Remove gpu-operator documentation details (r6,r7,dsR6,dsR7)

Signed-off-by: Elisamara Aoki Goncalves
Change-Id: I6b963f6a294843e829041c4ba00834866610e19e
---
 ...vidia-gpu-operator-for-pci-passthrough.rst | 162 ------------------
 .../delete-the-gpu-operator.rst               |  58 -------
 .../index-node-mgmt-kub-5ff5993b9c60.rst      |  10 --
 3 files changed, 230 deletions(-)
 delete mode 100644 doc/source/node_management/kubernetes/hardware_acceleration_devices/configure-nvidia-gpu-operator-for-pci-passthrough.rst
 delete mode 100644 doc/source/node_management/kubernetes/hardware_acceleration_devices/delete-the-gpu-operator.rst

diff --git a/doc/source/node_management/kubernetes/hardware_acceleration_devices/configure-nvidia-gpu-operator-for-pci-passthrough.rst b/doc/source/node_management/kubernetes/hardware_acceleration_devices/configure-nvidia-gpu-operator-for-pci-passthrough.rst
deleted file mode 100644
index 1be3fcf37..000000000
--- a/doc/source/node_management/kubernetes/hardware_acceleration_devices/configure-nvidia-gpu-operator-for-pci-passthrough.rst
+++ /dev/null
@@ -1,162 +0,0 @@

.. fgy1616003207054
.. _configure-nvidia-gpu-operator-for-pci-passthrough:

=================================================
Configure NVIDIA GPU Operator for PCI Passthrough
=================================================

|release-caveat|

This section provides instructions for configuring the NVIDIA GPU Operator.

.. rubric:: |context|

.. note::
   NVIDIA GPU Operator is only supported for the standard performance kernel
   profile. There is no support for the low-latency performance kernel
   profile.

NVIDIA GPU Operator automates the installation, maintenance, and management
of the NVIDIA software needed to provision NVIDIA GPUs, as well as the
provisioning of pods that require nvidia.com/gpu resources.

NVIDIA GPU Operator is delivered as a Helm chart that installs a number of
services and pods to automate the provisioning of NVIDIA GPUs with the needed
NVIDIA software components. These components include:

.. _fgy1616003207054-ul-sng-blk-z4b:

- NVIDIA drivers \(to enable CUDA, a parallel computing platform\)

- Kubernetes device plugin for GPUs

- NVIDIA Container Runtime

- Automatic node labelling

- DCGM \(NVIDIA Data Center GPU Manager\) based monitoring

.. rubric:: |prereq|

Download the **gpu-operator-v3-1.8.1.4.tgz** file from
`http://mirror.starlingx.cengn.ca/mirror/starlingx/release/latest_release/centos/containers/inputs/downloads/
<http://mirror.starlingx.cengn.ca/mirror/starlingx/release/latest_release/centos/containers/inputs/downloads/>`__.

Use the following steps to configure the GPU Operator container:

.. rubric:: |proc|

#. Lock the host\(s\).

   .. code-block:: none

      ~(keystone_admin)]$ system host-lock <hostname>

#. Configure the container runtime host path to the NVIDIA runtime, which
   will be installed by the GPU Operator Helm deployment.

   .. code-block:: none

      ~(keystone_admin)]$ system service-parameter-add platform container_runtime custom_container_runtime=nvidia:/usr/local/nvidia/toolkit/nvidia-container-runtime

#. Unlock the host\(s\). Once unlocked, the host reboots automatically.

   .. code-block:: none

      ~(keystone_admin)]$ system host-unlock <hostname>

#. Install the GPU Operator Helm charts.

   .. code-block:: none

      ~(keystone_admin)]$ helm install gpu-operator /path/to/gpu-operator-v3-1.8.1.4.tgz

#. Check that the GPU Operator is deployed using the following command.

   .. code-block:: none

      ~(keystone_admin)]$ kubectl get pods -A
      NAMESPACE                NAME                                                           READY   STATUS      RESTARTS   AGE
      ..............
      default                  gpu-operator-5dddfcbb58-xpwbh                                  1/1     Running     0          3m13s
      default                  gpu-operator-node-feature-discovery-master-58d884d5cc-56qch   1/1     Running     0          3m13s
      default                  gpu-operator-node-feature-discovery-worker-p495j               1/1     Running     0          3m13s
      gpu-operator-resources   gpu-feature-discovery-swmj8                                    1/1     Running     0          2m52s
      gpu-operator-resources   nvidia-cuda-validator-zcfp9                                    0/1     Completed   0          2m31s
      gpu-operator-resources   nvidia-dcgm-9447k                                              1/1     Running     0          2m52s
      gpu-operator-resources   nvidia-dcgm-exporter-9c82q                                     1/1     Running     0          2m52s
      gpu-operator-resources   nvidia-device-plugin-daemonset-ljm4q                           1/1     Running     0          2m52s
      gpu-operator-resources   nvidia-device-plugin-validator-j9kjz                           0/1     Completed   0          2m25s
      gpu-operator-resources   nvidia-driver-daemonset-qph2s                                  1/1     Running     0          2m52s
      gpu-operator-resources   nvidia-operator-validator-dw6sc                                1/1     Running     0          2m52s
      ..........
      kube-system              toolkit-installer-xzrt8                                        1/1     Running     0          3m13s

   The device plugin validator pod is marked Completed.

#. Check that the nvidia.com/gpu resources are available using the following
   command. An optional RuntimeClass check is sketched after this procedure.

   .. code-block:: none

      ~(keystone_admin)]$ kubectl describe nodes | grep nvidia

#. Create a pod that uses the NVIDIA RuntimeClass and requests an
   nvidia.com/gpu resource. Update the nvidia-usage-example-pod.yml file to
   launch a pod with an NVIDIA GPU. For example:

   .. code-block:: none

      cat <<EOF > nvidia-usage-example-pod.yml
      apiVersion: v1
      kind: Pod
      metadata:
        name: nvidia-usage-example-pod
      spec:
        runtimeClassName: nvidia
        containers:
        - name: nvidia-usage-example-pod
          image: nvidia/samples:cuda10.2-vectorAdd
          imagePullPolicy: IfNotPresent
          command: [ "/bin/bash", "-c", "--" ]
          args: [ "while true; do sleep 300000; done;" ]
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
      EOF

#. Create the pod using the following command.

   .. code-block:: none

      ~(keystone_admin)]$ kubectl create -f nvidia-usage-example-pod.yml

#. Check that the pod has been set up correctly. The status of the NVIDIA
   device is displayed in the nvidia-smi output.

   .. code-block:: none

      ~(keystone_admin)]$ kubectl exec -it nvidia-usage-example-pod -- nvidia-smi
      +-----------------------------------------------------------------------------+
      | NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
      |-------------------------------+----------------------+----------------------+
      | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
      | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
      |                               |                      |               MIG M. |
      |===============================+======================+======================|
      |   0  Tesla T4            On   | 00000000:AF:00.0 Off |                    0 |
      | N/A   28C    P8    14W /  70W |      0MiB / 15109MiB |      0%      Default |
      |                               |                      |                  N/A |
      +-------------------------------+----------------------+----------------------+

      +-----------------------------------------------------------------------------+
      | Processes:                                                                  |
      |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
      |        ID   ID                                                   Usage      |
      |=============================================================================|
      |  No running processes found                                                 |
      +-----------------------------------------------------------------------------+

   For information on deleting the GPU Operator, see :ref:`Delete the GPU
   Operator <delete-the-gpu-operator>`.
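
As an optional sanity check \(a minimal sketch, not part of the original
procedure; it assumes the GPU Operator created the default ``nvidia``
RuntimeClass\), confirm that the RuntimeClass exists and list the allocatable
nvidia.com/gpu count per node before launching pods:

.. code-block:: none

   # Confirm the RuntimeClass installed by the GPU Operator is present.
   ~(keystone_admin)]$ kubectl get runtimeclass nvidia

   # Print each node name with its allocatable GPU count; the dots in the
   # resource key must be escaped inside the jsonpath expression.
   ~(keystone_admin)]$ kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}'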
diff --git a/doc/source/node_management/kubernetes/hardware_acceleration_devices/delete-the-gpu-operator.rst b/doc/source/node_management/kubernetes/hardware_acceleration_devices/delete-the-gpu-operator.rst
deleted file mode 100644
index 70d231758..000000000
--- a/doc/source/node_management/kubernetes/hardware_acceleration_devices/delete-the-gpu-operator.rst
+++ /dev/null
@@ -1,58 +0,0 @@

.. nsr1616019467549
.. _delete-the-gpu-operator:

=======================
Delete the GPU Operator
=======================

|release-caveat|

Use the commands in this section to delete the GPU Operator, if required.

.. rubric:: |prereq|

Ensure that all user-generated pods with access to `nvidia.com/gpu` resources
are deleted first.

.. rubric:: |proc|

#. Remove the GPU Operator pods from the system using the following command:

   .. code-block:: none

      ~(keystone_admin)]$ helm delete gpu-operator

#. Remove the service parameter platform `container\_runtime
   custom\_container\_runtime` from the system, using the following commands:

   #. Lock the host\(s\).

      .. code-block:: none

         ~(keystone_admin)]$ system host-lock <hostname>

   #. List the service parameters using the following command.

      .. code-block:: none

         ~(keystone_admin)]$ system service-parameter-list

   #. Remove the service parameter platform `container\_runtime
      custom\_container\_runtime` from the system, using the following
      command.

      .. code-block:: none

         ~(keystone_admin)]$ system service-parameter-delete <uuid>

      where `<uuid>` is the ID of the service parameter, for example,
      3c509c97-92a6-4882-a365-98f1599a8f56.

   #. Unlock the host\(s\).

      .. code-block:: none

         ~(keystone_admin)]$ system host-unlock <hostname>

For information on configuring the GPU Operator, see :ref:`Configure NVIDIA
GPU Operator for PCI Passthrough
<configure-nvidia-gpu-operator-for-pci-passthrough>`. A sketch for verifying
the removal is included at the end of this section.

diff --git a/doc/source/node_management/kubernetes/index-node-mgmt-kub-5ff5993b9c60.rst b/doc/source/node_management/kubernetes/index-node-mgmt-kub-5ff5993b9c60.rst
index 0f5a0079a..6ab76248c 100644
--- a/doc/source/node_management/kubernetes/index-node-mgmt-kub-5ff5993b9c60.rst
+++ b/doc/source/node_management/kubernetes/index-node-mgmt-kub-5ff5993b9c60.rst
@@ -346,16 +346,6 @@ N3000 and ACC100 replacement
     hardware_acceleration_devices/fec-replacement-with-different-vendor-or-device-id-b1ab1440e15f
     hardware_acceleration_devices/n3000-and-acc100-replacement-with-the-same-vendor-and-device-id-cccabcdc5d43

-*******************
-NVIDIA GPU Operator
-*******************
-
-.. toctree::
-   :maxdepth: 1
-
-   hardware_acceleration_devices/configure-nvidia-gpu-operator-for-pci-passthrough
-   hardware_acceleration_devices/delete-the-gpu-operator
-
------------------------
Host hardware management
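
As an optional follow-up to the delete procedure above \(a minimal sketch, not
part of the original documents; the grep patterns are illustrative\), verify
that the removal completed:

.. code-block:: none

   # The Helm release should no longer be listed.
   ~(keystone_admin)]$ helm list | grep gpu-operator

   # No GPU Operator pods should remain in the gpu-operator-resources namespace.
   ~(keystone_admin)]$ kubectl get pods -n gpu-operator-resources

   # The custom container runtime service parameter should be gone.
   ~(keystone_admin)]$ system service-parameter-list | grep custom_container_runtime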