From bd36b5f01871b26730f7280fcc234c729df7d7ee Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Nov 2023 10:59:51 +0000
Subject: [PATCH] Adds initial support for vGPUs

Adds support for configuring vGPUs on NVIDIA hardware.

Closes-Bug: #2028843
Change-Id: I4c8481f7429dbf50d51e26bd7acee24e16ee6c79
---
 ansible/inventory/group_vars/all/kolla        |   3 +
 ansible/inventory/group_vars/all/vgpu         |  16 +
 ansible/overcloud-host-configure.yml          |   1 +
 ansible/vgpu.yml                              |  42 +++
 doc/source/conf.py                            |   1 +
 doc/source/configuration/reference/index.rst  |   1 +
 doc/source/configuration/reference/vgpu.rst   | 310 ++++++++++++++++++
 etc/kayobe/inventory/groups                   |  15 +-
 etc/kayobe/vgpu.yml                           |  22 ++
 ...-nvidia-vgpu-support-b4c6f08a06c373c6.yaml |   7 +
 requirements.yml                              |   2 +
 11 files changed, 418 insertions(+), 2 deletions(-)
 create mode 100644 ansible/inventory/group_vars/all/vgpu
 create mode 100644 ansible/vgpu.yml
 create mode 100644 doc/source/configuration/reference/vgpu.rst
 create mode 100644 etc/kayobe/vgpu.yml
 create mode 100644 releasenotes/notes/adds-nvidia-vgpu-support-b4c6f08a06c373c6.yaml

diff --git a/ansible/inventory/group_vars/all/kolla b/ansible/inventory/group_vars/all/kolla
index 9efe74e47..93c7a895c 100644
--- a/ansible/inventory/group_vars/all/kolla
+++ b/ansible/inventory/group_vars/all/kolla
@@ -362,6 +362,9 @@ kolla_overcloud_inventory_top_level_group_map:
   compute:
     groups:
       - compute
+  compute-vgpu:
+    groups:
+      - compute-vgpu
   monitoring:
     groups:
       - monitoring
diff --git a/ansible/inventory/group_vars/all/vgpu b/ansible/inventory/group_vars/all/vgpu
new file mode 100644
index 000000000..43c828fc0
--- /dev/null
+++ b/ansible/inventory/group_vars/all/vgpu
@@ -0,0 +1,16 @@
+---
+################################################################################
+# VGPU configuration.
+
+# URL pointing to location of GRID driver. Examples are:
+# "file://path/on/ansible/control/host"
+# "http://webserver/NVIDIA-GRID-Linux-KVM-525.105.14-525.105.17-528.89.zip"
+vgpu_driver_url: ""
+
+# Flag to control whether the vGPU playbook should automatically reboot the
+# hypervisor. Note: this is necessary for the driver to be loaded correctly.
+# Caution should be used when changing this option.
+vgpu_do_reboot: true
+
+# Time to wait when rebooting the host before failing.
+vgpu_reboot_timeout: 600
diff --git a/ansible/overcloud-host-configure.yml b/ansible/overcloud-host-configure.yml
index 1fa8383cf..e203c8dac 100644
--- a/ansible/overcloud-host-configure.yml
+++ b/ansible/overcloud-host-configure.yml
@@ -31,3 +31,4 @@
 - import_playbook: "apparmor-libvirt.yml"
 - import_playbook: "swift-block-devices.yml"
 - import_playbook: "compute-libvirt-host.yml"
+- import_playbook: "vgpu.yml"
diff --git a/ansible/vgpu.yml b/ansible/vgpu.yml
new file mode 100644
index 000000000..ebda83fa6
--- /dev/null
+++ b/ansible/vgpu.yml
@@ -0,0 +1,42 @@
+---
+- name: Configure IOMMU
+  hosts: iommu
+  tags:
+    - iommu
+    - vgpu
+  tasks:
+    - import_role:
+        name: stackhpc.linux.iommu
+  handlers:
+    - name: Register that a reboot is required
+      set_fact:
+        kayobe_needs_reboot: true
+      listen: reboot
+
+- name: Configure NVIDIA VGPUs
+  hosts: vgpu
+  tags:
+    - vgpu
+  tasks:
+    - import_role:
+        name: stackhpc.linux.vgpu
+  handlers:
+    - name: Register that a reboot is required
+      set_fact:
+        kayobe_needs_reboot: true
+      listen: reboot
+
+- name: Reboot when required
+  hosts: iommu:vgpu
+  tags:
+    - reboot
+    - iommu
+    - vgpu
+  tasks:
+    - name: Reboot
+      reboot:
+        reboot_timeout: "{{ vgpu_reboot_timeout }}"
+      become: true
+      when:
+        - kayobe_needs_reboot | default(false) | bool
+        - vgpu_do_reboot | bool
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 9ed27550f..3c150fec6 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -127,5 +127,6 @@ openstackdocs_projects = [
     'ironic-python-agent',
     'kolla',
     'kolla-ansible',
+    'nova',
     'neutron',
 ]
diff --git a/doc/source/configuration/reference/index.rst b/doc/source/configuration/reference/index.rst
index 100b7ae84..f88c1f03a 100644
--- a/doc/source/configuration/reference/index.rst
+++ b/doc/source/configuration/reference/index.rst
@@ -23,4 +23,5 @@ options.
    docker-registry
    seed-custom-containers
    infra-vms
+   vgpu
    nova-cells
diff --git a/doc/source/configuration/reference/vgpu.rst b/doc/source/configuration/reference/vgpu.rst
new file mode 100644
index 000000000..693c89d21
--- /dev/null
+++ b/doc/source/configuration/reference/vgpu.rst
@@ -0,0 +1,310 @@
+============
+Virtual GPUs
+============
+
+Kayobe contains playbooks to configure virtualised GPUs on supported NVIDIA
+hardware. This allows you to statically create mdev devices that can be used
+by Nova to present a virtualised GPU to guest VMs. Known working GPUs are:
+
+- NVIDIA A100
+
+BIOS configuration
+==================
+
+Intel
+-----
+
+* Enable ``VT-x`` in the BIOS for virtualisation support.
+* Enable ``VT-d`` in the BIOS for IOMMU support.
+
+AMD
+---
+
+* Enable ``AMD-V`` in the BIOS for virtualisation support.
+* Enable ``AMD-Vi`` in the BIOS for IOMMU support.
+
+Example: Dell
+-------------
+
+Enabling SR-IOV with ``racadm``:
+
+.. code:: shell
+
+   /opt/dell/srvadmin/bin/idracadm7 set BIOS.IntegratedDevices.SriovGlobalEnable Enabled
+   /opt/dell/srvadmin/bin/idracadm7 jobqueue create BIOS.Setup.1-1
+
+Enabling CPU virtualisation with ``racadm``:
+
+.. code:: shell
+
+   /opt/dell/srvadmin/bin/idracadm7 set BIOS.ProcSettings.ProcVirtualization Enabled
+   /opt/dell/srvadmin/bin/idracadm7 jobqueue create BIOS.Setup.1-1
+
+Obtain driver from NVIDIA licensing portal
+==========================================
+
+Download the NVIDIA GRID driver from `here `__ (this requires a login).
+
+.. _Configuration:
+
+Configuration
+=============
+
+.. seealso::
+
+   For further context, please see:
+
+   * :ref:`configuration-kayobe`
+
+Add hosts with supported GPUs to the ``compute-vgpu`` group. If using bifrost
+and the ``kayobe overcloud inventory discover`` mechanism, this can be
+achieved with:
+
+.. code-block:: yaml
+   :caption: ``$KAYOBE_CONFIG_PATH/overcloud.yml``
+
+   overcloud_group_hosts_map:
+     compute-vgpu:
+       - "computegpu000"
+
+Configure the location of the NVIDIA driver:
+
+.. code-block:: yaml
+   :caption: ``$KAYOBE_CONFIG_PATH/vgpu.yml``
+
+   ---
+   vgpu_driver_url: "https://example.com/NVIDIA-GRID-Linux-KVM-525.105.14-525.105.17-528.89.zip"
+
+If you do not know which vGPU types your card supports, you can determine them
+by following :ref:`VGPU_Types`.
+
+You can then define ``group_vars`` describing the vGPU configuration:
+
+.. code-block:: yaml
+   :caption: ``$KAYOBE_CONFIG_PATH/inventory/group_vars/compute-vgpu/vgpu``
+
+   #nvidia-692 GRID A100D-4C
+   #nvidia-693 GRID A100D-8C
+   #nvidia-694 GRID A100D-10C
+   #nvidia-695 GRID A100D-16C
+   #nvidia-696 GRID A100D-20C
+   #nvidia-697 GRID A100D-40C
+   #nvidia-698 GRID A100D-80C
+   #nvidia-699 GRID A100D-1-10C
+   #nvidia-700 GRID A100D-2-20C
+   #nvidia-701 GRID A100D-3-40C
+   #nvidia-702 GRID A100D-4-40C
+   #nvidia-703 GRID A100D-7-80C
+   #nvidia-707 GRID A100D-1-10CME
+   vgpu_definitions:
+     # Configuring a MIG-backed vGPU
+     - pci_address: "0000:17:00.0"
+       mig_devices:
+         # This section describes how to partition the card using MIG. The key
+         # in the dictionary represents a MIG profile supported by your card
+         # and the value is the number of MIG devices of that type that you
+         # want to create. The vGPUs are then created on top of these MIG
+         # devices. The available profiles can be found in the NVIDIA
+         # documentation:
+         # https://docs.nvidia.com/grid/15.0/grid-vgpu-user-guide/index.html#virtual-gpu-types-grid-reference
+         "1g.10gb": 1
+         "2g.20gb": 3
+       virtual_functions:
+         # The mdev type is the NVIDIA identifier for a particular vGPU. When
+         # using MIG-backed vGPUs, these must match up with your MIG devices.
+         # See the NVIDIA vGPU types section in this document.
+         - mdev_type: nvidia-700
+           index: 0
+         - mdev_type: nvidia-700
+           index: 1
+         - mdev_type: nvidia-700
+           index: 2
+         - mdev_type: nvidia-699
+           index: 3
+     # Configuring a card in a time-sliced configuration (non-MIG-backed)
+     - pci_address: "0000:65:00.0"
+       virtual_functions:
+         - mdev_type: nvidia-697
+           index: 0
+         - mdev_type: nvidia-697
+           index: 1
+
+To apply this configuration, use:
+
+.. code:: shell
+
+   (kayobe) $ kayobe overcloud host configure -t vgpu
+
+.. _VGPU_Types:
+
+NVIDIA vGPU types
+=================
+
+.. seealso::
+
+   For further context, please see:
+
+   * `NVIDIA docs on vGPU types `__
+   * :ref:`configuration-kolla-ansible`
+
+The NVIDIA vGPU driver must be installed before you can query the available
+vGPU types. The driver can be installed without creating any vGPUs by defining
+an empty list of virtual functions:
+
+.. code-block:: yaml
+   :caption: ``$KAYOBE_CONFIG_PATH/inventory/group_vars/compute-vgpu/vgpu``
+
+   vgpu_definitions:
+     - pci_address: "0000:17:00.0"
+       virtual_functions: []
+
+See :ref:`Configuration`. You can then use ``mdevctl`` to query the available
+vGPU types:
+
+.. code:: shell
+
+   mdevctl types
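+
+The output lists, for each physical GPU or virtual function, the supported
+mdev types together with a human-readable name. The snippet below is purely
+illustrative (trimmed, with made-up values); the exact types, names and
+instance counts depend on your card, its MIG configuration and the driver
+version:
+
+.. code:: shell
+
+   # Illustrative example only - the values will differ on your system.
+   0000:17:00.4
+     nvidia-700
+       Available instances: 1
+       Device API: vfio-pci
+       Name: GRID A100D-2-20C
+       Description: num_heads=1, frl_config=60, framebuffer=20480M, max_instance=4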
+
+.. _VGPU_Kolla_Configuration:
+
+Kolla Ansible configuration
+===========================
+
+.. seealso::
+
+   For further context, please see:
+
+   * :nova-doc:`Attaching virtual GPU devices to guests in the Nova documentation `
+   * :ref:`configuration-kolla-ansible`
+
+To use the mdev devices that were created, modify ``nova.conf`` to add a list
+of mdev devices that can be passed through to guests:
+
+.. code-block:: ini
+   :caption: ``$KAYOBE_CONFIG_PATH/kolla/config/nova/nova-compute.conf``
+
+   {% raw %}
+   {% if inventory_hostname in groups['compute-vgpu'] %}
+   [devices]
+   enabled_mdev_types = nvidia-700, nvidia-699, nvidia-697
+
+   [mdev_nvidia-700]
+   device_addresses = 0000:17:00.4,0000:17:00.5,0000:17:00.6
+   mdev_class = CUSTOM_NVIDIA_700
+
+   [mdev_nvidia-699]
+   device_addresses = 0000:17:00.7
+   mdev_class = CUSTOM_NVIDIA_699
+
+   [mdev_nvidia-697]
+   device_addresses = 0000:65:00.4,0000:65:00.5
+   mdev_class = CUSTOM_NVIDIA_697
+   {% endif %}
+   {% endraw %}
+
+You will need to adjust the PCI addresses to match the virtual function
+addresses. These can be obtained by checking the mdevctl configuration after
+applying the :ref:`configuration <Configuration>`:
+
+.. code-block:: shell
+
+   # mdevctl list
+
+   73269d0f-b2c9-438d-8f28-f9e4bc6c6995 0000:17:00.4 nvidia-700 manual (defined)
+   dc352ef3-efeb-4a5d-a48e-912eb230bc76 0000:17:00.5 nvidia-700 manual (defined)
+   a464fbae-1f89-419a-a7bd-3a79c7b2eef4 0000:17:00.6 nvidia-700 manual (defined)
+   f3b823d3-97c8-4e0a-ae1b-1f102dcb3bce 0000:17:00.7 nvidia-699 manual (defined)
+   330be289-ba3f-4416-8c8a-b46ba7e51284 0000:65:00.4 nvidia-700 manual (defined)
+   1ba5392c-c61f-4f48-8fb1-4c6b2bbb0673 0000:65:00.5 nvidia-700 manual (defined)
+   f6868020-eb3a-49c6-9701-6c93e4e3fa9c 0000:65:00.6 nvidia-700 manual (defined)
+   00501f37-c468-5ba4-8be2-8d653c4604ed 0000:65:00.7 nvidia-699 manual (defined)
+
+The ``mdev_class`` maps to a resource class that you can set in your flavor
+definition. Note that if you only define a single mdev type on a given
+hypervisor, the ``mdev_class`` configuration option is silently ignored and
+the ``VGPU`` resource class is used instead (see `bug 1943934 `__).
+
+To apply the configuration to Nova:
+
+.. code:: shell
+
+   (kayobe) $ kayobe overcloud service deploy -kt nova
+
+OpenStack flavors
+=================
+
+Define some flavors that request the resource class that was configured in
+``nova.conf``. An example definition, which can be used with the
+``openstack.cloud.compute_flavor`` Ansible module, is shown below:
+
+.. code-block:: yaml
+
+   openstack.cloud.compute_flavor:
+     name: "vgpu.a100.2g.20gb"
+     ram: 65536
+     disk: 30
+     vcpus: 8
+     is_public: false
+     extra_specs:
+       hw:cpu_policy: "dedicated"
+       hw:cpu_thread_policy: "prefer"
+       hw:mem_page_size: "1GB"
+       hw:cpu_sockets: 2
+       hw:numa_nodes: 8
+       hw_rng:allowed: "True"
+       resources:CUSTOM_NVIDIA_700: "1"
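+
+If you prefer to create the flavor by hand, a roughly equivalent OpenStack CLI
+invocation is sketched below. This is illustrative only: the flavor name and
+sizes are examples, and the resource class should be adjusted to match your
+``mdev_class``:
+
+.. code:: shell
+
+   openstack flavor create vgpu.a100.2g.20gb \
+     --ram 65536 --disk 30 --vcpus 8 --private \
+     --property hw:cpu_policy=dedicated \
+     --property hw:cpu_thread_policy=prefer \
+     --property hw:mem_page_size=1GB \
+     --property resources:CUSTOM_NVIDIA_700=1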
+
+Changing VGPU device types
+==========================
+
+This example converts the second card to an NVIDIA-698 (whole card) vGPU type.
+The hypervisor should be empty so that the mdevs can be deleted freely; if
+this is not the case, you will need to check which mdevs are in use and
+proceed with extreme caution. First, clean up the mdev definitions to make
+room for the new device:
+
+.. code:: shell
+
+   [stack@computegpu000 ~]$ sudo mdevctl list
+   5c630867-a673-5d75-aa31-a499e6c7cb19 0000:21:00.4 nvidia-697 manual (defined)
+   eaa6e018-308e-58e2-b351-aadbcf01f5a8 0000:21:00.5 nvidia-697 manual (defined)
+   72291b01-689b-5b7a-9171-6b3480deabf4 0000:81:00.4 nvidia-697 manual (defined)
+   0a47ffd1-392e-5373-8428-707a4e0ce31a 0000:81:00.5 nvidia-697 manual (defined)
+
+   [stack@computegpu000 ~]$ sudo mdevctl stop --uuid 72291b01-689b-5b7a-9171-6b3480deabf4
+   [stack@computegpu000 ~]$ sudo mdevctl stop --uuid 0a47ffd1-392e-5373-8428-707a4e0ce31a
+
+   [stack@computegpu000 ~]$ sudo mdevctl undefine --uuid 0a47ffd1-392e-5373-8428-707a4e0ce31a
+   [stack@computegpu000 ~]$ sudo mdevctl undefine --uuid 72291b01-689b-5b7a-9171-6b3480deabf4
+
+   [stack@computegpu000 ~]$ sudo mdevctl list --defined
+   5c630867-a673-5d75-aa31-a499e6c7cb19 0000:21:00.4 nvidia-697 manual (active)
+   eaa6e018-308e-58e2-b351-aadbcf01f5a8 0000:21:00.5 nvidia-697 manual (active)
+
+   # We can re-use the first virtual function
+
+Secondly, remove the systemd units that start the mdev devices:
+
+.. code:: shell
+
+   [stack@computegpu000 ~]$ sudo rm /etc/systemd/system/multi-user.target.wants/nvidia-mdev@0a47ffd1-392e-5373-8428-707a4e0ce31a.service
+   [stack@computegpu000 ~]$ sudo rm /etc/systemd/system/multi-user.target.wants/nvidia-mdev@72291b01-689b-5b7a-9171-6b3480deabf4.service
+
+Adapt your :ref:`Kayobe <Configuration>` and
+:ref:`Kolla Ansible <VGPU_Kolla_Configuration>` configuration to match the
+desired state and then re-run host configure:
+
+.. code:: shell
+
+   (kayobe) $ kayobe overcloud host configure --tags vgpu --limit computegpu000
+
+Check the result:
+
+.. code:: shell
+
+   [stack@computegpu000 ~]$ mdevctl list
+   5c630867-a673-5d75-aa31-a499e6c7cb19 0000:21:00.4 nvidia-697 manual
+   eaa6e018-308e-58e2-b351-aadbcf01f5a8 0000:21:00.5 nvidia-697 manual
+   72291b01-689b-5b7a-9171-6b3480deabf4 0000:81:00.4 nvidia-698 manual
+
+Reconfigure Nova to match the change:
+
+.. code:: shell
+
+   (kayobe) $ kayobe overcloud service reconfigure -kt nova --kolla-limit computegpu000 --skip-prechecks
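+
+As an optional sanity check, you can confirm that the new resource class is
+now reported to Placement. A minimal sketch, assuming the ``osc-placement``
+OpenStack client plugin is installed and using the resource class from the
+example above:
+
+.. code:: shell
+
+   openstack allocation candidate list --resource CUSTOM_NVIDIA_700=1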
diff --git a/etc/kayobe/inventory/groups b/etc/kayobe/inventory/groups
index 16619fed8..645a550b5 100644
--- a/etc/kayobe/inventory/groups
+++ b/etc/kayobe/inventory/groups
@@ -41,8 +41,11 @@ controllers
 [storage]
 # Empty group to provide declaration of storage group.
 
-[compute]
-# Empty group to provide declaration of compute group.
+[compute-vgpu]
+# Empty group to provide declaration of compute-vgpu group.
+
+[compute:children]
+compute-vgpu
 
 [overcloud:children]
 controllers
@@ -51,6 +54,14 @@
 monitoring
 storage
 compute
 
+###############################################################################
+# Feature control groups
+[vgpu:children]
+compute-vgpu
+
+[iommu:children]
+vgpu
+
 ###############################################################################
 # Service groups.
diff --git a/etc/kayobe/vgpu.yml b/etc/kayobe/vgpu.yml
new file mode 100644
index 000000000..c304f123c
--- /dev/null
+++ b/etc/kayobe/vgpu.yml
@@ -0,0 +1,22 @@
+---
+################################################################################
+# VGPU configuration.
+
+# URL pointing to location of GRID driver. Examples are:
+# "file://path/on/ansible/control/host"
+# "http://webserver/NVIDIA-GRID-Linux-KVM-525.105.14-525.105.17-528.89.zip"
+# Default is: None.
+#vgpu_driver_url:
+
+# Flag to control whether the vGPU playbook should automatically reboot the
+# hypervisor. Note: this is necessary for the driver to be loaded correctly.
+# Caution should be used when changing this option. Default is true.
+#vgpu_do_reboot:
+
+# Time to wait when rebooting the host before failing.
+# Default is 600 (seconds).
+#vgpu_reboot_timeout:
+
+###############################################################################
+# Dummy variable to allow Ansible to accept this file.
+workaround_ansible_issue_8743: yes
diff --git a/releasenotes/notes/adds-nvidia-vgpu-support-b4c6f08a06c373c6.yaml b/releasenotes/notes/adds-nvidia-vgpu-support-b4c6f08a06c373c6.yaml
new file mode 100644
index 000000000..b67cccf1d
--- /dev/null
+++ b/releasenotes/notes/adds-nvidia-vgpu-support-b4c6f08a06c373c6.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Adds initial support for configuring NVIDIA vGPUs. For more information,
+    please see the `new documentation
+    `_
+    for this feature.
diff --git a/requirements.yml b/requirements.yml
index 6a05407f4..1e9807561 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -7,6 +7,8 @@ collections:
     version: 1.1.1
   - name: openstack.cloud
     version: '<3'
+  - name: stackhpc.linux
+    version: 1.1.0
 roles:
   - src: ahuffman.resolv