Add gpu support for k8s/openshift pods
This adds the option to request GPUs for Kubernetes and OpenShift pods. Since the resource name depends on the GPU vendor and the cluster installation, this option is left for the user to define in the node pool. To leverage the ability of some schedulers to use fractional GPUs, the actual GPU value is read as a string. For GPUs, requests and limits cannot be decoupled (cf. https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/), so the same value will be used for requests and limits. Change-Id: Ibe33b06c374a431f164080edb34c3a501c360df7
This commit is contained in:
parent
128f7bf237
commit
3fa6821437
@ -370,6 +370,25 @@ Selecting the kubernetes driver adds the following options to the
|
||||
label type; specifies the ephemeral-storage limit in
|
||||
MB for the pod.
|
||||
|
||||
.. attr:: gpu
|
||||
:type: float
|
||||
|
||||
Only used by the
|
||||
:value:`providers.[kubernetes].pools.labels.type.pod`
|
||||
label type; specifies the amount of GPU resources allocated to the pod.
|
||||
This will be used to set both requests and limits to the same
|
||||
value, based on how kubernetes assigns gpu resources:
|
||||
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
|
||||
|
||||
.. attr:: gpu-resource
|
||||
:type: str
|
||||
|
||||
Only used by the
|
||||
:value:`providers.[kubernetes].pools.labels.type.pod`
|
||||
label type; specifies the custom schedulable resource
|
||||
associated with the installed gpu that is available
|
||||
in the cluster.
|
||||
|
||||
.. attr:: env
|
||||
:type: list
|
||||
:default: []
|
||||
|
@ -292,6 +292,21 @@ Selecting the openshift pods driver adds the following options to the
|
||||
|
||||
Specifies the ephemeral-storage limit in MB for the pod.
|
||||
|
||||
.. attr:: gpu
|
||||
:type: float
|
||||
|
||||
Specifies the amount of GPU resources allocated to the pod.
|
||||
This will be used to set both requests and limits to the same
|
||||
value, based on how kubernetes assigns gpu resources:
|
||||
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
|
||||
|
||||
.. attr:: gpu-resource
|
||||
:type: str
|
||||
|
||||
Specifies the custom schedulable resource
|
||||
associated with the installed gpu that is available
|
||||
in the cluster.
|
||||
|
||||
.. attr:: python-path
|
||||
:type: str
|
||||
:default: auto
|
||||
|
@ -381,6 +381,25 @@ Selecting the openshift driver adds the following options to the
|
||||
label type; specifies the ephemeral-storage limit in
|
||||
MB for the pod.
|
||||
|
||||
.. attr:: gpu
|
||||
:type: float
|
||||
|
||||
Only used by the
|
||||
:value:`providers.[openshift].pools.labels.type.pod`
|
||||
label type; specifies the amount of GPU resources allocated to the pod.
|
||||
This will be used to set both requests and limits to the same
|
||||
value, based on how kubernetes assigns gpu resources:
|
||||
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
|
||||
|
||||
.. attr:: gpu-resource
|
||||
:type: str
|
||||
|
||||
Only used by the
|
||||
:value:`providers.[openshift].pools.labels.type.pod`
|
||||
label type; specifies the custom schedulable resource
|
||||
associated with the installed gpu that is available
|
||||
in the cluster.
|
||||
|
||||
.. attr:: env
|
||||
:type: list
|
||||
:default: []
|
||||
|
@ -82,6 +82,8 @@ class KubernetesPool(ConfigPool):
|
||||
'memory-limit', default_memory_limit)
|
||||
pl.storage_limit = label.get(
|
||||
'storage-limit', default_storage_limit)
|
||||
pl.gpu = label.get('gpu')
|
||||
pl.gpu_resource = label.get('gpu-resource')
|
||||
pl.env = label.get('env', [])
|
||||
pl.node_selector = label.get('node-selector')
|
||||
pl.privileged = label.get('privileged')
|
||||
@ -143,6 +145,8 @@ class KubernetesProviderConfig(ProviderConfig):
|
||||
'cpu-limit': int,
|
||||
'memory-limit': int,
|
||||
'storage-limit': int,
|
||||
'gpu': float,
|
||||
'gpu-resource': str,
|
||||
'env': [env_var],
|
||||
'node-selector': dict,
|
||||
'privileged': bool,
|
||||
|
@ -333,6 +333,9 @@ class KubernetesProvider(Provider, QuotaSupport):
|
||||
limits['memory'] = '%dMi' % int(label.memory_limit)
|
||||
if label.storage_limit:
|
||||
limits['ephemeral-storage'] = '%dM' % int(label.storage_limit)
|
||||
if label.gpu_resource and label.gpu:
|
||||
requests[label.gpu_resource] = '%.2f' % label.gpu
|
||||
limits[label.gpu_resource] = '%.2f' % label.gpu
|
||||
resources = {}
|
||||
if requests:
|
||||
resources['requests'] = requests
|
||||
@ -428,6 +431,8 @@ class KubernetesProvider(Provider, QuotaSupport):
|
||||
resources["ram"] = provider_label.memory
|
||||
if provider_label.storage:
|
||||
resources["ephemeral-storage"] = provider_label.storage
|
||||
if provider_label.gpu and provider_label.gpu_resource:
|
||||
resources[provider_label.gpu_resource] = provider_label.gpu
|
||||
resources.update(provider_label.extra_resources)
|
||||
return QuotaInformation(instances=1, **resources)
|
||||
|
||||
|
@ -83,6 +83,8 @@ class OpenshiftPool(ConfigPool):
|
||||
'memory-limit', default_memory_limit)
|
||||
pl.storage_limit = label.get(
|
||||
'storage-limit', default_storage_limit)
|
||||
pl.gpu = label.get('gpu')
|
||||
pl.gpu_resource = label.get('gpu-resource')
|
||||
pl.python_path = label.get('python-path', 'auto')
|
||||
pl.shell_type = label.get('shell-type')
|
||||
pl.env = label.get('env', [])
|
||||
@ -149,6 +151,8 @@ class OpenshiftProviderConfig(ProviderConfig):
|
||||
'cpu-limit': int,
|
||||
'memory-limit': int,
|
||||
'storage-limit': int,
|
||||
'gpu': float,
|
||||
'gpu-resource': str,
|
||||
'python-path': str,
|
||||
'shell-type': str,
|
||||
'env': [env_var],
|
||||
|
@ -253,6 +253,9 @@ class OpenshiftProvider(Provider, QuotaSupport):
|
||||
limits['memory'] = '%dMi' % int(label.memory_limit)
|
||||
if label.storage_limit:
|
||||
limits['ephemeral-storage'] = '%dM' % int(label.storage_limit)
|
||||
if label.gpu_resource and label.gpu:
|
||||
requests[label.gpu_resource] = '%.2f' % label.gpu
|
||||
limits[label.gpu_resource] = '%.2f' % label.gpu
|
||||
resources = {}
|
||||
if requests:
|
||||
resources['requests'] = requests
|
||||
@ -344,6 +347,8 @@ class OpenshiftProvider(Provider, QuotaSupport):
|
||||
resources["ram"] = provider_label.memory
|
||||
if provider_label.storage:
|
||||
resources["ephemeral-storage"] = provider_label.storage
|
||||
if provider_label.gpu and provider_label.gpu_resource:
|
||||
resources[provider_label.gpu_resource] = provider_label.gpu
|
||||
resources.update(provider_label.extra_resources)
|
||||
return QuotaInformation(instances=1, **resources)
|
||||
|
||||
|
@ -72,6 +72,8 @@ class OpenshiftPodsProviderConfig(OpenshiftProviderConfig):
|
||||
'cpu-limit': int,
|
||||
'memory-limit': int,
|
||||
'storage-limit': int,
|
||||
'gpu': str,
|
||||
'gpu-resource': str,
|
||||
'python-path': str,
|
||||
'shell-type': str,
|
||||
'env': [env_var],
|
||||
|
@ -13,6 +13,7 @@ labels:
|
||||
- name: pod-custom-cpu
|
||||
- name: pod-custom-mem
|
||||
- name: pod-custom-storage
|
||||
- name: pod-custom-gpu
|
||||
|
||||
providers:
|
||||
- name: kubespray
|
||||
@ -35,3 +36,7 @@ providers:
|
||||
- name: pod-custom-storage
|
||||
type: pod
|
||||
storage: 20
|
||||
- name: pod-custom-gpu
|
||||
type: pod
|
||||
gpu-resource: gpu-vendor.example/example-gpu
|
||||
gpu: 0.5
|
||||
|
@ -13,6 +13,7 @@ labels:
|
||||
- name: pod-custom-cpu
|
||||
- name: pod-custom-mem
|
||||
- name: pod-custom-storage
|
||||
- name: pod-custom-gpu
|
||||
|
||||
providers:
|
||||
- name: openshift
|
||||
@ -35,3 +36,7 @@ providers:
|
||||
- name: pod-custom-storage
|
||||
type: pod
|
||||
storage: 20
|
||||
- name: pod-custom-gpu
|
||||
type: pod
|
||||
gpu-resource: gpu-vendor.example/example-gpu
|
||||
gpu: 0.5
|
||||
|
@ -262,6 +262,7 @@ class TestDriverKubernetes(tests.DBTestCase):
|
||||
req.node_types.append('pod-custom-cpu')
|
||||
req.node_types.append('pod-custom-mem')
|
||||
req.node_types.append('pod-custom-storage')
|
||||
req.node_types.append('pod-custom-gpu')
|
||||
self.zk.storeNodeRequest(req)
|
||||
|
||||
self.log.debug("Waiting for request %s", req.id)
|
||||
@ -273,6 +274,7 @@ class TestDriverKubernetes(tests.DBTestCase):
|
||||
node_cust_cpu = self.zk.getNode(req.nodes[1])
|
||||
node_cust_mem = self.zk.getNode(req.nodes[2])
|
||||
node_cust_storage = self.zk.getNode(req.nodes[3])
|
||||
node_cust_gpu = self.zk.getNode(req.nodes[4])
|
||||
|
||||
resources_default = {
|
||||
'instances': 1,
|
||||
@ -298,12 +300,20 @@ class TestDriverKubernetes(tests.DBTestCase):
|
||||
'ram': 1024,
|
||||
'ephemeral-storage': 20,
|
||||
}
|
||||
resources_cust_gpu = {
|
||||
'instances': 1,
|
||||
'cores': 2,
|
||||
'ram': 1024,
|
||||
'ephemeral-storage': 10,
|
||||
'gpu-vendor.example/example-gpu': 0.5
|
||||
}
|
||||
|
||||
self.assertDictEqual(resources_default, node_default.resources)
|
||||
self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources)
|
||||
self.assertDictEqual(resources_cust_mem, node_cust_mem.resources)
|
||||
self.assertDictEqual(resources_cust_storage,
|
||||
node_cust_storage.resources)
|
||||
self.assertDictEqual(resources_cust_gpu, node_cust_gpu.resources)
|
||||
|
||||
ns, pod = self.fake_k8s_client._pod_requests[0]
|
||||
self.assertEqual(pod['spec']['containers'][0]['resources'], {
|
||||
@ -361,7 +371,26 @@ class TestDriverKubernetes(tests.DBTestCase):
|
||||
},
|
||||
})
|
||||
|
||||
for node in (node_default, node_cust_cpu, node_cust_mem):
|
||||
ns, pod = self.fake_k8s_client._pod_requests[4]
|
||||
self.assertEqual(pod['spec']['containers'][0]['resources'], {
|
||||
'limits': {
|
||||
'cpu': 2,
|
||||
'ephemeral-storage': '10M',
|
||||
'memory': '1024Mi',
|
||||
'gpu-vendor.example/example-gpu': '0.50'
|
||||
},
|
||||
'requests': {
|
||||
'cpu': 2,
|
||||
'ephemeral-storage': '10M',
|
||||
'memory': '1024Mi',
|
||||
'gpu-vendor.example/example-gpu': '0.50'
|
||||
},
|
||||
})
|
||||
|
||||
for node in (node_default,
|
||||
node_cust_cpu,
|
||||
node_cust_mem,
|
||||
node_cust_gpu):
|
||||
node.state = zk.DELETING
|
||||
self.zk.storeNode(node)
|
||||
self.waitForNodeDeletion(node)
|
||||
|
@ -275,6 +275,7 @@ class TestDriverOpenshift(tests.DBTestCase):
|
||||
req.node_types.append('pod-custom-cpu')
|
||||
req.node_types.append('pod-custom-mem')
|
||||
req.node_types.append('pod-custom-storage')
|
||||
req.node_types.append('pod-custom-gpu')
|
||||
self.zk.storeNodeRequest(req)
|
||||
|
||||
self.log.debug("Waiting for request %s", req.id)
|
||||
@ -286,6 +287,7 @@ class TestDriverOpenshift(tests.DBTestCase):
|
||||
node_cust_cpu = self.zk.getNode(req.nodes[1])
|
||||
node_cust_mem = self.zk.getNode(req.nodes[2])
|
||||
node_cust_storage = self.zk.getNode(req.nodes[3])
|
||||
node_cust_gpu = self.zk.getNode(req.nodes[4])
|
||||
|
||||
resources_default = {
|
||||
'instances': 1,
|
||||
@ -311,12 +313,20 @@ class TestDriverOpenshift(tests.DBTestCase):
|
||||
'ram': 1024,
|
||||
'ephemeral-storage': 20,
|
||||
}
|
||||
resources_cust_gpu = {
|
||||
'instances': 1,
|
||||
'cores': 2,
|
||||
'ram': 1024,
|
||||
'ephemeral-storage': 10,
|
||||
'gpu-vendor.example/example-gpu': 0.5
|
||||
}
|
||||
|
||||
self.assertDictEqual(resources_default, node_default.resources)
|
||||
self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources)
|
||||
self.assertDictEqual(resources_cust_mem, node_cust_mem.resources)
|
||||
self.assertDictEqual(resources_cust_storage,
|
||||
node_cust_storage.resources)
|
||||
self.assertDictEqual(resources_cust_gpu, node_cust_gpu.resources)
|
||||
|
||||
ns, pod = self.fake_k8s_client._pod_requests[0]
|
||||
self.assertEqual(pod['spec']['containers'][0]['resources'], {
|
||||
@ -374,7 +384,26 @@ class TestDriverOpenshift(tests.DBTestCase):
|
||||
},
|
||||
})
|
||||
|
||||
for node in (node_default, node_cust_cpu, node_cust_mem):
|
||||
ns, pod = self.fake_k8s_client._pod_requests[4]
|
||||
self.assertEqual(pod['spec']['containers'][0]['resources'], {
|
||||
'limits': {
|
||||
'cpu': 2,
|
||||
'ephemeral-storage': '10M',
|
||||
'memory': '1024Mi',
|
||||
'gpu-vendor.example/example-gpu': '0.50'
|
||||
},
|
||||
'requests': {
|
||||
'cpu': 2,
|
||||
'ephemeral-storage': '10M',
|
||||
'memory': '1024Mi',
|
||||
'gpu-vendor.example/example-gpu': '0.50'
|
||||
},
|
||||
})
|
||||
|
||||
for node in (node_default,
|
||||
node_cust_cpu,
|
||||
node_cust_mem,
|
||||
node_cust_gpu):
|
||||
node.state = zk.DELETING
|
||||
self.zk.storeNode(node)
|
||||
self.waitForNodeDeletion(node)
|
||||
|
5
releasenotes/notes/pod-gpu-0edcd573dd813244.yaml
Normal file
5
releasenotes/notes/pod-gpu-0edcd573dd813244.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add support for requesting GPU resources
|
||||
in kubernetes and openshift drivers.
|
Loading…
x
Reference in New Issue
Block a user