Merge "Default limits for k8s labels and quota support"

This commit is contained in:
Zuul 2022-05-31 09:19:59 +00:00 committed by Gerrit Code Review
commit 6416b14838
15 changed files with 366 additions and 18 deletions

View File

@ -96,6 +96,45 @@ Selecting the kubernetes driver adds the following options to the
A dictionary of key-value pairs that will be stored with the node data A dictionary of key-value pairs that will be stored with the node data
in ZooKeeper. The keys and values can be any arbitrary string. in ZooKeeper. The keys and values can be any arbitrary string.
.. attr:: max-cores
:type: int
Maximum number of cores usable from this pool. This can be used
to limit usage of the kubernetes backend. If not defined nodepool can
use all cores up to the limit of the backend.
.. attr:: max-servers
:type: int
Maximum number of pods spawnable from this pool. This can
be used to limit the number of pods. If not defined
nodepool can create as many servers as the kubernetes backend allows.
.. attr:: max-ram
:type: int
Maximum ram usable from this pool. This can be used to limit
the amount of ram allocated by nodepool. If not defined
nodepool can use as much ram as the kubernetes backend allows.
.. attr:: default-label-cpu
:type: int
Only used by the
:value:`providers.[kubernetes].pools.labels.type.pod` label type;
specifies a default value for
:attr:`providers.[kubernetes].pools.labels.cpu` for all labels of
this pool that do not set their own value.
.. attr:: default-label-memory
:type: int
Only used by the
:value:`providers.[kubernetes].pools.labels.type.pod` label type;
specifies a default value for
:attr:`providers.[kubernetes].pools.labels.memory` for all labels of
this pool that do not set their own value.
.. attr:: labels .. attr:: labels
:type: list :type: list

View File

@ -37,6 +37,10 @@ class KubernetesPool(ConfigPool):
def load(self, pool_config, full_config): def load(self, pool_config, full_config):
super().load(pool_config) super().load(pool_config)
self.name = pool_config['name'] self.name = pool_config['name']
self.max_cores = pool_config.get('max-cores')
self.max_ram = pool_config.get('max-ram')
self.default_label_cpu = pool_config.get('default-label-cpu')
self.default_label_memory = pool_config.get('default-label-memory')
self.labels = {} self.labels = {}
for label in pool_config.get('labels', []): for label in pool_config.get('labels', []):
pl = KubernetesLabel() pl = KubernetesLabel()
@ -46,8 +50,8 @@ class KubernetesPool(ConfigPool):
pl.image_pull = label.get('image-pull', 'IfNotPresent') pl.image_pull = label.get('image-pull', 'IfNotPresent')
pl.python_path = label.get('python-path', 'auto') pl.python_path = label.get('python-path', 'auto')
pl.shell_type = label.get('shell-type') pl.shell_type = label.get('shell-type')
pl.cpu = label.get('cpu') pl.cpu = label.get('cpu', self.default_label_cpu)
pl.memory = label.get('memory') pl.memory = label.get('memory', self.default_label_memory)
pl.env = label.get('env', []) pl.env = label.get('env', [])
pl.node_selector = label.get('node-selector') pl.node_selector = label.get('node-selector')
pl.pool = self pl.pool = self
@ -101,6 +105,10 @@ class KubernetesProviderConfig(ProviderConfig):
pool.update({ pool.update({
v.Required('name'): str, v.Required('name'): str,
v.Required('labels'): [k8s_label], v.Required('labels'): [k8s_label],
v.Optional('max-cores'): int,
v.Optional('max-ram'): int,
v.Optional('default-label-cpu'): int,
v.Optional('default-label-memory'): int,
}) })
provider = { provider = {

View File

@ -46,6 +46,10 @@ class K8SLauncher(NodeLauncher):
else: else:
self.node.connection_type = "kubectl" self.node.connection_type = "kubectl"
self.node.interface_ip = resource['pod'] self.node.interface_ip = resource['pod']
pool = self.handler.provider.pools.get(self.node.pool)
resources = self.handler.manager.quotaNeededByLabel(
self.node.type[0], pool)
self.node.resources = resources.get_resources()
self.zk.storeNode(self.node) self.zk.storeNode(self.node)
self.log.info("Resource %s is ready" % resource['name']) self.log.info("Resource %s is ready" % resource['name'])

View File

@ -288,7 +288,15 @@ class KubernetesProvider(Provider, QuotaSupport):
pod_body = { pod_body = {
'apiVersion': 'v1', 'apiVersion': 'v1',
'kind': 'Pod', 'kind': 'Pod',
'metadata': {'name': label.name}, 'metadata': {
'name': label.name,
'labels': {
'nodepool_node_id': node.id,
'nodepool_provider_name': self.provider.name,
'nodepool_pool_name': pool,
'nodepool_node_label': label.name,
}
},
'spec': spec_body, 'spec': spec_body,
'restartPolicy': 'Never', 'restartPolicy': 'Never',
} }
@ -323,8 +331,13 @@ class KubernetesProvider(Provider, QuotaSupport):
default=math.inf) default=math.inf)
def quotaNeededByLabel(self, ntype, pool): def quotaNeededByLabel(self, ntype, pool):
# TODO: return real quota information about a label provider_label = pool.labels[ntype]
return QuotaInformation(cores=1, instances=1, ram=1, default=1) resources = {}
if provider_label.cpu:
resources["cores"] = provider_label.cpu
if provider_label.memory:
resources["ram"] = provider_label.memory
return QuotaInformation(instances=1, default=1, **resources)
def unmanagedQuotaUsed(self): def unmanagedQuotaUsed(self):
# TODO: return real quota information about quota # TODO: return real quota information about quota

View File

@ -227,8 +227,8 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
if check_tenant_quota and not self._hasTenantQuota(req, pm): if check_tenant_quota and not self._hasTenantQuota(req, pm):
# Defer request for it to be handled and fulfilled at a later # Defer request for it to be handled and fulfilled at a later
# run. # run.
log.debug( log.debug("Deferring request %s because it would "
"Deferring request because it would exceed tenant quota") "exceed tenant quota", req)
continue continue
log.debug("Locking request") log.debug("Locking request")
@ -326,9 +326,10 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
**self.nodepool.config.tenant_resource_limits[tenant_name]) **self.nodepool.config.tenant_resource_limits[tenant_name])
tenant_quota.subtract(used_quota) tenant_quota.subtract(used_quota)
log.debug("Current tenant quota: %s", tenant_quota) log.debug("Current tenant quota for %s: %s", tenant_name, tenant_quota)
tenant_quota.subtract(needed_quota) tenant_quota.subtract(needed_quota)
log.debug("Predicted remaining tenant quota: %s", tenant_quota) log.debug("Predicted remaining tenant quota for %s: %s",
tenant_name, tenant_quota)
return tenant_quota.non_negative() return tenant_quota.non_negative()
def _getUsedQuotaForTenant(self, tenant_name): def _getUsedQuotaForTenant(self, tenant_name):

View File

@ -0,0 +1,32 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: pod-default
- name: pod-custom-cpu
- name: pod-custom-mem
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
default-label-cpu: 2
default-label-memory: 1024
labels:
- name: pod-default
type: pod
- name: pod-custom-cpu
type: pod
cpu: 4
- name: pod-custom-mem
type: pod
memory: 2048

View File

@ -0,0 +1,25 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
max-cores: 4
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28
cpu: 2

View File

@ -0,0 +1,25 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
max-ram: 2048
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28
memory: 1024

View File

@ -0,0 +1,24 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
max-servers: 2
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28

View File

@ -0,0 +1,28 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-cores: 4
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28
cpu: 2

View File

@ -0,0 +1,28 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-ram: 2048
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28
memory: 1024

View File

@ -0,0 +1,27 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-servers: 2
labels:
- name: pod-fedora
providers:
- name: kubespray
driver: kubernetes
context: admin-cluster.local
pools:
- name: main
labels:
- name: pod-fedora
type: pod
image: docker.io/fedora:28

View File

@ -22,7 +22,6 @@ providers:
context: admin-cluster.local context: admin-cluster.local
pools: pools:
- name: main - name: main
max-servers: 2
node-attributes: node-attributes:
key1: value1 key1: value1
key2: value2 key2: value2

View File

@ -156,15 +156,83 @@ class TestDriverKubernetes(tests.DBTestCase):
self.waitForNodeDeletion(node) self.waitForNodeDeletion(node)
def test_kubernetes_max_servers(self): def test_kubernetes_default_label_resources(self):
configfile = self.setup_config('kubernetes.yaml') configfile = self.setup_config('kubernetes-default-limits.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('pod-default')
req.node_types.append('pod-custom-cpu')
req.node_types.append('pod-custom-mem')
self.zk.storeNodeRequest(req)
self.log.debug("Waiting for request %s", req.id)
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
self.assertNotEqual(req.nodes, [])
node_default = self.zk.getNode(req.nodes[0])
node_cust_cpu = self.zk.getNode(req.nodes[1])
node_cust_mem = self.zk.getNode(req.nodes[2])
resources_default = {
'instances': 1,
'cores': 2,
'ram': 1024,
}
resources_cust_cpu = {
'instances': 1,
'cores': 4,
'ram': 1024,
}
resources_cust_mem = {
'instances': 1,
'cores': 2,
'ram': 2048,
}
self.assertDictEqual(resources_default, node_default.resources)
self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources)
self.assertDictEqual(resources_cust_mem, node_cust_mem.resources)
for node in (node_default, node_cust_cpu, node_cust_mem):
node.state = zk.DELETING
self.zk.storeNode(node)
self.waitForNodeDeletion(node)
def test_kubernetes_pool_quota_servers(self):
self._test_kubernetes_quota('kubernetes-pool-quota-servers.yaml')
def test_kubernetes_pool_quota_cores(self):
self._test_kubernetes_quota('kubernetes-pool-quota-cores.yaml')
def test_kubernetes_pool_quota_ram(self):
self._test_kubernetes_quota('kubernetes-pool-quota-ram.yaml')
def test_kubernetes_tenant_quota_servers(self):
self._test_kubernetes_quota(
'kubernetes-tenant-quota-servers.yaml', pause=False)
def test_kubernetes_tenant_quota_cores(self):
self._test_kubernetes_quota(
'kubernetes-tenant-quota-cores.yaml', pause=False)
def test_kubernetes_tenant_quota_ram(self):
self._test_kubernetes_quota(
'kubernetes-tenant-quota-ram.yaml', pause=False)
def _test_kubernetes_quota(self, config, pause=True):
configfile = self.setup_config(config)
pool = self.useNodepool(configfile, watermark_sleep=1) pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start() pool.start()
# Start two pods to hit max-server limit # Start two pods to hit max-server limit
reqs = [] reqs = []
for x in [1, 2]: for _ in [1, 2]:
req = zk.NodeRequest() req = zk.NodeRequest()
req.state = zk.REQUESTED req.state = zk.REQUESTED
req.tenant_name = 'tenant-1'
req.node_types.append('pod-fedora') req.node_types.append('pod-fedora')
self.zk.storeNodeRequest(req) self.zk.storeNodeRequest(req)
reqs.append(req) reqs.append(req)
@ -179,13 +247,19 @@ class TestDriverKubernetes(tests.DBTestCase):
# Now request a third pod that will hit the limit # Now request a third pod that will hit the limit
max_req = zk.NodeRequest() max_req = zk.NodeRequest()
max_req.state = zk.REQUESTED max_req.state = zk.REQUESTED
max_req.tenant_name = 'tenant-1'
max_req.node_types.append('pod-fedora') max_req.node_types.append('pod-fedora')
self.zk.storeNodeRequest(max_req) self.zk.storeNodeRequest(max_req)
# if at pool quota, the handler will get paused
# but not if at tenant quota
if pause:
# The previous request should pause the handler # The previous request should pause the handler
pool_worker = pool.getPoolWorkers('kubespray') pool_worker = pool.getPoolWorkers('kubespray')
while not pool_worker[0].paused_handler: while not pool_worker[0].paused_handler:
time.sleep(0.1) time.sleep(0.1)
else:
self.waitForNodeRequest(max_req, (zk.REQUESTED,))
# Delete the earlier two pods freeing space for the third. # Delete the earlier two pods freeing space for the third.
for req in fulfilled_reqs: for req in fulfilled_reqs:
@ -195,5 +269,5 @@ class TestDriverKubernetes(tests.DBTestCase):
self.waitForNodeDeletion(node) self.waitForNodeDeletion(node)
# We should unpause and fulfill this now # We should unpause and fulfill this now
req = self.waitForNodeRequest(max_req) req = self.waitForNodeRequest(max_req, (zk.FULFILLED,))
self.assertEqual(req.state, zk.FULFILLED) self.assertEqual(req.state, zk.FULFILLED)

View File

@ -0,0 +1,21 @@
---
features:
- |
Config options for kubernetes providers were added to define default limits
for cpu and memory for pod-type labels.
* :attr:`providers.[kubernetes].pools.default-label-cpu`
* :attr:`providers.[kubernetes].pools.default-label-memory`
These values will apply to all pod-type labels within the same pool that do
not override these limits. This makes it possible to enforce resource limits
on pod labels and thereby to account for pool and tenant quotas in terms
of cpu and memory consumption. New config options for kubernetes pools
therefore also include
* :attr:`providers.[kubernetes].pools.max-cores`
* :attr:`providers.[kubernetes].pools.max-ram`
The existing tenant quota settings apply accordingly. Note that cpu and
memory quotas still cannot be considered for labels that do not specify
any limits, i.e. neither a pool default nor a label-specific limit is set.