zuul-operator/zuul_operator/zuul.py
James E. Blair eff9f360f7 Use kopf operator framework
This switches from the ansible/dhall operator framework to kopf,
an operator framework written in pure Python.  This allows us to:

* Build the operator application as a Python app.
* Build the operator image using the opendev python builder images.
* Run the operator as a Python CLI program "zuul-operator".
* Write procedural Python code to handle operator tasks (such as
  creating new nodepool launchers when providers are added).
* Use Jinja for templating config files and k8s resource files
  (direct pythonic manipulation of resources is an option too).

The new CR nearly matches the existing one, with some minor differences.

Some missing features and documentation are added in the commits
immediately following; they should be reviewed and merged as a unit.

Also, fx waiting for scheduler to settle in functional test since
we changed this log line in Zuul.

Change-Id: Ib37b67e3444b7cd44692d48eee77775ee9049e9f

Change-Id: I70ec31ecd8fe264118215944022b2e7b513dced9
2021-07-20 13:16:07 -07:00

341 lines
12 KiB
Python

# Copyright 2021 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import copy
import base64
import hashlib
import jinja2
import pykube
import yaml
from . import objects
from . import utils
from . import certmanager
from . import pxc
from . import zookeeper
class Zuul:
def __init__(self, namespace, name, logger, spec):
self.api = pykube.HTTPClient(pykube.KubeConfig.from_env())
self.namespace = namespace
self.name = name
self.log = logger
self.spec = copy.deepcopy(dict(spec))
self.zuul_conf_sha = None
db_secret = spec.get('database', {}).get('secretName')
if db_secret:
self.db_secret = db_secret
self.db_key = spec.get('database', {}).get('key', 'dburi')
self.manage_db = False
else:
self.db_secret = 'zuul-db'
self.db_key = 'dburi'
self.manage_db = True
self.nodepool_secret = spec.get('launcher', {}).get('config',{}).\
get('secretName')
zk_str = spec.get('zookeeper', {}).get('connectionString')
zk_tls = spec.get('zookeeper', {}).get('secretName')
if zk_str:
self.zk_str = zk_str
self.zk_tls = zk_tls
self.manage_zk = False
else:
self.manage_zk = True
self.tenant_secret = spec.get('scheduler', {}).\
get('config', {}).get('secretName')
ex = self.spec.setdefault('executor', {})
self.cert_manager = certmanager.CertManager(
self.api, self.namespace, self.log)
self.installing_cert_manager = False
def install_cert_manager(self):
if self.cert_manager.is_installed():
return
self.installing_cert_manager = True
self.cert_manager.install()
def wait_for_cert_manager(self):
if not self.installing_cert_manager:
return
self.log.info("Waiting for Cert-Manager")
self.cert_manager.wait_for_webhook()
def create_cert_manager_ca(self):
self.cert_manager.create_ca()
def install_zk(self):
if not self.manage_zk:
self.log.info("ZK is externally managed")
return
self.zk = zookeeper.ZooKeeper(self.api, self.namespace, self.log)
self.zk.create()
def wait_for_zk(self):
if not self.manage_zk:
return
self.log.info("Waiting for ZK cluster")
self.zk.wait_for_cluster()
# A two-part process for PXC so that this can run while other
# installations are happening.
def install_db(self):
if not self.manage_db:
self.log.info("DB is externally managed")
return
# TODO: get this from spec
small = True
self.log.info("DB is internally managed")
self.pxc = pxc.PXC(self.api, self.namespace, self.log)
if not self.pxc.is_installed():
self.log.info("Installing PXC operator")
self.pxc.create_operator()
self.log.info("Creating PXC cluster")
self.pxc.create_cluster(small)
def wait_for_db(self):
if not self.manage_db:
return
self.log.info("Waiting for PXC cluster")
self.pxc.wait_for_cluster()
dburi = self.get_db_uri()
if not dburi:
self.log.info("Creating database")
self.pxc.create_database()
def get_db_uri(self):
try:
obj = objects.Secret.objects(self.api).\
filter(namespace=self.namespace).\
get(name=self.db_secret)
uri = base64.b64decode(obj.obj['data'][self.db_key]).decode('utf8')
return uri
except pykube.exceptions.ObjectDoesNotExist:
return None
def write_zuul_conf(self):
dburi = self.get_db_uri()
for volume in self.spec.get('jobVolumes', []):
key = f"{volume['context']}_{volume['access']}_paths"
paths = self.spec['executor'].get(key, '')
if paths:
paths += ':'
paths += volume['path']
self.spec['executor'][key] = paths
connections = self.spec['connections']
# Copy in any information from connection secrets
for connection_name, connection in connections.items():
if 'secretName' in connection:
obj = objects.Secret.objects(self.api).\
filter(namespace=self.namespace).\
get(name=connection['secretName'])
for k, v in obj.obj['data'].items():
if k == 'sshkey':
v = f'/etc/zuul/connections/{connection_name}/sshkey'
else:
v = base64.b64decode(v)
connection[k] = v
kw = {'dburi': dburi,
'namespace': self.namespace,
'connections': connections,
'spec': self.spec}
env = jinja2.Environment(
loader=jinja2.PackageLoader('zuul_operator', 'templates'))
tmpl = env.get_template('zuul.conf')
text = tmpl.render(**kw)
# Create a sha of the zuul.conf so that we can set it as an
# annotation on objects which should be recreated when it
# changes.
m = hashlib.sha256()
m.update(text.encode('utf8'))
self.zuul_conf_sha = m.hexdigest()
utils.update_secret(self.api, self.namespace, 'zuul-config',
string_data={'zuul.conf': text})
def write_nodepool_conf(self):
self.nodepool_provider_secrets = {}
# load nodepool config
if not self.nodepool_secret:
self.log.warning("No nodepool config secret found")
try:
obj = objects.Secret.objects(self.api).\
filter(namespace=self.namespace).\
get(name=self.nodepool_secret)
except pykube.exceptions.ObjectDoesNotExist:
self.log.error("Nodepool config secret not found")
return None
# Shard the config so we can create a deployment + secret for
# each provider.
nodepool_yaml = yaml.safe_load(base64.b64decode(obj.obj['data']['nodepool.yaml']))
nodepool_yaml['zookeeper-servers'] = [
{'host': f'zookeeper.{self.namespace}',
'port': 2281},
]
nodepool_yaml['zookeeper-tls'] = {
'cert': '/tls/client/tls.crt',
'key': '/tls/client/tls.key',
'ca': '/tls/client/ca.crt',
}
for provider in nodepool_yaml['providers']:
self.log.info("Configuring provider %s", provider.get('name'))
secret_name = f"nodepool-config-{self.name}-{provider['name']}"
provider_yaml = nodepool_yaml.copy()
provider_yaml['providers'] = [provider]
text = yaml.dump(provider_yaml)
utils.update_secret(self.api, self.namespace, secret_name,
string_data={'nodepool.yaml': text})
self.nodepool_provider_secrets[provider['name']] = secret_name
def create_nodepool(self):
# Create secrets
self.write_nodepool_conf()
# Create providers
for provider_name, secret_name in\
self.nodepool_provider_secrets.items():
kw = {
'zuul_version': '4.1.0',
'instance_name': self.name,
'provider_name': provider_name,
'nodepool_config_secret_name': secret_name,
'external_config': self.spec.get('externalConfig', {}),
}
utils.apply_file(self.api, 'nodepool-launcher.yaml',
namespace=self.namespace, **kw)
# Get current providers
providers = objects.Deployment.objects(self.api).filter(
namespace=self.namespace,
selector={'app.kubernetes.io/instance': self.name,
'app.kubernetes.io/component': 'nodepool-launcher',
'app.kubernetes.io/name': 'nodepool',
'app.kubernetes.io/part-of': 'zuul'})
new_providers = set(self.nodepool_provider_secrets.keys())
old_providers = set([x.labels['operator.zuul-ci.org/nodepool-provider']
for x in providers])
# delete any unecessary provider deployments and secrets
for unused_provider in old_providers - new_providers:
self.log.info("Deleting unused provider %s", unused_provider)
deployment_name = f"nodepool-launcher-{self.name}-{unused_provider}"
secret_name = f"nodepool-config-{self.name}-{unused_provider}"
try:
obj = objects.Deployment.objects(self.api).filter(
namespace=self.namespace).get(deployment_name)
obj.delete()
except pykube.exceptions.ObjectDoesNotExist:
pass
try:
obj = objects.Secret.objects(self.api).filter(
namespace=self.namespace).get(secret_name)
obj.delete()
except pykube.exceptions.ObjectDoesNotExist:
pass
def create_zuul(self):
kw = {
'zuul_conf_sha': self.zuul_conf_sha,
'zuul_version': '4.1.0',
'zuul_web': {
'replicas': 3,
},
'zuul_executor': {
'replicas': 3,
},
'zuul_tenant_secret': self.tenant_secret,
'instance_name': self.name,
'connections': self.spec['connections'],
'executor_ssh_secret': self.spec['executor'].get(
'sshkey', {}).get('secretName'),
'spec': self.spec,
}
utils.apply_file(self.api, 'zuul.yaml', namespace=self.namespace, **kw)
self.create_nodepool()
def smart_reconfigure(self):
self.log.info("Smart reconfigure")
try:
obj = objects.Secret.objects(self.api).\
filter(namespace=self.namespace).\
get(name=self.tenant_secret)
tenant_config = base64.b64decode(
obj.obj['data']['main.yaml'])
except pykube.exceptions.ObjectDoesNotExist:
self.log.error("Tenant config secret not found")
return
m = hashlib.sha256()
m.update(tenant_config)
conf_sha = m.hexdigest()
expected = f"{conf_sha} /etc/zuul/tenant/main.yaml"
for obj in objects.Pod.objects(self.api).filter(
namespace=self.namespace,
selector={'app.kubernetes.io/instance': 'zuul',
'app.kubernetes.io/component': 'zuul-scheduler',
'app.kubernetes.io/name': 'zuul'}):
self.log.info("Waiting for config to update on %s",
obj.name)
delay = 10
retries = 30
timeout = delay * retries
command = [
'/usr/bin/timeout',
str(timeout),
'/bin/sh',
'-c',
f'while !( echo -n "{expected}" | sha256sum -c - ); do sleep {delay}; done'
]
resp = utils.pod_exec(self.namespace, obj.name, command)
self.log.debug("Response: %s", resp)
if '/etc/zuul/tenant/main.yaml: OK' in resp:
self.log.info("Issuing smart-reconfigure on %s", obj.name)
command = [
'zuul-scheduler',
'smart-reconfigure',
]
resp = utils.pod_exec(self.namespace, obj.name, command)
self.log.debug("Response: %s", resp)
else:
self.log.error("Tenant config file never updated on %s",
obj.name)