kolla-ansible/ansible/module_utils/kolla_docker_worker.py
Michal Nasiadka 1497ab2ab3 systemd: handle running container without systemd unit
MariaDB bootstrap has a phase where the first MariaDB container
is running with Galera bootstrap - after a check that WSREP
is synced is successful - we restart the container.

The bootstrap container is named mariadb and running with
docker_restart_policy: "no" - the restarted container should be running
in systemd.

Before this patch the code created a systemd unit but it was initially
stopped - so stopping was always a success - and the container would be
killed with SIGKILL on removal (which obviously breaks MariaDB).

This patch also improves docker/systemd stops by waiting for real
unit/container stop and adds failing CI for containers that are
killed with signal 9.

Closes-Bug: #2029613

Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c
2023-08-17 14:57:39 +00:00

525 lines
19 KiB
Python

# Copyright 2015 Sam Yaple
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# FIXME(yoctozepto): restart_policy is *not* checked in the container
import docker
import json
import os
from ansible.module_utils.kolla_container_worker import COMPARE_CONFIG_CMD
from ansible.module_utils.kolla_container_worker import ContainerWorker
from distutils.version import StrictVersion
def get_docker_client():
return docker.APIClient
class DockerWorker(ContainerWorker):
def __init__(self, module):
super().__init__(module)
options = {
'version': self.params.get('api_version'),
'timeout': self.params.get('client_timeout'),
}
self.dc = get_docker_client()(**options)
self._cgroupns_mode_supported = (
StrictVersion(self.dc._version) >= StrictVersion('1.41'))
self._dimensions_kernel_memory_removed = (
StrictVersion(self.dc._version) >= StrictVersion('1.42'))
if self._dimensions_kernel_memory_removed:
self.dimension_map.pop('kernel_memory', None)
def generate_tls(self):
tls = {'verify': self.params.get('tls_verify')}
tls_cert = self.params.get('tls_cert'),
tls_key = self.params.get('tls_key'),
tls_cacert = self.params.get('tls_cacert')
if tls['verify']:
if tls_cert:
self.check_file(tls_cert)
self.check_file(tls_key)
tls['client_cert'] = (tls_cert, tls_key)
if tls_cacert:
self.check_file(tls_cacert)
tls['verify'] = tls_cacert
return docker.tls.TLSConfig(**tls)
def check_file(self, path):
if not os.path.isfile(path):
self.module.fail_json(
failed=True,
msg='There is no file at "{}"'.format(path)
)
if not os.access(path, os.R_OK):
self.module.fail_json(
failed=True,
msg='Permission denied for file at "{}"'.format(path)
)
def check_image(self):
find_image = ':'.join(self.parse_image())
for image in self.dc.images():
repo_tags = image.get('RepoTags')
if not repo_tags:
continue
for image_name in repo_tags:
if image_name == find_image:
return image
def check_volume(self):
for vol in self.dc.volumes()['Volumes'] or list():
if vol['Name'] == self.params.get('name'):
return vol
def check_container(self):
find_name = '/{}'.format(self.params.get('name'))
for cont in self.dc.containers(all=True):
if find_name in cont['Names']:
return cont
def get_container_info(self):
container = self.check_container()
if not container:
return None
return self.dc.inspect_container(self.params.get('name'))
def compare_pid_mode(self, container_info):
new_pid_mode = self.params.get('pid_mode')
current_pid_mode = container_info['HostConfig'].get('PidMode')
if not current_pid_mode:
current_pid_mode = None
if new_pid_mode != current_pid_mode:
return True
def compare_image(self, container_info=None):
container_info = container_info or self.get_container_info()
parse_repository_tag = docker.utils.parse_repository_tag
if not container_info:
return True
new_image = self.check_image()
current_image = container_info['Image']
if not new_image:
return True
if new_image['Id'] != current_image:
return True
# NOTE(Jeffrey4l) when new image and the current image have
# the same id, but the tag name different.
elif (parse_repository_tag(container_info['Config']['Image']) !=
parse_repository_tag(self.params.get('image'))):
return True
def compare_volumes(self, container_info):
volumes, binds = self.generate_volumes()
current_vols = container_info['Config'].get('Volumes')
current_binds = container_info['HostConfig'].get('Binds')
if not volumes:
volumes = list()
if not current_vols:
current_vols = list()
if not current_binds:
current_binds = list()
if set(volumes).symmetric_difference(set(current_vols)):
return True
new_binds = list()
if binds:
for k, v in binds.items():
new_binds.append("{}:{}:{}".format(k, v['bind'], v['mode']))
if set(new_binds).symmetric_difference(set(current_binds)):
return True
def compare_config(self):
try:
job = self.dc.exec_create(
self.params['name'],
COMPARE_CONFIG_CMD,
user='root',
)
output = self.dc.exec_start(job)
exec_inspect = self.dc.exec_inspect(job)
except docker.errors.APIError as e:
# NOTE(yoctozepto): If we have a client error, then the container
# cannot be used for config check (e.g., is restarting, or stopped
# in the mean time) - assume config is stale = return True.
# Else, propagate the server error back.
if e.is_client_error():
return True
else:
raise
# Exit codes:
# 0: not changed
# 1: changed
# 137: abrupt exit -> changed
# else: error
if exec_inspect['ExitCode'] == 0:
return False
elif exec_inspect['ExitCode'] == 1:
return True
elif exec_inspect['ExitCode'] == 137:
# NOTE(yoctozepto): This is Docker's command exit due to container
# exit. It means the container is unstable so we are better off
# marking it as requiring a restart due to config update.
return True
else:
raise Exception('Failed to compare container configuration: '
'ExitCode: %s Message: %s' %
(exec_inspect['ExitCode'], output))
def get_image_id(self):
full_image = self.params.get('image')
image = self.dc.images(name=full_image, quiet=True)
return image[0] if len(image) == 1 else None
def pull_image(self):
if self.params.get('auth_username'):
self.dc.login(
username=self.params.get('auth_username'),
password=self.params.get('auth_password'),
registry=self.params.get('auth_registry'),
email=self.params.get('auth_email')
)
image, tag = self.parse_image()
old_image_id = self.get_image_id()
statuses = [
json.loads(line.strip().decode('utf-8')) for line in self.dc.pull(
repository=image, tag=tag, stream=True
)
]
for status in reversed(statuses):
if 'error' in status:
if status['error'].endswith('not found'):
self.module.fail_json(
msg="The requested image does not exist: {}:{}".format(
image, tag),
failed=True
)
else:
self.module.fail_json(
msg="Unknown error message: {}".format(
status['error']),
failed=True
)
new_image_id = self.get_image_id()
self.changed = old_image_id != new_image_id
def remove_container(self):
self.changed |= self.systemd.remove_unit_file()
if self.check_container():
self.changed = True
# NOTE(jeffrey4l): in some case, docker failed to remove container
# filesystem and raise error. But the container info is
# disappeared already. If this happens, assume the container is
# removed.
try:
self.dc.remove_container(
container=self.params.get('name'),
force=True
)
self.systemd.remove_unit_file()
except docker.errors.APIError:
if self.check_container():
raise
def parse_dimensions(self, dimensions):
# When the data object contains types such as
# docker.types.Ulimit, Ansible will fail when these are
# returned via exit_json or fail_json. HostConfig is derived from dict,
# but its constructor requires additional arguments.
# to avoid that, here do copy the dimensions and return a new one.
dimensions = dimensions.copy()
supported = {'cpu_period', 'cpu_quota', 'cpu_shares',
'cpuset_cpus', 'cpuset_mems', 'mem_limit',
'mem_reservation', 'memswap_limit',
'kernel_memory', 'blkio_weight', 'ulimits'}
unsupported = set(dimensions) - supported
if unsupported:
self.module.exit_json(failed=True,
msg=repr("Unsupported dimensions"),
unsupported_dimensions=unsupported)
ulimits = dimensions.get('ulimits')
if ulimits:
dimensions['ulimits'] = self.build_ulimits(ulimits)
return dimensions
def build_ulimits(self, ulimits):
ulimits_opt = []
for key, value in ulimits.items():
soft = value.get('soft')
hard = value.get('hard')
ulimits_opt.append(docker.types.Ulimit(name=key,
soft=soft,
hard=hard))
return ulimits_opt
def build_host_config(self, binds):
options = {
'network_mode': 'host',
'ipc_mode': self.params.get('ipc_mode'),
'cap_add': self.params.get('cap_add'),
'security_opt': self.params.get('security_opt'),
'pid_mode': self.params.get('pid_mode'),
'privileged': self.params.get('privileged'),
'tmpfs': self.generate_tmpfs(),
'volumes_from': self.params.get('volumes_from')
}
dimensions = self.params.get('dimensions')
if dimensions:
dimensions = self.parse_dimensions(dimensions)
options.update(dimensions)
if binds:
options['binds'] = binds
host_config = self.dc.create_host_config(**options)
if self._cgroupns_mode_supported:
# NOTE(yoctozepto): python-docker does not support CgroupnsMode
# natively so we stuff it in manually.
cgroupns_mode = self.params.get('cgroupns_mode')
if cgroupns_mode is not None:
host_config['CgroupnsMode'] = cgroupns_mode
# detached containers should only log to journald
if self.params.get('detach'):
options['log_config'] = docker.types.LogConfig(
type=docker.types.LogConfig.types.NONE)
return host_config
def _inject_env_var(self, environment_info):
newenv = {
'KOLLA_SERVICE_NAME': self.params.get('name').replace('_', '-')
}
environment_info.update(newenv)
return environment_info
def _format_env_vars(self):
env = self._inject_env_var(self.params.get('environment'))
return {k: "" if env[k] is None else env[k] for k in env}
def build_container_options(self):
volumes, binds = self.generate_volumes()
options = {
'command': self.params.get('command'),
'detach': self.params.get('detach'),
'environment': self._format_env_vars(),
'host_config': self.build_host_config(binds),
'labels': self.params.get('labels'),
'image': self.params.get('image'),
'name': self.params.get('name'),
'volumes': volumes,
'tty': self.params.get('tty'),
}
healthcheck = self.parse_healthcheck(self.params.get('healthcheck'))
if healthcheck:
options.update(healthcheck)
return options
def create_container(self):
self.changed = True
options = self.build_container_options()
self.dc.create_container(**options)
if self.params.get('restart_policy') != 'no':
self.changed |= self.systemd.create_unit_file()
def recreate_or_restart_container(self):
self.changed = True
container = self.check_container()
# get config_strategy from env
environment = self.params.get('environment')
config_strategy = environment.get('KOLLA_CONFIG_STRATEGY')
if not container:
self.start_container()
return
# If config_strategy is COPY_ONCE or container's parameters are
# changed, try to start a new one.
if config_strategy == 'COPY_ONCE' or self.check_container_differs():
# NOTE(mgoddard): Pull the image if necessary before stopping the
# container, otherwise a failure to pull the image will leave the
# container stopped.
if not self.check_image():
self.pull_image()
self.stop_container()
self.remove_container()
self.start_container()
elif config_strategy == 'COPY_ALWAYS':
self.restart_container()
def start_container(self):
if not self.check_image():
self.pull_image()
container = self.check_container()
if container and self.check_container_differs():
self.stop_container()
self.remove_container()
container = self.check_container()
if not container:
self.create_container()
container = self.check_container()
if not container['Status'].startswith('Up '):
self.changed = True
if self.params.get('restart_policy') == 'no':
self.dc.start(container=self.params.get('name'))
else:
self.systemd.create_unit_file()
if not self.systemd.start():
self.module.fail_json(
changed=True,
msg="Container timed out",
**self.check_container())
# We do not want to detach so we wait around for container to exit
if not self.params.get('detach'):
rc = self.dc.wait(self.params.get('name'))
# NOTE(jeffrey4l): since python docker package 3.0, wait return a
# dict all the time.
if isinstance(rc, dict):
rc = rc['StatusCode']
# Include container's return code, standard output and error in the
# result.
self.result['rc'] = rc
self.result['stdout'] = self.dc.logs(self.params.get('name'),
stdout=True, stderr=False)
self.result['stderr'] = self.dc.logs(self.params.get('name'),
stdout=False, stderr=True)
if self.params.get('remove_on_exit'):
self.stop_container()
self.remove_container()
if rc != 0:
self.module.fail_json(
changed=True,
msg="Container exited with non-zero return code %s" % rc,
**self.result
)
def stop_container(self):
name = self.params.get('name')
graceful_timeout = self.params.get('graceful_timeout')
if not graceful_timeout:
graceful_timeout = 10
container = self.check_container()
if not container:
ignore_missing = self.params.get('ignore_missing')
if not ignore_missing:
self.module.fail_json(
msg="No such container: {} to stop".format(name))
elif not container['Status'].startswith('Exited '):
self.changed = True
if not self.systemd.check_unit_file():
self.dc.stop(name, timeout=graceful_timeout)
else:
self.systemd.stop()
def stop_and_remove_container(self):
container = self.check_container()
if container:
self.stop_container()
self.remove_container()
def restart_container(self):
name = self.params.get('name')
graceful_timeout = self.params.get('graceful_timeout')
if not graceful_timeout:
graceful_timeout = 10
info = self.get_container_info()
if not info:
self.module.fail_json(
msg="No such container: {}".format(name))
else:
self.changed = True
if self.params.get('restart_policy') != 'no':
self.systemd.create_unit_file()
if not self.systemd.restart():
self.module.fail_json(
changed=True,
msg="Container timed out",
**self.check_container())
else:
self.dc.stop(name, timeout=graceful_timeout)
self.dc.start(name)
def create_volume(self):
if not self.check_volume():
self.changed = True
self.dc.create_volume(name=self.params.get('name'), driver='local')
def remove_volume(self):
if self.check_volume():
self.changed = True
try:
self.dc.remove_volume(name=self.params.get('name'))
except docker.errors.APIError as e:
if e.response.status_code == 409:
self.module.fail_json(
failed=True,
msg="Volume named '{}' is currently in-use".format(
self.params.get('name')
)
)
raise
def remove_image(self):
if self.check_image():
self.changed = True
try:
self.dc.remove_image(image=self.params.get('image'))
except docker.errors.APIError as e:
if e.response.status_code == 409:
self.module.fail_json(
failed=True,
msg="Image '{}' is currently in-use".format(
self.params.get('image')
)
)
elif e.response.status_code == 500:
self.module.fail_json(
failed=True,
msg="Server error"
)
raise
def ensure_image(self):
if not self.check_image():
self.pull_image()