kolla-ansible/ansible/module_utils/kolla_systemd_worker.py
Michal Nasiadka 1497ab2ab3 systemd: handle running container without systemd unit
MariaDB bootstrap has a phase where the first MariaDB container
runs with Galera bootstrap; once the check that WSREP is synced
succeeds, we restart the container.

The bootstrap container is named mariadb and running with
docker_restart_policy: "no" - the restarted container should be running
in systemd.

Before this patch the code created a systemd unit but it was initially
stopped - so stopping was always a success - and the container would be
killed with SIGKILL on removal (which obviously breaks MariaDB).

This patch also improves docker/systemd stops by waiting for the
unit/container to actually stop, and makes CI fail for containers
that are killed with signal 9.

Closes-Bug: #2029613

Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c
2023-08-17 14:57:39 +00:00

210 lines
5.9 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from string import Template
from time import sleep
import dbus
# systemd unit file skeleton in string.Template syntax. The placeholders it
# expects are: service_name, name, graceful_timeout, restart_policy,
# restart_timeout, restart_retries and restart_duration.
TEMPLATE = (
    '# ${service_name}\n'
    '# autogenerated by Kolla-Ansible\n'
    '[Unit]\n'
    'Description=docker ${service_name}\n'
    'After=docker.service\n'
    'Requires=docker.service\n'
    'StartLimitIntervalSec=${restart_timeout}\n'
    'StartLimitBurst=${restart_retries}\n'
    '[Service]\n'
    'ExecStart=/usr/bin/docker start -a ${name}\n'
    'ExecStop=/usr/bin/docker stop ${name} -t ${graceful_timeout}\n'
    'Restart=${restart_policy}\n'
    'RestartSec=${restart_duration}\n'
    '[Install]\n'
    'WantedBy=multi-user.target\n'
)
class SystemdWorker(object):
    """Manage the systemd unit that wraps a single docker container.

    The unit is called ``kolla-<name>-container.service``; its file lives
    under ``self.sysdir`` and is rendered from the module-level ``TEMPLATE``.
    Every systemd operation is issued through the
    ``org.freedesktop.systemd1.Manager`` D-Bus interface.
    """

    def __init__(self, params):
        """Collect unit settings from ``params`` and connect to systemd.

        :param params: mapping of module parameters. The keys read here are
            ``name``, ``restart_policy``, ``client_timeout``,
            ``restart_retries`` and ``graceful_timeout``.

        When ``name`` is missing or empty the constructor returns early:
        no attribute is set and no D-Bus connection is opened.
        """
        name = params.get('name', None)

        # systemd is not needed
        if not name:
            return

        restart_policy = params.get('restart_policy', 'no')
        # 'unless-stopped' is translated to 'always' before being written
        # into the unit's Restart= setting.
        if restart_policy == 'unless-stopped':
            restart_policy = 'always'

        # NOTE(hinermar): duration * retries should be less than timeout
        # otherwise service will indefinitely try to restart.
        # Also, correct timeout and retries values should probably be
        # checked at the module level inside kolla_docker.py
        restart_timeout = params.get('client_timeout', 120)
        restart_retries = params.get('restart_retries', 10)
        restart_duration = (restart_timeout // restart_retries) - 1

        # container info
        self.container_dict = dict(
            name=name,
            service_name='kolla-' + name + '-container.service',
            engine='docker',
            deps='docker.service',
            graceful_timeout=params.get('graceful_timeout'),
            restart_policy=restart_policy,
            restart_timeout=restart_timeout,
            restart_retries=restart_retries,
            restart_duration=restart_duration
        )

        # systemd
        self.manager = self.get_manager()
        self.job_mode = 'replace'
        self.sysdir = '/etc/systemd/system/'

        # templating
        self.template = Template(TEMPLATE)

    def get_manager(self):
        """Return a D-Bus interface to the systemd manager on the system bus.
        """
        sysbus = dbus.SystemBus()
        systemd1 = sysbus.get_object(
            'org.freedesktop.systemd1',
            '/org/freedesktop/systemd1'
        )
        return dbus.Interface(systemd1, 'org.freedesktop.systemd1.Manager')

    def start(self):
        """Start the unit and wait until it reports the 'running' state.

        :returns: True when the unit reached 'running' in time, False when
            the StartUnit call failed or the wait timed out.
        """
        if self.perform_action(
            'StartUnit',
            self.container_dict['service_name'],
            self.job_mode
        ):
            return self.wait_for_unit(self.container_dict['restart_timeout'])
        return False

    def restart(self):
        """Restart the unit and wait until it reports the 'running' state.

        :returns: True when the unit reached 'running' in time, False when
            the RestartUnit call failed or the wait timed out.
        """
        if self.perform_action(
            'RestartUnit',
            self.container_dict['service_name'],
            self.job_mode
        ):
            return self.wait_for_unit(self.container_dict['restart_timeout'])
        return False

    def stop(self):
        """Stop the unit and wait until it reports the 'dead' state.

        :returns: True when the unit reached 'dead' in time, False when the
            StopUnit call failed or the wait timed out.
        """
        if self.perform_action(
            'StopUnit',
            self.container_dict['service_name'],
            self.job_mode
        ):
            return self.wait_for_unit(
                self.container_dict['restart_timeout'],
                state='dead'
            )
        return False

    def reload(self):
        """Ask systemd to reload its unit files (daemon-reload).

        The Manager ``Reload`` method takes no arguments. Passing the unit
        name and job mode makes the call fail, and because
        ``perform_action`` swallows the error the reload would silently
        never happen.

        :returns: True when the call succeeded, False otherwise.
        """
        return self.perform_action('Reload')

    def enable(self):
        """Enable the unit file so the unit is started at boot.

        ``EnableUnitFiles`` is called with runtime=False and force=True.

        :returns: True when the call succeeded, False otherwise.
        """
        return self.perform_action(
            'EnableUnitFiles',
            [self.container_dict['service_name']],
            False,
            True
        )

    def perform_action(self, function, *args):
        """Call the systemd manager method ``function`` with ``args``.

        :param function: name of the method to look up on ``self.manager``.
        :param args: positional arguments forwarded to that method.
        :returns: True when the call returned normally; False when looking
            up or calling the method raised any ``Exception`` (the
            exception is swallowed).
        """
        try:
            getattr(self.manager, function)(*args)
            return True
        except Exception:
            return False

    def _unit_path(self):
        """Return the full path of this container's unit file."""
        return os.path.join(self.sysdir, self.container_dict['service_name'])

    def check_unit_file(self):
        """Return True when the unit file exists on disk."""
        return os.path.isfile(self._unit_path())

    def check_unit_change(self, new_content=''):
        """Tell whether the unit file on disk differs from ``new_content``.

        :param new_content: desired unit file text; when empty it is
            rendered with ``generate_unit_file``.
        :returns: True when the file is missing or its content differs,
            False when it already matches.
        """
        if not new_content:
            new_content = self.generate_unit_file()

        if self.check_unit_file():
            with open(self._unit_path(), 'r') as f:
                curr_content = f.read()

            # return whether there was change in the unit file
            return curr_content != new_content

        # no unit file yet, so it has to be written
        return True

    def generate_unit_file(self):
        """Render the unit file text from the template and container info.
        """
        return self.template.substitute(self.container_dict)

    def create_unit_file(self):
        """Write the unit file if needed, then reload systemd and enable it.

        :returns: True when the file was (re)written, False when the file
            on disk already had the desired content. The results of the
            reload and enable calls are not reflected in the return value.
        """
        file_content = self.generate_unit_file()

        if self.check_unit_change(file_content):
            with open(self._unit_path(), 'w') as f:
                f.write(file_content)

            self.reload()
            self.enable()
            return True

        return False

    def remove_unit_file(self):
        """Delete the unit file and reload systemd.

        :returns: True when a file was removed, False when there was none.
        """
        if self.check_unit_file():
            os.remove(self._unit_path())
            self.reload()
            return True

        return False

    def get_unit_state(self):
        """Return the state of the unit as reported by ``ListUnits``.

        Each entry is matched on its first field (the unit name) and the
        field at index 4 is returned (the sub-state in systemd's
        ``ListUnits`` reply, e.g. 'running' or 'dead').

        :returns: the state as ``str``, or None when systemd does not list
            the unit.
        """
        unit_list = self.manager.ListUnits()

        for service in unit_list:
            if str(service[0]) == self.container_dict['service_name']:
                return str(service[4])

        return None

    def wait_for_unit(self, timeout, state='running'):
        """Poll the unit every 5 seconds until it reaches ``state``.

        :param timeout: number of seconds after which polling gives up.
            The state is checked before the elapsed time is compared with
            ``elapsed > timeout``, so polling can continue for up to one
            5-second interval past ``timeout``.
        :param state: unit state to wait for.
        :returns: True once the unit reports ``state``, False on timeout.
        """
        delay = 5
        elapsed = 0

        while True:
            if self.get_unit_state() == state:
                return True
            elif elapsed > timeout:
                return False
            else:
                sleep(delay)
                elapsed += delay