From 1497ab2ab30475083289cc1fea54464b695fca49 Mon Sep 17 00:00:00 2001 From: Michal Nasiadka Date: Thu, 3 Aug 2023 19:54:31 +0000 Subject: [PATCH] systemd: handle running container without systemd unit MariaDB bootstrap has a phase where the first MariaDB container is running with Galera bootstrap - once the check that WSREP is synced succeeds - we restart the container. The bootstrap container is named mariadb and runs with docker_restart_policy: "no" - the restarted container should be running under systemd. Before this patch the code created a systemd unit, but that unit was initially stopped - so stopping it was always a success - and the container would then be killed with SIGKILL on removal (which obviously breaks MariaDB). This patch also improves docker/systemd stops by waiting until the unit/container has really stopped, and makes CI fail when containers are killed with signal 9. Closes-Bug: #2029613 Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c --- ansible/module_utils/kolla_docker_worker.py | 7 +++---- ansible/module_utils/kolla_systemd_worker.py | 13 +++++++++---- tests/check-logs.sh | 8 ++++++++ tests/kolla_docker_tests/test_docker_worker.py | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ansible/module_utils/kolla_docker_worker.py b/ansible/module_utils/kolla_docker_worker.py index 72400ea202..2741b006d4 100644 --- a/ansible/module_utils/kolla_docker_worker.py +++ b/ansible/module_utils/kolla_docker_worker.py @@ -446,11 +446,10 @@ class DockerWorker(ContainerWorker): msg="No such container: {} to stop".format(name)) elif not container['Status'].startswith('Exited '): self.changed = True - if self.params.get('restart_policy') != 'no': - self.systemd.create_unit_file() - self.systemd.stop() - else: + if not self.systemd.check_unit_file(): self.dc.stop(name, timeout=graceful_timeout) + else: + self.systemd.stop() def stop_and_remove_container(self): container = self.check_container() diff --git a/ansible/module_utils/kolla_systemd_worker.py 
b/ansible/module_utils/kolla_systemd_worker.py index 2d8a59d1cb..72c50b3164 100644 --- a/ansible/module_utils/kolla_systemd_worker.py +++ b/ansible/module_utils/kolla_systemd_worker.py @@ -106,11 +106,16 @@ class SystemdWorker(object): return False def stop(self): - return self.perform_action( + if self.perform_action( 'StopUnit', self.container_dict['service_name'], self.job_mode - ) + ): + return self.wait_for_unit( + self.container_dict['restart_timeout'], + state='dead' + ) + return False def reload(self): return self.perform_action( @@ -190,12 +195,12 @@ class SystemdWorker(object): return None - def wait_for_unit(self, timeout): + def wait_for_unit(self, timeout, state='running'): delay = 5 elapsed = 0 while True: - if self.get_unit_state() == 'running': + if self.get_unit_state() == state: return True elif elapsed > timeout: return False diff --git a/tests/check-logs.sh b/tests/check-logs.sh index 42a8a4541e..2da6b55967 100755 --- a/tests/check-logs.sh +++ b/tests/check-logs.sh @@ -22,6 +22,10 @@ function check_fluentd_log_file_for_level { sudo egrep "\[$2\]:" $1 } +function check_docker_log_file_for_sigkill { + sudo journalctl --no-pager -u ${CONTAINER_ENGINE}.service | grep "signal 9" +} + function filter_out_expected_critical { # $1: file # Filter out expected critical log messages that we do not want to fail the @@ -79,6 +83,10 @@ if check_fluentd_log_file_for_level $fluentd_log_file error >/dev/null; then echo >> $fluentd_error_summary_file fi +if check_docker_log_file_for_sigkill >/dev/null; then + any_critical=1 + echo "(critical) Found containers killed using signal 9 (SIGKILL) in docker logs." +fi if [[ $any_critical -eq 1 ]]; then echo "Found critical log messages - failing job." 
diff --git a/tests/kolla_docker_tests/test_docker_worker.py b/tests/kolla_docker_tests/test_docker_worker.py index 2c5efd716a..d85c2255d8 100644 --- a/tests/kolla_docker_tests/test_docker_worker.py +++ b/tests/kolla_docker_tests/test_docker_worker.py @@ -536,6 +536,7 @@ class TestContainer(base.BaseTestCase): 'action': 'stop_container', 'restart_policy': 'no'}) self.dw.dc.containers.return_value = self.fake_data['containers'] + self.dw.systemd.check_unit_file.return_value = False self.dw.stop_container() self.assertTrue(self.dw.changed)