systemd: handle running container without systemd unit

MariaDB bootstrap has a phase in which the first MariaDB container runs with Galera bootstrap; once the check that WSREP is synced succeeds, we restart the container. The bootstrap container is named mariadb and runs with docker_restart_policy: "no", whereas the restarted container should run under systemd. Before this patch, the code created a systemd unit, but that unit was initially stopped, so stopping it was always a success, and the container would then be killed with SIGKILL on removal (which obviously breaks MariaDB). This patch stops the container through systemd only when a unit file already exists and otherwise stops it directly through docker. It also improves docker/systemd stops by waiting for the unit/container to really stop, and makes CI fail when containers are killed with signal 9. Closes-Bug: #2029613 Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c
2023-08-03 19:54:31 +00:00 · 2023-08-03 19:54:31 +00:00 · 1497ab2ab3
commit 1497ab2ab3
parent 01a911b27d
4 changed files with 21 additions and 8 deletions
--- a/ansible/module_utils/kolla_docker_worker.py
+++ b/ansible/module_utils/kolla_docker_worker.py
@ -446,11 +446,10 @@ class DockerWorker(ContainerWorker):
                    msg="No such container: {} to stop".format(name))
        elif not container['Status'].startswith('Exited '):
            self.changed = True
-            if self.params.get('restart_policy') != 'no':
-                self.systemd.create_unit_file()
-                self.systemd.stop()
-            else:
+            if not self.systemd.check_unit_file():
                self.dc.stop(name, timeout=graceful_timeout)
+            else:
+                self.systemd.stop()

    def stop_and_remove_container(self):
        container = self.check_container()
--- a/ansible/module_utils/kolla_systemd_worker.py
+++ b/ansible/module_utils/kolla_systemd_worker.py
@ -106,11 +106,16 @@ class SystemdWorker(object):
        return False

    def stop(self):
-        return self.perform_action(
+        if self.perform_action(
            'StopUnit',
            self.container_dict['service_name'],
            self.job_mode
-        )
+        ):
+            return self.wait_for_unit(
+                self.container_dict['restart_timeout'],
+                state='dead'
+            )
+        return False

    def reload(self):
        return self.perform_action(
@ -190,12 +195,12 @@ class SystemdWorker(object):

        return None

-    def wait_for_unit(self, timeout):
+    def wait_for_unit(self, timeout, state='running'):
        delay = 5
        elapsed = 0

        while True:
-            if self.get_unit_state() == 'running':
+            if self.get_unit_state() == state:
                return True
            elif elapsed > timeout:
                return False
--- a/tests/check-logs.sh
+++ b/tests/check-logs.sh
@ -22,6 +22,10 @@ function check_fluentd_log_file_for_level {
    sudo egrep "\[$2\]:" $1
 }

+function check_docker_log_file_for_sigkill {
+    sudo journalctl --no-pager -u ${CONTAINER_ENGINE}.service | grep "signal 9"
+}
+
 function filter_out_expected_critical {
    # $1: file
    # Filter out expected critical log messages that we do not want to fail the
@ -79,6 +83,10 @@ if check_fluentd_log_file_for_level $fluentd_log_file error >/dev/null; then
    echo >> $fluentd_error_summary_file
 fi

+if check_docker_log_file_for_sigkill >/dev/null; then
+    any_critical=1
+    echo "(critical) Found containers killed using signal 9 (SIGKILL) in docker logs."
+fi

 if [[ $any_critical -eq 1 ]]; then
    echo "Found critical log messages - failing job."
--- a/tests/kolla_docker_tests/test_docker_worker.py
+++ b/tests/kolla_docker_tests/test_docker_worker.py
@ -536,6 +536,7 @@ class TestContainer(base.BaseTestCase):
                                    'action': 'stop_container',
                                    'restart_policy': 'no'})
        self.dw.dc.containers.return_value = self.fake_data['containers']
+        self.dw.systemd.check_unit_file.return_value = False
        self.dw.stop_container()

        self.assertTrue(self.dw.changed)