systemd: handle running container without systemd unit
During MariaDB bootstrap there is a phase in which the first MariaDB container runs in Galera bootstrap mode; once the check that WSREP is synced succeeds, we restart the container. The bootstrap container is named mariadb and runs with docker_restart_policy: "no", whereas the restarted container should run under systemd. Before this patch, the code created a systemd unit, but that unit was initially stopped, so stopping it always reported success, and the still-running container would then be killed with SIGKILL on removal (which obviously breaks MariaDB). This patch also improves docker/systemd stops by waiting until the unit/container has really stopped, and adds a CI check that fails the job when containers are killed with signal 9. Closes-Bug: #2029613 Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c
This commit is contained in:
parent
01a911b27d
commit
1497ab2ab3
@ -446,11 +446,10 @@ class DockerWorker(ContainerWorker):
|
|||||||
msg="No such container: {} to stop".format(name))
|
msg="No such container: {} to stop".format(name))
|
||||||
elif not container['Status'].startswith('Exited '):
|
elif not container['Status'].startswith('Exited '):
|
||||||
self.changed = True
|
self.changed = True
|
||||||
if self.params.get('restart_policy') != 'no':
|
if not self.systemd.check_unit_file():
|
||||||
self.systemd.create_unit_file()
|
|
||||||
self.systemd.stop()
|
|
||||||
else:
|
|
||||||
self.dc.stop(name, timeout=graceful_timeout)
|
self.dc.stop(name, timeout=graceful_timeout)
|
||||||
|
else:
|
||||||
|
self.systemd.stop()
|
||||||
|
|
||||||
def stop_and_remove_container(self):
|
def stop_and_remove_container(self):
|
||||||
container = self.check_container()
|
container = self.check_container()
|
||||||
|
@ -106,11 +106,16 @@ class SystemdWorker(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
return self.perform_action(
|
if self.perform_action(
|
||||||
'StopUnit',
|
'StopUnit',
|
||||||
self.container_dict['service_name'],
|
self.container_dict['service_name'],
|
||||||
self.job_mode
|
self.job_mode
|
||||||
|
):
|
||||||
|
return self.wait_for_unit(
|
||||||
|
self.container_dict['restart_timeout'],
|
||||||
|
state='dead'
|
||||||
)
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
def reload(self):
|
def reload(self):
|
||||||
return self.perform_action(
|
return self.perform_action(
|
||||||
@ -190,12 +195,12 @@ class SystemdWorker(object):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def wait_for_unit(self, timeout):
|
def wait_for_unit(self, timeout, state='running'):
|
||||||
delay = 5
|
delay = 5
|
||||||
elapsed = 0
|
elapsed = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
if self.get_unit_state() == 'running':
|
if self.get_unit_state() == state:
|
||||||
return True
|
return True
|
||||||
elif elapsed > timeout:
|
elif elapsed > timeout:
|
||||||
return False
|
return False
|
||||||
|
@ -22,6 +22,10 @@ function check_fluentd_log_file_for_level {
|
|||||||
sudo egrep "\[$2\]:" $1
|
sudo egrep "\[$2\]:" $1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function check_docker_log_file_for_sigkill {
|
||||||
|
sudo journalctl --no-pager -u ${CONTAINER_ENGINE}.service | grep "signal 9"
|
||||||
|
}
|
||||||
|
|
||||||
function filter_out_expected_critical {
|
function filter_out_expected_critical {
|
||||||
# $1: file
|
# $1: file
|
||||||
# Filter out expected critical log messages that we do not want to fail the
|
# Filter out expected critical log messages that we do not want to fail the
|
||||||
@ -79,6 +83,10 @@ if check_fluentd_log_file_for_level $fluentd_log_file error >/dev/null; then
|
|||||||
echo >> $fluentd_error_summary_file
|
echo >> $fluentd_error_summary_file
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if check_docker_log_file_for_sigkill >/dev/null; then
|
||||||
|
any_critical=1
|
||||||
|
echo "(critical) Found containers killed using signal 9 (SIGKILL) in docker logs."
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $any_critical -eq 1 ]]; then
|
if [[ $any_critical -eq 1 ]]; then
|
||||||
echo "Found critical log messages - failing job."
|
echo "Found critical log messages - failing job."
|
||||||
|
@ -536,6 +536,7 @@ class TestContainer(base.BaseTestCase):
|
|||||||
'action': 'stop_container',
|
'action': 'stop_container',
|
||||||
'restart_policy': 'no'})
|
'restart_policy': 'no'})
|
||||||
self.dw.dc.containers.return_value = self.fake_data['containers']
|
self.dw.dc.containers.return_value = self.fake_data['containers']
|
||||||
|
self.dw.systemd.check_unit_file.return_value = False
|
||||||
self.dw.stop_container()
|
self.dw.stop_container()
|
||||||
|
|
||||||
self.assertTrue(self.dw.changed)
|
self.assertTrue(self.dw.changed)
|
||||||
|
Loading…
Reference in New Issue
Block a user