kolla-ansible/tests/check-logs.sh
Michal Nasiadka 1497ab2ab3 systemd: handle running container without systemd unit
MariaDB bootstrap has a phase where the first MariaDB container
runs with Galera bootstrap; once the check that WSREP is synced
succeeds, we restart the container.

The bootstrap container is named mariadb and running with
docker_restart_policy: "no" - the restarted container should be running
in systemd.

Before this patch the code created a systemd unit, but the unit was
initially stopped, so stopping it always succeeded and the container was
then killed with SIGKILL on removal (which obviously breaks MariaDB).

This patch also improves docker/systemd stops by waiting for real
unit/container stop and adds failing CI for containers that are
killed with signal 9.

Closes-Bug: #2029613

Change-Id: I0a03e509ce228a50e081fcab44d2b4831251190c
2023-08-17 14:57:39 +00:00

95 lines
3.0 KiB
Bash
Executable File

#!/bin/bash
# Scan log files for CRITICAL, ERROR or WARNING messages.

# Stop at the first failing command, and make a pipeline fail as soon as any
# of its stages fails.
set -o errexit -o pipefail

# Enable unbuffered output for Ansible in Jenkins.
PYTHONUNBUFFERED=1
export PYTHONUNBUFFERED
function check_openstack_log_file_for_level {
    # Print the lines of a log file that carry the given log level.
    #
    # $1: file
    # $2: log level
    #
    # Output:  the matching lines, on stdout.
    # Returns: 0 if at least one line is printed, non-zero otherwise
    #          (nothing matched, or the file could not be read).
    #
    # The level is matched surrounded by single spaces. The file is read via
    # sudo. The path is quoted so that names containing whitespace or glob
    # characters are passed through intact, and "grep -E" is used instead of
    # the obsolescent "egrep" wrapper.
    #
    # Filter out false positives from logged config options.
    sudo grep -E -- " $2 " "$1" |
        grep -E -v "(logging_exception_prefix|rate_limit_except_level)"
}
function check_fluentd_log_file_for_level {
    # Print the lines of a fluentd log file tagged with the given log level.
    #
    # $1: file
    # $2: log level
    #
    # Output:  the matching lines, on stdout.
    # Returns: grep's exit status - 0 if at least one line matched, non-zero
    #          otherwise (nothing matched, or the file could not be read).
    #
    # A line matches when it contains "[<level>]:". The file is read via
    # sudo. The path is quoted so that names containing whitespace or glob
    # characters are passed through intact, and "grep -E" is used instead of
    # the obsolescent "egrep" wrapper.
    sudo grep -E -- "\[$2\]:" "$1"
}
function check_docker_log_file_for_sigkill {
    # Print journal lines of the container engine's systemd unit that mention
    # "signal 9".
    #
    # Globals: CONTAINER_ENGINE (read) - unit name without the ".service"
    #          suffix.
    # Output:  the matching journal lines, on stdout.
    # Returns: the pipeline's exit status; non-zero when no line matches.
    #
    # The unit name is quoted so that it always reaches journalctl as a single
    # argument, and the search text is matched as a fixed string.
    sudo journalctl --no-pager -u "${CONTAINER_ENGINE}.service" |
        grep -F -- "signal 9"
}
function filter_out_expected_critical {
    # $1: file
    # Reads log lines on stdin and writes to stdout only those that are not
    # expected critical messages, i.e. ones we do not want to fail the job.
    # The exit status is grep's: 0 only if at least one line is left over.
    local -a expected

    # Sometimes we see this during upgrades of Keystone.
    # Usually in Placement but also in Neutron and Nova.
    # Especially in AIO.
    expected=(-e "Failed to fetch token data from identity server")

    if [[ $1 == */neutron-server.log ]]; then
        # Sometimes we see this during shutdown (upgrade).
        # See: https://bugs.launchpad.net/neutron/+bug/1863579
        expected+=(-e "WSREP has not yet prepared node for application use")
    fi

    grep -v "${expected[@]}"
}
# Set to 1 as soon as anything is found that must fail the job.
any_critical=0

for level in CRITICAL ERROR WARNING; do
    all_file=/tmp/logs/kolla/all-${level}.log
    # remove the file to avoid collecting duplicates (upgrade, post)
    rm -f -- "$all_file"
    any_matched=0
    echo "Checking for $level log messages"
    # Paths are read NUL-delimited so that file names containing whitespace
    # or glob characters stay intact. They arrive on fd 3 so that commands in
    # the loop body keep their own stdin.
    while IFS= read -r -d '' -u 3 f; do
        # Grep each file once and reuse the captured matches below.
        if matches=$(check_openstack_log_file_for_level "$f" "$level"); then
            any_matched=1
            if [[ $level == CRITICAL ]]; then
                # Only CRITICAL messages that are not on the list of expected
                # ones fail the job.
                if filter_out_expected_critical "$f" <<<"$matches" >/dev/null; then
                    any_critical=1
                fi
            fi
            # Summary entry: file name, its matching lines, a blank separator.
            {
                printf '%s\n' "$f"
                printf '%s\n' "$matches"
                echo
            } >> "$all_file"
        fi
    done 3< <(sudo find /var/log/kolla/ -type f -print0)
    if [[ $any_matched -eq 1 ]]; then
        echo "Found some $level log messages. Matches in $all_file"
    fi
done

# check fluentd errors (we consider them critical)
fluentd_log_file=/var/log/kolla/fluentd/fluentd.log
fluentd_error_summary_file=/tmp/logs/kolla/fluentd-error.log
if fluentd_errors=$(check_fluentd_log_file_for_level "$fluentd_log_file" error); then
    any_critical=1
    echo "(critical) Found some error log messages in fluentd logs. Matches in $fluentd_error_summary_file"
    # Overwrite the summary with the matches followed by a blank line.
    {
        printf '%s\n' "$fluentd_errors"
        echo
    } > "$fluentd_error_summary_file"
fi

# Containers killed with signal 9 are considered critical as well.
if check_docker_log_file_for_sigkill >/dev/null; then
    any_critical=1
    echo "(critical) Found containers killed using signal 9 (SIGKILL) in docker logs."
fi

if [[ $any_critical -eq 1 ]]; then
    echo "Found critical log messages - failing job."
    exit 1
fi