05bbc77057
This update is to prevent nodes from crashing while powering off during graceful shutdown (or reboot). This improves timing and shutdown of containerd.service. The containerd shutdown script stops all containers via 'crictl stop' with 5 second timeout, followed by stop all pods via 'crictl stopp'. This cleans up lingering /pause sandbox containers. This modifies the arguments to xargs and crictl to let xargs deal with parallelism instead of batching to crictl. crictl appears to do the stop operations serially. The number stop in parallel is engineered to 10. Engineering the number of stop in parallel in relation to shutdown timings under stress load will be addressed in a subsequent update. The engineering TC should align with customer requirements. When testing containerd shutdown under the stress of multiple pods writing to a shared PersistentVolume, even the new parallel shutdown code is not sufficient to complete the shutdown within the default 90-second timeout. Additional changes will be needed to enable clean shutdown under those circumstances. Partial-Bug: 2043069 Test plan: - PASS - build-image, install and boot up on AIO-SX - PASS - perform reboot and verify /var/log/daemon.log has new k8s-container-cleanup.sh logs for 'Stopping all pods' and 'Stopping all containers', and that drbd stops after containerd. - FAIL - verify containerd shutdown works under stress with the new parallel stop pods parameter NPAR=10. The stress load uses ReadWriteMany PVC, and multiple pods, each writing to the shared PVC. Change-Id: Ibfc0a474a40344a629b3f0780449906a9c6b03ba Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
83 lines
2.2 KiB
Bash
Executable File
83 lines
2.2 KiB
Bash
Executable File
#!/bin/bash
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script runs during containerd.service ExecStop.
# It detects whether the systemd state is 'stopping' due to
# shutdown/reboot, and if so stops all running containers before the
# service shuts down.
#
# Running containers are stopped in parallel, several at a time, and
# then all pods are stopped to clean up their sandbox containers.
# The internal implementation of 'crictl stop --timeout <n>'
# sends a SIGTERM to the container, and will use SIGKILL only
# if the timeout is reached.
#

# Script file name with any leading directories removed; LOG and ERROR use
# it as the prefix of their syslog tag.
NAME="${0##*/}"

# Log info message to /var/log/daemon.log
#
# Arguments: $@ - message words, handed to logger(1) unchanged.
# Globals:   NAME (read) - script name used in the syslog tag.
# The tag has the form "<NAME>(<pid>): " so entries can be traced back to
# this script invocation.
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Log error message to /var/log/daemon.log
#
# Arguments: $@ - message words, handed to logger(1) unchanged.
# Globals:   NAME (read) - script name used in the syslog tag.
# Same tag format as LOG; only the syslog priority differs.
function ERROR {
    logger -p daemon.error -t "${NAME}($$): " "$@"
}

# Stop every running container, then every pod, using crictl.
#
# Use crictl to gracefully stop each container. If the specified timeout is
# reached, it forcibly kills the container. There is no need to check the
# return code since there is nothing more we can do, and crictl already
# logs to daemon.log.
#
# Arguments: none
# Outputs:   progress messages through LOG
function do_force_clean {
    # Number to stop in parallel. xargs provides the parallelism: each
    # crictl invocation receives exactly one ID (-n 1) and up to 'npar'
    # invocations run at once (-P).
    local -r npar=10

    # Timeout, in seconds, passed to 'crictl stop' in case the stop doesn't
    # complete.
    local -r stop_timeout=5

    # Stop all containers. 'xargs -r' runs nothing when the list is empty.
    LOG "Stopping all containers."
    crictl ps -q \
        | xargs -P "${npar}" -n 1 -r crictl stop --timeout "${stop_timeout}"
    LOG "Stopping all containers completed."

    # Stop all pods, this will cleanup /pause containers.
    LOG "Stopping all pods."
    crictl pods -q | xargs -P "${npar}" -n 1 -r crictl stopp
    LOG "Stopping all pods completed."
}

# Entry point: choose the action from the first command-line argument.
#   (no argument) - query systemd and stop containers only when the system
#                   state is 'stopping'.
#   force-clean   - stop all containers and pods unconditionally.
#   anything else - print usage to stderr and exit with status 3.
case "$1" in

    "")
        # Bound the query to 10 seconds so a hung systemctl cannot stall
        # this script.
        state=$(timeout 10 systemctl is-system-running)
        RC=$?
        LOG "System state is: ${state}, RC = ${RC}."
        case "${RC}" in
            124)
                # systemctl hung (124 is the status timeout reports when
                # the time limit is hit). The script still exits 0 here.
                ERROR "systemctl timed out. System state unknown."
                exit 0
                ;;

            1)
                # 1 - initializing, starting, degraded, maintenance, stopping
                if [ "${state}" = "stopping" ]; then
                    do_force_clean
                fi
                ;;
        esac
        ;;
    force-clean)
        do_force_clean
        ;;
    *)
        echo "usage: $0 { force-clean }" >&2
        exit 3
        ;;
esac

exit 0