5d1a26b89d
It has been observed on systems running for months to years that the CNI cache files
(representing attributes of the network attachment definitions of pods) can accumulate
in large numbers in the /var/lib/cni/results/ and /var/lib/cni/multus/ directories.

The cache files in /var/lib/cni/results/ have a naming signature of:
    <type>-<pod id>-<interface name>
while the cache files in /var/lib/cni/multus have a naming signature of:
    <pod id>

Normally these files are cleaned up automatically (I believe this is the responsibility
of containerd), and this has been seen to happen reliably when a pod is manually deleted.
The issue has been reproduced in the case of a host being manually rebooted. In this case
the pods are re-created when the host comes back up, but with a different pod id than was
used before. _Most_ of the time the cache files from the previous instantiation of the pod
are deleted, but occasionally a few are missed by the internal garbage collection mechanism.
Once a cache file from the previous instantiation of a pod escapes garbage collection, it is
left as a stale file across all subsequent reboots. Over time, these stale files accumulate
and take up disk space unnecessarily.

The cleanup script will be called once by the k8s-pod-recovery service on system startup,
and then periodically via a cron job installed by puppet. The cleanup mechanism parses each
cache file name, extracts the embedded pod id, and compares it with the id(s) of the
currently running pods. Any stale files detected are deleted.

Test Plan:
PASS: Verify existing pods do not have their cache files removed
PASS: Verify files younger than the specified 'olderthan' time are not removed
PASS: Verify stale cache files for pods that no longer exist are removed
PASS: Verify the script does not run if kubelet is not up yet

Failure Path:
PASS: Verify files not matching the naming signature (pod id embedded in
      file name) are not processed

Regression:
PASS: Verify system install
PASS: Verify feature logging

Partial-Bug: 1947386

Signed-off-by: Steven Webster <steven.webster@windriver.com>
Change-Id: I0ce06646001e52d1cc6d204b924f41d049264b4c
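For context, the detection logic itself lives in the separate k8s-cni-cache-cleanup script
(invoked below with '-o 1 -d'), which is not part of this file. A minimal sketch of the
approach described above might look like the following; it assumes the pod id embedded in
each cache file name corresponds to the CRI pod sandbox id reported by 'crictl pods -q',
and the hex-id pattern and one-hour threshold are illustrative only, not the actual
implementation:

    #!/bin/bash
    # Sketch only: compare pod ids embedded in CNI cache file names against
    # the set of currently running pod sandbox ids, and delete stale files.

    RUNNING_IDS=$(crictl pods -q)    # sandbox ids of pods currently on this host

    is_running() {
        # $1: pod id extracted from a cache file name
        echo "${RUNNING_IDS}" | grep -q "$1"
    }

    # /var/lib/cni/multus/<pod id>
    for f in /var/lib/cni/multus/*; do
        [ -f "$f" ] || continue
        pod_id=$(basename "$f")
        # Only process names that look like a pod id (long hex string)
        echo "$pod_id" | grep -Eq '^[0-9a-f]{32,}$' || continue
        # Delete only files older than one hour whose pod id is not running
        if [ -n "$(find "$f" -mmin +60)" ] && ! is_running "$pod_id"; then
            rm -f "$f"
        fi
    done

    # /var/lib/cni/results/<type>-<pod id>-<interface name>
    # (assumes the <type> prefix contains no '-' characters)
    for f in /var/lib/cni/results/*; do
        [ -f "$f" ] || continue
        pod_id=$(basename "$f" | awk -F '-' '{print $2}')
        echo "$pod_id" | grep -Eq '^[0-9a-f]{32,}$' || continue
        if [ -n "$(find "$f" -mmin +60)" ] && ! is_running "$pod_id"; then
            rm -f "$f"
        fi
    done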
350 lines
11 KiB
Bash
Executable File
#!/bin/bash
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

#
# chkconfig: 2345 76 25
#
### BEGIN INIT INFO
# Provides:          k8s-pod-recovery
# Default-Start:     3 5
# Required-Start:
# Required-Stop:
# Default-Stop:      0 1 2 6
# Short-Description: Service to recover pods after host boot
### END INIT INFO

. /etc/platform/platform.conf

export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin
export KUBECONFIG=/etc/kubernetes/admin.conf
CONF_DIR=/etc/k8s-post-recovery.d
SLEEP_DELAY_SEC=15

NAME=$(basename $0)
PIDFILE=/var/run/${NAME}.pid
HOST=$(hostname)

# Log info message to /var/log/daemon.log
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Log error message to /var/log/daemon.log
function ERROR {
    logger -p daemon.error -t "${NAME}($$): " "$@"
}

function _check_for_k8s_config {
    # If this node has not been configured, then there is nothing to recover
    if [ ! -f ${KUBECONFIG} ]; then
        LOG "${KUBECONFIG} does not exist. No pods to recover."
        exit 0
    fi
}

function _check_for_existing_process {
    # Abort if another instantiation is already running
    if [ -e ${PIDFILE} ]; then
        PID=$(cat ${PIDFILE})
        PROCESS=$(cat /proc/${PID}/comm 2>/dev/null)
        if [ -n "${PID}" -a -e /proc/${PID} -a "${PROCESS}" == "${NAME}" ]; then
            ERROR "Aborting, ${PID} already running: ${PIDFILE}."
            exit 1
        else
            OUT=$(rm -v -f ${PIDFILE})
            LOG "${OUT}"
        fi
    fi

    # Create pidfile to indicate the script is running
    echo $$ > ${PIDFILE}
}

function _wait_for_systemd {
    while true; do
        if systemctl is-system-running | grep -q -e running -e degraded; then
            break
        fi
        LOG "Waiting for systemd to finish booting..."
        sleep ${SLEEP_DELAY_SEC}
    done
}

function _do_cni_cache_cleanup {
    # Clean up any stale CNI cache files (not associated with any running pod)
    # that are older than one hour
    LOG "Starting CNI cache cleanup..."
    k8s-cni-cache-cleanup -o 1 -d
    if [[ ${?} -ne 0 ]]; then
        ERROR "Failed to run CNI cache cleanup."
    fi
}

function _wait_for_pod_stabilization {

    local extra_args=$1
    local time_between_polls=$2
    local stable_cycles=$3

    last_count=0
    stability_count=0
    while [[ $stability_count -lt $stable_cycles ]] ; do
        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces $extra_args | grep -v -e Running -e Completed | wc -l)
        if [[ $pods_in_flux -ne $last_count ]]; then
            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
            last_count=$pods_in_flux
            stability_count=0
        else
            LOG "Pod transitions have been stable for $((stability_count*time_between_polls)) seconds."
            stability_count=$((stability_count+1))
        fi
        sleep $time_between_polls
    done
}

function _unknown_pods {
    # $1: actions <recover|verify>

    # Target specific namespaces and pods on this host
    SUPPORTED_NAMESPACES=('armada' 'openstack' 'monitor')

    shopt -s nullglob
    for conf_file in ${CONF_DIR}/*.conf; do
        grep -q '^namespace=' $conf_file || continue
        SUPPORTED_NAMESPACES+=($(grep '^namespace=' $conf_file | awk -F '=' '{print $2}'))
    done

    if [ "$1" == 'recover' ]; then
        # Recovers pods that are: Running/Unknown and Pending/Init:Unknown
        for ns in ${SUPPORTED_NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /Unknown/'{print $1}')
            for pod in $PODS ; do
                LOG "Unknown pods: Recovering: $ns/$pod"
                kubectl delete pods -n $ns $pod --wait=false
            done
        done
    elif [ "$1" == 'verify' ]; then
        for ns in ${SUPPORTED_NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /Unknown/'{print $1}')
            if [ -z "${PODS}" ]; then
                LOG "Unknown pods: None present for namespace: $ns"
            else
                ERROR "Unknown pods: still present for namespace: $ns"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _outofhugepages_pods {
    # $1: actions <recover|verify>

    # Target all namespaces and pods on this host
    NAMESPACES=$(kubectl get ns | tail -n +2 | awk '{ print $1 }')

    if [ "$1" == 'recover' ]; then
        # Recovers pods that are: Running/OutOfhugepages
        for ns in ${NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
            for pod in $PODS ; do
                LOG "OutOfhugepages pods: Recovering: $ns/$pod"
                kubectl delete pods -n $ns $pod --wait=false
            done
        done
    elif [ "$1" == 'verify' ]; then
        for ns in ${NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
            if [ -z "${PODS}" ]; then
                LOG "OutOfhugepages pods: None present for namespace: $ns"
            else
                ERROR "OutOfhugepages pods: still present for namespace: $ns"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _node_affinity_pods {
    # $1: actions <recover|verify>

    if [ "$1" == 'recover' ]; then
        PODS=$(kubectl get pods --all-namespaces --field-selector status.phase=Failed,spec.nodeName=${HOST} 2>/dev/null | awk /NodeAffinity/'{print $1"/"$2}')
        for pod in $PODS ; do
            LOG "NodeAffinity pods: Recovering: $pod"
            # ${pod//// } expands "<namespace>/<pod>" into "<namespace> <pod>"
            kubectl delete pods -n ${pod//// } --wait=false
        done
    elif [ "$1" == 'verify' ]; then
        PODS=$(kubectl get pods --all-namespaces --field-selector status.phase=Failed,spec.nodeName=${HOST} 2>/dev/null | awk /NodeAffinity/'{print $1"/"$2}')
        if [ -z "${PODS}" ]; then
            LOG "NodeAffinity pods: None present."
        else
            ERROR "NodeAffinity pods: still present"
        fi
    else
        ERROR "Unknown action: $1"
    fi

}

function _labeled_pods {
    # $1: actions <recover|verify>

    if [ "$1" == 'recover' ]; then
        POLLING_INTERVAL=5
        STABILITY_COUNT=6
        _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT

        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')

        # Don't have to restart device-plugin if no labeled pods are present.
        # System may not be configured for SRIOV.
        if [ ! -z "${PODS}" ]; then
            LOG "Waiting for SRIOV device plugin pod to become available"

            # Check if device-plugin is ready, but do not wait
            kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s

            # If device plugin is not ready, restart it and wait
            if [ "$?" -ne 0 ]; then
                kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
                kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s

                if [ "$?" -ne 0 ]; then
                    ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
                fi
            fi
        fi

        # Delete pods with the restart-on-reboot=true label
        for pod in $PODS; do
            LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
            kubectl delete pods -n ${pod//// } --wait=false
        done
    elif [ "$1" == 'verify' ]; then
        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
        for pod in $PODS; do
            LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
            STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
            if [[ "${STATUS}" != "Running" ]]; then
                ERROR "$pod: not recovered: $STATUS"
            else
                LOG "$pod: recovered"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _force_reset_pods {
    # $1: actions <recover|verify>

    # Handle resetting openstack libvirt pod as it sometimes is in a Running but
    # unusable state
    if kubectl get namespace openstack > /dev/null 2>&1; then

        # Get the libvirt pods on this host that are Running without all
        # conditions True
        #
        # Conditions:
        #   Initialized       True
        #   Ready             True
        #   ContainersReady   True
        #   PodScheduled      True
        #
        # NAME                                          STATUS    CONDITIONS            NODE
        # libvirt-libvirt-controller-0-937646f6-xst4r   Running   True,True,True,True   controller-0
        #
        CUSTOM_COLUMNS='custom-columns=NAME:.metadata.name,STATUS:status.phase,CONDITIONS:status.conditions[*].status,NODE:spec.nodeName'
        FIELD_SELECTOR="spec.nodeName=${HOST}"
        PODS=$(kubectl get pods -n openstack -l application=libvirt --field-selector ${FIELD_SELECTOR} -o ${CUSTOM_COLUMNS} | grep -v NAME | grep -v 'True,True,True,True' | awk '{print $1}')

        if [ "$1" == 'recover' ]; then
            for pod in $PODS ; do
                LOG "Recovering libvirt pod: $pod"
                kubectl delete pods -n openstack $pod --wait=false
            done
        elif [ "$1" == 'verify' ]; then
            if [ -z "${PODS}" ]; then
                LOG "Openstack libvirt pod on ${HOST} is running."
            else
                ERROR "Openstack libvirt pod on ${HOST} has not been recovered."
            fi
        else
            ERROR "Unknown action: $1"
        fi
    fi
}

function _examine_pods {
    # $1: actions <recover|verify>

    # Manage labeled pods first
    _labeled_pods $1

    # Wait for pod transitions to stop
    _wait_for_pod_stabilization "" $SLEEP_DELAY_SEC 6

    # Check for recovery actions
    _unknown_pods $1
    _node_affinity_pods $1
    _force_reset_pods $1
    _outofhugepages_pods $1
}


function start {
    _check_for_k8s_config
    _check_for_existing_process

    LOG "Starting."

    _wait_for_systemd
    _examine_pods 'recover'
    _examine_pods 'verify'
    _do_cni_cache_cleanup
}

function stop {
    LOG "Stopping."
}

function status {
    :
}

function reset {
    :
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart|force-reload|reload)
        stop
        start
        ;;
    status)
        status
        ;;
    reset)
        reset
        ;;
    *)
        echo "Usage: $0 {start|stop|force-reload|restart|reload|status|reset}"
        exit 1
        ;;
esac

exit 0