Steven Webster 5d1a26b89d Implement CNI cache file cleanup for stale files
It has been observed on systems running for months to years that
CNI cache files (representing attributes of the network attachment
definitions of pods) can accumulate in large numbers in the
/var/lib/cni/results/ and /var/lib/cni/multus/ directories.

The cache files in /var/lib/cni/results/ have a naming signature of:

<type>-<pod id>-<interface name>

The cache files in /var/lib/cni/multus/ have a naming signature
of:

<pod id>
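
For example, with a hypothetical network type, pod id (truncated
here), and interface name:

    cbr0-3ac93f1e...-eth0    (in /var/lib/cni/results/)
    3ac93f1e...              (in /var/lib/cni/multus/)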

Normally these files are cleaned up automatically (I believe this
is the responsibility of containerd); this cleanup has been observed
to happen reliably when a pod is deleted manually.

The issue has been reproduced in the case of a host being manually
rebooted. In this case, the pods are re-created when the host comes
back up, but with a different pod id than was used before.

Most of the time, the cache files from the previous instantiation
of the pod are deleted, but occasionally a few are missed by the
internal garbage collection mechanism.

Once a cache file from the previous instantiation of a pod escapes
garbage collection, it seems to be left as a stale file for all
subsequent reboots.  Over time, this can cause these stale files
to accumulate and take up disk space unnecessarily.

The script will be called once by the k8s-pod-recovery service
on system startup, and then periodically via a cron job installed
by puppet.
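
For illustration only, the periodic invocation could look like the
following cron.d entry (the schedule and install path here are
assumptions; the actual values are managed by puppet):

    # Hypothetical cron.d entry; schedule and path are assumptions
    0 */6 * * * root /usr/sbin/k8s-cni-cache-cleanup -o 1 -d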

The cleanup mechanism analyzes the cache files by name and
compares them with the id(s) of the currently running pods. Any
stale files detected are deleted.
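
A minimal sketch of the idea, not the actual k8s-cni-cache-cleanup
implementation (the use of crictl, the hex-id length, and the pod-id
extraction are assumptions):

    # Ids of currently running pod sandboxes (assumption: the cache
    # file names embed these ids)
    RUNNING=$(crictl pods -q)

    for f in /var/lib/cni/results/* /var/lib/cni/multus/*; do
        [ -f "$f" ] || continue
        # Skip files newer than the 'olderthan' threshold (1 hour)
        [ -n "$(find "$f" -mmin +60)" ] || continue
        # Extract a 64-character hex pod id from the file name
        id=$(basename "$f" | grep -oE '[0-9a-f]{64}') || continue
        # Delete the file if its pod id is not among running pods
        echo "${RUNNING}" | grep -q "${id}" || rm -f "$f"
    done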

Test Plan:

PASS: Verify existing pods do not have their cache files removed
PASS: Verify files younger than the specified 'olderthan' time
      are not removed
PASS: Verify stale cache files for pods that do not exist anymore
      are removed.
PASS: Verify the script does not run if kubelet is not up yet.

Failure Path:

PASS: Verify files not matching the naming signature (pod id
      embedded in file name) are not processed

Regression:

PASS: Verify system install
PASS: Verify feature logging

Partial-Bug: 1947386

Signed-off-by: Steven Webster <steven.webster@windriver.com>
Change-Id: I0ce06646001e52d1cc6d204b924f41d049264b4c

#!/bin/bash
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# chkconfig: 2345 76 25
#
### BEGIN INIT INFO
# Provides: k8s-pod-recovery
# Default-Start: 3 5
# Required-Start:
# Required-Stop:
# Default-Stop: 0 1 2 6
# Short-Description: Service to recover pods after host boot
### END INIT INFO
. /etc/platform/platform.conf
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin
export KUBECONFIG=/etc/kubernetes/admin.conf
CONF_DIR=/etc/k8s-post-recovery.d
SLEEP_DELAY_SEC=15
NAME=$(basename $0)
PIDFILE=/var/run/${NAME}.pid
HOST=$(hostname)

# Log info message to /var/log/daemon.log
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Log error message to /var/log/daemon.log
function ERROR {
    logger -p daemon.error -t "${NAME}($$): " "$@"
}

function _check_for_k8s_config {
    # If this node has not been configured, then there is nothing to recover
    if [ ! -f ${KUBECONFIG} ]; then
        LOG "${KUBECONFIG} does not exist. No pods to recover."
        exit 0
    fi
}

function _check_for_existing_process {
    # Abort if another instantiation is already running
    if [ -e ${PIDFILE} ]; then
        PID=$(cat ${PIDFILE})
        PROCESS=$(cat /proc/${PID}/comm 2>/dev/null)
        if [ -n "${PID}" -a -e /proc/${PID} -a "${PROCESS}" == "${NAME}" ]; then
            ERROR "Aborting, ${PID} already running: ${PIDFILE}."
            exit 1
        else
            OUT=$(rm -v -f ${PIDFILE})
            LOG "${OUT}"
        fi
    fi

    # Create pidfile to indicate the script is running
    echo $$ > ${PIDFILE}
}

function _wait_for_systemd {
    while true; do
        if systemctl is-system-running | grep -q -e running -e degraded; then
            break
        fi
        LOG "Waiting for systemd to finish booting..."
        sleep ${SLEEP_DELAY_SEC}
    done
}

function _do_cni_cache_cleanup {
    # Clean up any stale CNI cache files (not associated with any running pod)
    # that are older than one hour
    LOG "Starting CNI cache cleanup..."
    k8s-cni-cache-cleanup -o 1 -d
    if [[ ${?} -ne 0 ]]; then
        ERROR "Failed to run CNI cache cleanup."
    fi
}

function _wait_for_pod_stabilization {
    local extra_args=$1
    local time_between_polls=$2
    local stable_cycles=$3

    last_count=0
    stability_count=0
    while [[ $stability_count -lt $stable_cycles ]] ; do
        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces $extra_args | grep -v -e Running -e Completed | wc -l)
        if [[ $pods_in_flux -ne $last_count ]]; then
            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
            last_count=$pods_in_flux
            stability_count=0
        else
            LOG "Pod transitions are stable... for $((stability_count*time_between_polls)) seconds."
            stability_count=$((stability_count+1))
        fi
        sleep $time_between_polls
    done
}

function _unknown_pods {
    # $1: actions <recover|verify>

    # Target specific namespaces and pods on this host
    SUPPORTED_NAMESPACES=('armada' 'openstack' 'monitor')

    shopt -s nullglob
    for conf_file in ${CONF_DIR}/*.conf; do
        grep -q '^namespace=' $conf_file || continue
        SUPPORTED_NAMESPACES+=($(grep '^namespace=' $conf_file | awk -F '=' '{print $2}'))
    done

    if [ "$1" == 'recover' ]; then
        # Recovers pods that are: Running/Unknown and Pending/Init:Unknown
        for ns in ${SUPPORTED_NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /Unknown/'{print $1}')
            for pod in $PODS ; do
                LOG "Unknown pods: Recovering: $ns/$pod"
                kubectl delete pods -n $ns $pod --wait=false
            done
        done
    elif [ "$1" == 'verify' ]; then
        for ns in ${SUPPORTED_NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /Unknown/'{print $1}')
            if [ -z "${PODS}" ]; then
                LOG "Unknown pods: None present for namespace: $ns"
            else
                ERROR "Unknown pods: still present for namespace: $ns"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _outofhugepages_pods {
    # $1: actions <recover|verify>

    # Target all namespaces and pods on this host
    NAMESPACES=$(kubectl get ns | tail -n +2 | awk '{ print $1 }')

    if [ "$1" == 'recover' ]; then
        # Recovers pods that are: Running/OutOfhugepages
        for ns in ${NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
            for pod in $PODS ; do
                LOG "OutOfhugepages pods: Recovering: $ns/$pod"
                kubectl delete pods -n $ns $pod --wait=false
            done
        done
    elif [ "$1" == 'verify' ]; then
        for ns in ${NAMESPACES[@]}; do
            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
            if [ -z "${PODS}" ]; then
                LOG "OutOfhugepages pods: None present for namespace: $ns"
            else
                ERROR "OutOfhugepages pods: still present for namespace: $ns"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _node_affinity_pods {
    # $1: actions <recover|verify>
    if [ "$1" == 'recover' ]; then
        PODS=$(kubectl get pods --all-namespaces --field-selector status.phase=Failed,spec.nodeName=${HOST} 2>/dev/null | awk /NodeAffinity/'{print $1"/"$2}')
        for pod in $PODS ; do
            LOG "NodeAffinity pods: Recovering: $pod"
            kubectl delete pods -n ${pod//// } --wait=false
        done
    elif [ "$1" == 'verify' ]; then
        PODS=$(kubectl get pods --all-namespaces --field-selector status.phase=Failed,spec.nodeName=${HOST} 2>/dev/null | awk /NodeAffinity/'{print $1"/"$2}')
        if [ -z "${PODS}" ]; then
            LOG "NodeAffinity pods: None present."
        else
            ERROR "NodeAffinity pods: still present"
        fi
    else
        ERROR "Unknown action: $1"
    fi
}

function _labeled_pods {
    # $1: actions <recover|verify>
    if [ "$1" == 'recover' ]; then
        POLLING_INTERVAL=5
        STABILITY_COUNT=6
        _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')

        # Don't have to restart device-plugin if no labeled pods are present.
        # System may not be configured for SRIOV.
        if [ ! -z "${PODS}" ]; then
            LOG "Waiting for SRIOV device plugin pod to become available"

            # Check if device-plugin is ready, but do not wait
            kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s

            # If device plugin is not ready, restart it and wait
            if [ "$?" -ne 0 ]; then
                kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
                kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
                if [ "$?" -ne 0 ]; then
                    ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
                fi
            fi
        fi

        # Delete pods with the restart-on-reboot=true label
        for pod in $PODS; do
            LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
            kubectl delete pods -n ${pod//// } --wait=false
        done
    elif [ "$1" == 'verify' ]; then
        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
        for pod in $PODS; do
            LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
            STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
            if [[ "${STATUS}" != "Running" ]]; then
                ERROR "$pod: not recovered: $STATUS"
            else
                LOG "$pod: recovered"
            fi
        done
    else
        ERROR "Unknown action: $1"
    fi
}

function _force_reset_pods {
    # $1: actions <recover|verify>

    # Handle resetting openstack libvirt pod as it sometimes is in a Running
    # but unusable state
    if kubectl get namespace openstack > /dev/null 2>&1; then
        # Get the libvirt pods on this host that are Running without all
        # conditions True
        #
        # Conditions:
        #   Initialized       True
        #   Ready             True
        #   ContainersReady   True
        #   PodScheduled      True
        #
        # NAME                                          STATUS    CONDITIONS            NODE
        # libvirt-libvirt-controller-0-937646f6-xst4r   Running   True,True,True,True   controller-0
        #
        CUSTOM_COLUMNS='custom-columns=NAME:.metadata.name,STATUS:status.phase,CONDITIONS:status.conditions[*].status,NODE:spec.nodeName'
        FIELD_SELECTOR="spec.nodeName=${HOST}"
        PODS=$(kubectl get pods -n openstack -l application=libvirt --field-selector ${FIELD_SELECTOR} -o ${CUSTOM_COLUMNS} | grep -v NAME | grep -v 'True,True,True,True' | awk '{print $1}')

        if [ "$1" == 'recover' ]; then
            for pod in $PODS ; do
                LOG "Recovering libvirt pod: $pod"
                kubectl delete pods -n openstack $pod --wait=false
            done
        elif [ "$1" == 'verify' ]; then
            if [ -z "${PODS}" ]; then
                LOG "Openstack libvirt pod on ${HOST} is running."
            else
                ERROR "Openstack libvirt pod on ${HOST} has not been recovered."
            fi
        else
            ERROR "Unknown action: $1"
        fi
    fi
}

function _examine_pods {
    # $1: actions <recover|verify>

    # Manage labeled pods first
    _labeled_pods $1

    # Wait for pod transitions to stop
    _wait_for_pod_stabilization "" $SLEEP_DELAY_SEC 6

    # Check for recovery actions
    _unknown_pods $1
    _node_affinity_pods $1
    _force_reset_pods $1
    _outofhugepages_pods $1
}

function start {
    _check_for_k8s_config
    _check_for_existing_process

    LOG "Starting."

    _wait_for_systemd
    _examine_pods 'recover'
    _examine_pods 'verify'
    _do_cni_cache_cleanup
}

function stop {
    LOG "Stopping."
}

function status {
    :
}

function reset {
    :
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart|force-reload|reload)
        stop
        start
        ;;
    status)
        status
        ;;
    reset)
        reset
        ;;
    *)
        echo "Usage: $0 {start|stop|force-reload|restart|reload|status|reset}"
        exit 1
        ;;
esac

exit 0