Merge "Implement CNI cache file cleanup for stale files"

This commit is contained in:
Zuul 2021-11-04 17:22:43 +00:00 committed by Gerrit Code Review
commit 4bc0c7dd3d
6 changed files with 259 additions and 1 deletions

View File

@ -170,6 +170,7 @@ kubernetes-1.21.3-kubeadm
kubernetes-1.21.3-client
containerd
k8s-pod-recovery
k8s-cni-cache-cleanup
containernetworking-plugins
# resource-agents

View File

@ -66,6 +66,7 @@ kubernetes/chartmuseum
kubernetes/armada-helm-toolkit
kubernetes/armada
kubernetes/k8s-pod-recovery
kubernetes/k8s-cni-cache-cleanup
kubernetes/plugins/isolcpus-device-plugin
python/python-kubernetes
grub/grubby

View File

@ -0,0 +1,4 @@
SRC_DIR="."
COPY_LIST="$FILES_BASE/*"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -0,0 +1,214 @@
#!/bin/bash
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Script name; used by LOG/ERROR as part of the syslog tag.
NAME=$(basename "$0")

# Directories whose CNI cache files are examined.
RESULTSDIR="/var/lib/cni/results"
MULTUSDIR="/var/lib/cni/multus"

# Keep only the PodID and pod.name lines of the verbose container listing.
# NOTE(review): a crictl failure is silenced here and leaves both arrays
# empty, which makes every cache file look orphaned -- confirm that callers
# are fine with that.
PODS=$(crictl ps -v 2> /dev/null | grep -w -E 'PodID|pod.name')

# Parallel arrays: PODNAMES[i] is looked up with the index of PODIDS[i].
# mapfile reads one entry per line, so values are never word-split or
# glob-expanded by the shell; lines without a value are dropped.
mapfile -t PODIDS < <(printf '%s\n' "${PODS}" | grep PodID | awk 'NF >= 2 {print $2}')
mapfile -t PODNAMES < <(printf '%s\n' "${PODS}" | grep -w pod.name | awk 'NF >= 3 {print $3}')

# Minimum time kubelet must have been up before any cleanup is attempted.
KUBELET_UPTIME_MINUTES=5
# Length of a pod id as it appears in cache file names.
POD_ID_LENGTH=64
# Defaults for the command line options: report only (-d not given) and
# only consider files that are at least this many hours old (-o).
DELETE="no"
OLDERTHANHOURS=1
# Send an info-level message to the syslog daemon facility (expected to
# land in /var/log/daemon.log). Every argument is passed on to logger.
function LOG {
    local tag="${NAME}($$): "
    logger -p daemon.info -t "${tag}" "$@"
}
# Send an error-level message to the syslog daemon facility (expected to
# land in /var/log/daemon.log). Every argument is passed on to logger.
function ERROR {
    local tag="${NAME}($$): "
    logger -p daemon.error -t "${tag}" "$@"
}
# Determine the age of a file in whole hours.
# Arguments: $1 - path of the file
# Outputs:   hours elapsed since the file's last status change (stat %Z),
#            rounded down; nothing if the file cannot be examined
# Returns:   0 on success, 1 if stat fails (e.g. the file vanished)
function file_age {
    local file=${1}
    local -r seconds_per_hour=3600
    local now changed
    # Bail out before the arithmetic: an empty operand would be a syntax
    # error there.
    changed=$(stat -c %Z -- "${file}") || return 1
    now=$(date +%s)
    echo $(( (now - changed) / seconds_per_hour ))
}
# Determine the pod id associated with a results CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the 64 character pod id embedded in the file name, or an
#            empty line when the name does not follow the expected pattern
function results_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file
    # A valid CNI cache results file looks like:
    # type-pod_id-interface_name
    local -r results_regex='^.*-([0-9a-zA-Z]{64})-[0-9a-zA-Z]+$'
    file=$(basename -- "${path}")
    if [[ ${file} =~ ${results_regex} ]]; then
        ret=${BASH_REMATCH[1]}
    fi
    printf '%s\n' "${ret}"
}
# Determine the pod id associated with a multus CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the pod id (the file name itself when it is exactly 64
#            alphanumeric characters), or an empty line otherwise
function multus_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file
    # A valid CNI cache multus file is simply the pod id
    local -r multus_regex='^([0-9a-zA-Z]{64})$'
    file=$(basename -- "${path}")
    if [[ ${file} =~ ${multus_regex} ]]; then
        ret=${BASH_REMATCH[1]}
    fi
    printf '%s\n' "${ret}"
}
# Determine the pod id associated with a CNI cache file.
# The directory the file lives in selects the naming scheme to apply.
# Globals:   RESULTSDIR, MULTUSDIR (read)
# Arguments: $1 - path of the cache file
# Outputs:   the pod id, or an empty line when the file is in neither
#            cache directory or its name is not recognized
function cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local dir
    dir=$(dirname -- "${path}")
    if [[ "${dir}" == "${RESULTSDIR}" ]]; then
        ret=$(results_cni_cache_file_to_pod_id "${path}")
    elif [[ "${dir}" == "${MULTUSDIR}" ]]; then
        ret=$(multus_cni_cache_file_to_pod_id "${path}")
    fi
    printf '%s\n' "${ret}"
}
# Determine the original pod name from a CNI cache file (if any).
# Arguments: $1 - path of the cache file
# Outputs:   the value that follows K8S_POD_NAME"," in the file, or
#            "unknown" when the file is unreadable or has no such entry
function cache_file_to_pod_name {
    local path=${1}
    local name=""
    if [[ -f "${path}" && -r "${path}" ]]; then
        # -n ... p prints only lines where the substitution matched, so a
        # file that merely mentions K8S_POD_NAMESPACE yields nothing;
        # head keeps the result to a single line.
        name=$(sed -n 's/.*K8S_POD_NAME","\([^"]*\).*/\1/p' -- "${path}" | head -n 1)
    fi
    printf '%s\n' "${name:-unknown}"
}
# Given a CNI cache id, return the existing pod name (if any).
# Globals:   PODIDS, PODNAMES (read) - parallel arrays, matched by index
# Arguments: $1 - pod id taken from a cache file name
# Outputs:   the name stored at the index of the matching pod id (the last
#            match wins), or an empty line when no pod id matches
function get_pod {
    local cacheid=${1}
    local ret=""
    local idx
    # The loop variable is local so that callers' variables are untouched.
    for idx in "${!PODIDS[@]}"; do
        if [[ "${PODIDS[idx]}" == "${cacheid}" ]]; then
            ret=${PODNAMES[idx]}
        fi
    done
    printf '%s\n' "${ret}"
}
# Determine if the CNI cache file is old enough to process.
# Globals:   OLDERTHANHOURS (read) - minimum age in hours
# Arguments: $1 - age of the file in hours
# Outputs:   the age when it is at least OLDERTHANHOURS, otherwise an
#            empty line; callers treat non-empty output as "old enough"
function check_cache_file_age {
    local age=${1}
    local ret=""
    local -r number='^[0-9]+$'
    # Fail safe: when either value is missing or not a plain number the
    # file is reported as NOT old enough. Comparing unvalidated strings
    # numerically would evaluate them as arithmetic (an empty or
    # alphabetic threshold counts as 0), making every file qualify.
    if [[ ${age} =~ ${number} && ${OLDERTHANHOURS} =~ ${number} ]]; then
        # 10# forces base 10 so that values such as "08" are not read as
        # (invalid) octal.
        if (( 10#${age} >= 10#${OLDERTHANHOURS} )); then
            ret=${age}
        fi
    fi
    printf '%s\n' "${ret}"
}
# Determine how long kubelet has been up in minutes.
# The start time is taken from the WatchdogTimestamp property that
# systemctl reports for the kubelet unit.
# Outputs:   whole minutes elapsed since that timestamp; 0 when the
#            timestamp is missing or cannot be parsed (an error is logged)
function kubelet_uptime {
    local -r seconds_per_minute=60
    local timestamp started now
    local minutes=0
    timestamp=$(systemctl show kubelet --property WatchdogTimestamp | awk -F= '{print $2}')
    if [[ -z ${timestamp} ]]; then
        ERROR "Failed to get kubelet uptime."
    elif ! started=$(date --date="${timestamp}" +%s 2> /dev/null); then
        # Without this guard an unparsable value would leave an empty
        # operand in the arithmetic below and no output at all.
        ERROR "Failed to get kubelet uptime: cannot parse timestamp '${timestamp}'."
    else
        now=$(date +%s)
        minutes=$(( (now - started) / seconds_per_minute ))
    fi
    echo "${minutes}"
}
# Wait for kubelet to be up for long enough to process CNI cache files.
# Globals:   KUBELET_UPTIME_MINUTES (read) - required uptime in minutes
# Returns:   0 as soon as the reported uptime reaches the minimum,
#            1 when it still has not after 31 checks taken 30 s apart
function check_kubelet {
    local -r max_retries=30
    local retries=0
    local uptime remaining
    while [[ ${retries} -le ${max_retries} ]]; do
        uptime=$(kubelet_uptime)
        # Anything but a plain number (e.g. empty output) counts as
        # "not up yet" instead of breaking the comparisons below.
        [[ ${uptime} =~ ^[0-9]+$ ]] || uptime=0
        if [[ ${uptime} -ge ${KUBELET_UPTIME_MINUTES} ]]; then
            return 0
        fi
        remaining=$((KUBELET_UPTIME_MINUTES - uptime))
        LOG "Waiting for kubelet to be up for ${remaining} minutes ..."
        retries=$((retries + 1))
        # No point in sleeping once the last check has been made.
        if [[ ${retries} -le ${max_retries} ]]; then
            sleep 30
        fi
    done
    return 1
}
# Print the usage text on stderr and exit with status 2.
function usage {
    echo "usage: ${0##*/} [-d] [-o older_than_hours]" >&2
    exit 2
}

# Parse the command line options.
#   -o HOURS  only process cache files that are at least HOURS old
#   -d        delete stale cache files instead of only reporting them
# Globals:   OLDERTHANHOURS, DELETE (written)
# Arguments: the script's arguments
# Any unknown option, missing argument or non-numeric HOURS ends the
# script through usage. getopts only handles short options.
function parse_options {
    local OPT OPTARG
    local OPTIND=1
    while getopts :o:d OPT; do
        case ${OPT} in
            o)
                # Reject anything but a plain number: the value is later
                # used in numeric comparisons, where an arbitrary string
                # would silently count as 0.
                if [[ ! ${OPTARG} =~ ^[0-9]+$ ]]; then
                    echo "${0##*/}: invalid value for -o: '${OPTARG}'" >&2
                    usage
                fi
                # Base 10 is forced so that e.g. "08" is not read as octal.
                OLDERTHANHOURS=$((10#${OPTARG}))
                ;;
            d)
                DELETE="yes"
                ;;
            *)
                usage
                ;;
        esac
    done
}

parse_options "$@"
# Main: once kubelet has been up long enough, walk both CNI cache
# directories and report (or, with -d, delete) every cache file whose pod
# no longer exists and which is at least OLDERTHANHOURS hours old.
if ! check_kubelet; then
    LOG "Kubelet must be up for a minimum of ${KUBELET_UPTIME_MINUTES} minutes. Not running CNI cache cleanup."
    exit 1
fi

for f in "${RESULTSDIR}"/* "${MULTUSDIR}"/*; do
    cacheid=$(cni_cache_file_to_pod_id "${f}")
    if [[ ${#cacheid} -ne ${POD_ID_LENGTH} ]]; then
        # Unrecognized file pattern, skip. This also covers the literal
        # glob pattern that is left behind when a directory is empty.
        continue
    fi

    existing_podname=$(get_pod "${cacheid}")
    if [[ -n ${existing_podname} ]]; then
        LOG "Pod ${existing_podname} exists. Not cleaning up CNI cache file(s)."
        continue
    fi

    age=$(file_age "${f}")
    if [[ ! ${age} =~ ^[0-9]+$ ]]; then
        # The age could not be determined (e.g. the file disappeared in
        # the meantime). Skip it: an empty value in the arithmetic below
        # would otherwise abort the whole loop.
        continue
    fi
    if [[ -z $(check_cache_file_age "${age}") ]]; then
        LOG "Stale CNI cache file ${f} detected. Cleanup to occur after $((OLDERTHANHOURS - age)) hour(s)."
        continue
    fi

    # The pod name is stored inside the cache file, so it has to be read
    # BEFORE the file is removed.
    cache_podname=$(cache_file_to_pod_name "${f}")

    if [[ "${DELETE}" == "yes" ]]; then
        if ! rm -f -- "${f}"; then
            ERROR "Failed to delete stale CNI cache file ${f}."
            continue
        fi
        action="Deleted"
    else
        action="Detected"
    fi
    LOG "${action} stale CNI cache file ${f}: [age: ${age} hours old, podname: ${cache_podname}]."
done

View File

@ -0,0 +1,27 @@
# RPM spec for the k8s-cni-cache-cleanup utility.
# The package consists of a single script (Source0) that is installed,
# unmodified, into /usr/local/sbin. There is nothing to unpack or build.
Name: k8s-cni-cache-cleanup
Version: 1.0
Release: 0%{?_tis_dist}.%{tis_patch_ver}
Summary: Kubernetes CNI Cache Cleanup Utility
License: Apache-2.0
Group: base
Packager: Wind River <info@windriver.com>
URL: unknown
Source0: k8s-cni-cache-cleanup
# The script is a bash script.
Requires: /bin/bash
%description
%{summary}
%define local_dir /usr/local
%define local_sbindir %{local_dir}/sbin
%prep
%install
# Create the target directory, then install the script as an executable.
install -d %{buildroot}%{local_sbindir}
install -m 755 %{SOURCE0} %{buildroot}%{local_sbindir}/k8s-cni-cache-cleanup
%files
%defattr(-,root,root,-)
%{local_sbindir}/k8s-cni-cache-cleanup

View File

@ -19,7 +19,7 @@
. /etc/platform/platform.conf
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin
export KUBECONFIG=/etc/kubernetes/admin.conf
CONF_DIR=/etc/k8s-post-recovery.d
SLEEP_DELAY_SEC=15
@ -74,6 +74,16 @@ function _wait_for_systemd {
done
}
# Run the CNI cache cleanup utility in delete mode for cache files that
# are not associated with any running pod and are at least 1 hour old.
# A failing run is logged as an error.
function _do_cni_cache_cleanup {
    LOG "Starting CNI cache cleanup..."
    if ! k8s-cni-cache-cleanup -o 1 -d; then
        ERROR "Failed to run CNI cache cleanup."
    fi
}
function _wait_for_pod_stabilization {
local extra_args=$1
@ -298,6 +308,7 @@ function start {
_wait_for_systemd
_examine_pods 'recover'
_examine_pods 'verify'
_do_cni_cache_cleanup
}
function stop {