Clear pods in OutOfhugepages state

Following an upgrade, some pods that use
hugepages remain in the Running state but
leave behind a replica that stays stuck in
the OutOfhugepages state.

k8s-pod-recovery now detects those pods and
deletes them so their controllers can
recreate them.
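
For reference, the detection that the new
_outofhugepages_pods function performs amounts
to the sketch below (HOST is assumed to hold
this node's name, as elsewhere in the script):

    # Sketch only: list pods on this node stuck in OutOfhugepages
    HOST="${HOST:-$(hostname)}"
    for ns in $(kubectl get ns --no-headers | awk '{print $1}'); do
        kubectl get pods -n "$ns" --field-selector spec.nodeName="${HOST}" 2>/dev/null \
            | awk '/OutOfhugepages/{print $1}'
    done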

Closes-bug: 1943113
Signed-off-by: Daniel Safta <daniel.safta@windriver.com>
Change-Id: Idba510cabd66cd8b796563e3e6efa9baa5b4401e

@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2020 Wind River Systems, Inc.
+# Copyright (c) 2020-2021 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -131,6 +131,35 @@ function _unknown_pods {
     fi
 }
 
+function _outofhugepages_pods {
+    # $1: actions <recover|verify>
+
+    # Target all namespaces and pods on this host
+    NAMESPACES=$(kubectl get ns | tail -n +2 | awk '{ print $1 }')
+
+    if [ "$1" == 'recover' ]; then
+        # Recovers pods that are: Running/OutOfhugepages
+        for ns in ${NAMESPACES[@]}; do
+            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
+            for pod in $PODS ; do
+                LOG "OutOfhugepages pods: Recovering: $ns/$pod"
+                kubectl delete pods -n $ns $pod --wait=false
+            done
+        done
+    elif [ "$1" == 'verify' ]; then
+        for ns in ${NAMESPACES[@]}; do
+            PODS=$(kubectl get pods -n $ns --field-selector spec.nodeName=${HOST} 2>/dev/null | awk /OutOfhugepages/'{print $1}')
+            if [ -z "${PODS}" ]; then
+                LOG "OutOfhugepages pods: None present for namespace: $ns"
+            else
+                ERROR "OutOfhugepages pods: still present for namespace: $ns"
+            fi
+        done
+    else
+        ERROR "Unknown action: $1"
+    fi
+}
+
 function _node_affinity_pods {
     # $1: actions <recover|verify>
@@ -170,12 +199,12 @@ function _labeled_pods {
     # Check if device-plugin is ready, but do not wait
     kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s
 
     # If device plugin is not ready, restart it and wait
     if [ "$?" -ne 0 ]; then
         kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
         kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
         if [ "$?" -ne 0 ]; then
             ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
         fi
@@ -256,6 +285,7 @@ function _examine_pods {
     _unknown_pods $1
     _node_affinity_pods $1
     _force_reset_pods $1
+    _outofhugepages_pods $1
 }
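
A design note on the recover path: deleting with
--wait=false returns as soon as the deletion is
accepted, so the loop never blocks on pod
termination; the pod's owning controller then
recreates it, and the replacement is admitted
normally once the node reports its hugepages
capacity correctly. A hypothetical one-off
cleanup for a single stuck pod (namespace and
pod name are placeholders) would be:

    kubectl delete pod -n <namespace> <pod-name> --wait=false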