Wait for SRIOV device plugin before recovering labeled pods

This change modifies the k8s-pod-recovery service to wait for the
kube-sriov-device-plugin-amd64 pod on the local node to become
available before proceeding with the recovery of
restart-on-reboot=true labeled pods.

This is required because of a race condition where pods marked for
recovery would be restarted before the device plugin was ready and
the pods would then be stuck in "ContainerCreating".

The fix in this commit uses the kubectl wait ...
command to wait for the daemonset to be available. A timeout of 360s
has been set for this command in order to allow enough time on busy
systems for the device-plugin pod to come up. The wait command
completes as soon as the pod is ready.

Closes-Bug: 1928965

Signed-off-by: Cole Walker <cole.walker@windriver.com>
Change-Id: Ie1937cf0612827b28762049e2dc440e55726d4f3
This commit is contained in:
Cole Walker 2021-05-31 14:39:05 -04:00
parent b310077093
commit 6c61e3b665

View File

@ -161,8 +161,21 @@ function _labeled_pods {
STABILITY_COUNT=6
_wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
# Delete pods with the restart-on-reboot=true label
PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
# Don't have to restart device-plugin if no labeled pods are present. System may not be configured for SRIOV.
if [ ! -z "${PODS}" ]; then
LOG "Waiting for SRIOV device plugin pod to become available"
kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
if [ "$?" -ne 0 ]; then
ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
fi
fi
# Delete pods with the restart-on-reboot=true label
for pod in $PODS; do
LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
kubectl delete pods -n ${pod//// } --wait=false