From 6c61e3b665cd7e097820185012603e1ad1e48b7c Mon Sep 17 00:00:00 2001 From: Cole Walker Date: Mon, 31 May 2021 14:39:05 -0400 Subject: [PATCH] Wait for SRIOV device plugin before recovering labeled pods This change modifies the k8s-pod-recovery service to wait for the kube-sriov-device-plugin-amd64 pod on the local node to become available before proceeding with the recovery of restart-on-reboot=true labeled pods. This is required because of a race condition where pods marked for recovery would be restarted before the device plugin was ready and the pods would then be stuck in "ContainerCreating". The fix in this commit uses the kubectl wait ... command to wait for the daemonset to be available. A timeout of 360s has been set for this command in order to allow enough time on busy systems for the device-plugin pod to come up. The wait command completes as soon as the pod is ready. Closes-Bug: 1928965 Signed-off-by: Cole Walker Change-Id: Ie1937cf0612827b28762049e2dc440e55726d4f3 --- .../centos/files/k8s-pod-recovery | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery index e6b4fba69..3c69d5a49 100755 --- a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery +++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery @@ -161,8 +161,21 @@ function _labeled_pods { STABILITY_COUNT=6 _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT - # Delete pods with the restart-on-reboot=true label + PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}') + + # Don't have to restart device-plugin if no labeled pods are present. System may not be configured for SRIOV. + if [ ! 
-z "${PODS}" ]; then + LOG "Waiting for SRIOV device plugin pod to become available" + kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false + kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s + + if [ "$?" -ne 0 ]; then + ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover." + fi + fi + + # Delete pods with the restart-on-reboot=true label for pod in $PODS; do LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }" kubectl delete pods -n ${pod//// } --wait=false