From b428a5de0070c6df82536b8b5b782810ebd9efda Mon Sep 17 00:00:00 2001
From: Cole Walker
Date: Wed, 19 May 2021 16:57:54 +0000
Subject: [PATCH] Revert "Remove recover operations to "restart-on-reboot" pods"

This reverts commit 8abcbf6fb1951b25e9964933558b75b9aff88135.

Reason for revert: After performing a backup and restore on an AIO-SX
system, SRIOV pods do not return to a running state and are instead
stuck in "container creating". The workaround for this is to restart
SRIOV pods when the system unlocks.

Reverting this commit to allow users to label SRIOV pods and have them
restarted by k8s-pod-recovery. Labelled pods will be restarted by
k8s-pod-recovery and will be running after backup and restore is
completed.

This change has been tested by performing backup and restore on an
AIO-SX system. SRIOV pods now come up correctly when labelled with
restart-on-reboot=true

Closes-Bug: 1928965

Signed-off-by: Cole Walker
Change-Id: I9c520c0a47aabca7b96e50adf0f71742f4199c2f
---
 .../centos/files/k8s-pod-recovery | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery
index ac38737f4..17ca252ee 100755
--- a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery
+++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery
@@ -153,6 +153,36 @@ function _node_affinity_pods {
 
 }
 
+function _labeled_pods {
+    # $1: actions
+
+    if [ "$1" == 'recover' ]; then
+        POLLING_INTERVAL=5
+        STABILITY_COUNT=6
+        _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
+
+        # Delete pods with the restart-on-reboot=true label
+        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
+        for pod in $PODS; do
+            LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
+            kubectl delete pods -n ${pod//// } --wait=false
+        done
+    elif [ "$1" == 'verify' ]; then
+        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
+        for pod in $PODS; do
+            LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
+            STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
+            if [[ "${STATUS}" != "Running" ]]; then
+                ERROR "$pod: not recovered: $STATUS"
+            else
+                LOG "$pod: recovered"
+            fi
+        done
+    else
+        ERROR "Unknown action: $1"
+    fi
+}
+
 function _force_reset_pods {
     # $1: actions
 
@@ -196,6 +226,9 @@ function _force_reset_pods {
 function _examine_pods {
     # $1: actions
 
+    # Manage labeled pods first
+    _labeled_pods $1
+
     # Wait for pods transitions to stop
     _wait_for_pod_stabilization "" $SLEEP_DELAY_SEC 6
 