Handle labeled pods after stabilization

Pods that belong to a k8s deployment, daemonset, etc. can be labeled
with restart-on-reboot="true", which causes them to be restarted
automatically after the worker manifest has completed on an AIO
system. It may happen, however, that the k8s-pod-recovery service
starts before those pods have been scheduled and created on the node
the script is running on, in which case they are never restarted. The
fix is to wait for the labeled pods to stabilize before restarting
them.
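
For reference, the pods this service acts on are those matching the
restart-on-reboot=true label selector on the local node. A minimal
sketch of listing them by hand, mirroring the selector the script
uses below (the node name controller-0 is an example, not part of
this change):

    # List pods on a given node that carry the restart-on-reboot label.
    kubectl get pods --all-namespaces --no-headers \
        --selector=restart-on-reboot=true \
        --field-selector=spec.nodeName=controller-0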

Closes-Bug: 1900920
Signed-off-by: Douglas Henrique Koerich <douglashenrique.koerich@windriver.com>
Change-Id: I5c73bd838ab2be070bd40bea9e315dcf3852e47f
Douglas Henrique Koerich 2021-03-11 09:12:49 -05:00
parent 3d8ffbc2e8
commit 6169cc5d81


@@ -74,23 +74,24 @@ function _wait_for_systemd {
 }
 
 function _wait_for_pod_stabilization {
+    local extra_args=$1
+    local time_between_polls=$2
+    local stable_cycles=$3
+
     last_count=0
     stability_count=0
-    NINETY_SEC_COUNT=$((90/SLEEP_DELAY_SEC))
-    while true ; do
-        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces | grep -v -e Running -e Completed | wc -l)
+    while [[ $stability_count -lt $stable_cycles ]] ; do
+        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces $extra_args | grep -v -e Running -e Completed | wc -l)
         if [[ $pods_in_flux -ne $last_count ]]; then
             LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
             last_count=$pods_in_flux
             stability_count=0
         else
-            LOG "Pods transitions are stable... for $((stability_count*${SLEEP_DELAY_SEC})) seconds."
-            if [[ $stability_count -eq $NINETY_SEC_COUNT ]]; then
-                break
-            fi
+            LOG "Pods transitions are stable... for $((stability_count*time_between_polls)) seconds."
             stability_count=$((stability_count+1))
         fi
-        sleep ${SLEEP_DELAY_SEC}
+        sleep $time_between_polls
     done
 }
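
With the loop parameterized, each caller picks the poll interval and
how many consecutive stable polls count as stabilized; for the
labeled-pod path below that is 6 cycles of 5 seconds, i.e. stability
is declared after roughly 30 seconds with no change in the count of
not-Running/Completed pods. A sketch of an equivalent standalone
call, with values taken from the hunks below:

    # Poll every 5 s; declare stability after 6 consecutive polls
    # with an unchanged pod count (~30 s in total).
    _wait_for_pod_stabilization \
        "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" 5 6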
@@ -148,15 +149,19 @@ function _node_affinity_pods {
 function _labeled_pods {
     # $1: actions <recover|verify>
 
-    # Delete pods with the restart-on-reboot=true label
-    PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
-
     if [ "$1" == 'recover' ]; then
+        POLLING_INTERVAL=5
+        STABILITY_COUNT=6
+        _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
+
+        # Delete pods with the restart-on-reboot=true label
+        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
         for pod in $PODS; do
             LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
             kubectl delete pods -n ${pod//// } --wait=false
         done
     elif [ "$1" == 'verify' ]; then
+        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
         for pod in $PODS; do
             LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
             STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
@@ -214,11 +219,11 @@ function _force_reset_pods {
 function _examine_pods {
     # $1: actions <recover|verify>
 
-    # No need to wait for pod transitions if we know the pod needs to be restarted
+    # Manage labeled pods first
     _labeled_pods $1
 
     # Wait for pods transitions to stop
-    _wait_for_pod_stabilization
+    _wait_for_pod_stabilization "" $SLEEP_DELAY_SEC 6
 
     # Check for recovery actions
     _unknown_pods $1