Test kube-api server availability for a longer time

It was observed that during dead office recovery (when all nodes restart simultaneously) the time the kube-api-server takes to become available can exceed the duration of the current wait loops used to check pod stabilization. This can cause the pod operations that are needed to recover the pods to normal operation to fail. This change adds an initial, dedicated loop that waits for the API server to become available. If the server does not become available in time, the script exits and the service is marked as failed, which triggers new attempts to execute it (with a 10s interval between service executions). A 5-minute wait was chosen to give the script a better chance of finishing without needing systemd to restart the service. The availability check follows the instructions in: https://kubernetes.io/docs/reference/using-api/health-checks/ Test Plan: install AIO-DX and: [PASS] execute node lock/unlock and observe that the service executes correctly; kube-api is immediately available. [PASS] execute simultaneous node reboots and observe that the service keeps waiting for the kube-api server for some time (in vbox it was 2.5min) and then executes the script. Closes-Bug: 2089864 Change-Id: I0d77da1735ecb829ab1da013fe93431688e4cb97 Signed-off-by: Andre Kantek <andrefernandozanella.kantek@windriver.com>
2024-11-28 14:27:29 -03:00 · 2024-11-28 14:27:29 -03:00 · 1c628f50a6
commit 1c628f50a6
parent a2e71bf7ad
2 changed files with 36 additions and 2 deletions
--- a/kubernetes/k8s-pod-recovery/files/k8s-pod-recovery
+++ b/kubernetes/k8s-pod-recovery/files/k8s-pod-recovery
@ -106,6 +106,26 @@ function _wait_for_pod_stabilization {
    done
 }

+function _wait_for_kubeapi_server {
+
+    local time_between_polls=${1}
+    local attempt_cycles=${2}
+
+    attempt_count=0
+    while [[ ${attempt_count} -lt ${attempt_cycles} ]] ; do
+        api_status=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get --raw "/readyz")
+        if [[ ${api_status} == "ok" ]]; then
+            LOG "kube-api server available, status=${api_status}"
+            return 0
+        else
+            LOG "kube-api server not available, attempt[count=${attempt_count}, cycles=${attempt_cycles}]"
+            attempt_count=$((attempt_count+1))
+        fi
+        sleep "${time_between_polls}"
+    done
+    return 1
+}
+
 function _unknown_pods {
    # $1: actions <recover|verify>

@ -336,8 +356,20 @@ function start {
    LOG "Starting."

    _wait_for_systemd
-    _examine_pods 'recover'
-    _examine_pods 'verify'
+
+    # check if kube-api server is available before trying to use kubectl
+    # wait is up to 5 min to consider dead office recover for the active
+    # controller
+    _wait_for_kubeapi_server $SLEEP_DELAY_SEC 20
+    if [ $? -eq 0 ]; then
+        LOG "kube-api-server is available, start pod examination"
+        _examine_pods 'recover'
+        _examine_pods 'verify'
+    else
+        LOG "kube-api-server is not available, exit for systemd to restart on failure"
+        exit 1
+    fi
+
    _do_cni_cache_cleanup
 }

--- a/kubernetes/k8s-pod-recovery/files/k8s-pod-recovery.service
+++ b/kubernetes/k8s-pod-recovery/files/k8s-pod-recovery.service
@ -9,6 +9,8 @@ Type=simple
 ExecStart=/usr/local/sbin/k8s-pod-recovery start
 ExecStop=/usr/local/sbin/k8s-pod-recovery stop
 PIDFile=/var/run/k8s-pod-recovery.pid
+Restart=on-failure
+RestartSec=10s

 [Install]
 WantedBy=multi-user.target