Wait for SRIOV device plugin before recovering labeled pods
This change modifies the k8s-pod-recovery service to wait for the kube-sriov-device-plugin-amd64 pod on the local node to become available before proceeding with the recovery of restart-on-reboot=true labeled pods. This is required because of a race condition where pods marked for recovery would be restarted before the device plugin was ready and the pods would then be stuck in "ContainerCreating". The fix in this commit uses the kubectl wait ... command to wait for the daemonset to be available. A timeout of 360s has been set for this command in order to allow enough time on busy systems for the device-plugin pod to come up. The wait command completes as soon as the pod is ready. Closes-Bug: 1928965 Signed-off-by: Cole Walker <cole.walker@windriver.com> Change-Id: Ie1937cf0612827b28762049e2dc440e55726d4f3
This commit is contained in:
parent
b310077093
commit
6c61e3b665
@ -161,8 +161,21 @@ function _labeled_pods {
|
||||
STABILITY_COUNT=6
|
||||
_wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
|
||||
|
||||
# Delete pods with the restart-on-reboot=true label
|
||||
|
||||
PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
|
||||
|
||||
# Don't have to restart device-plugin if no labeled pods are present. System may not be configured for SRIOV.
|
||||
if [ ! -z "${PODS}" ]; then
|
||||
LOG "Waiting for SRIOV device plugin pod to become available"
|
||||
kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
|
||||
kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
|
||||
|
||||
if [ "$?" -ne 0 ]; then
|
||||
ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
|
||||
fi
|
||||
fi
|
||||
|
||||
# Delete pods with the restart-on-reboot=true label
|
||||
for pod in $PODS; do
|
||||
LOG "restart-on-reboot labeled pods: Recovering: ${pod//// }"
|
||||
kubectl delete pods -n ${pod//// } --wait=false
|
||||
|
Loading…
x
Reference in New Issue
Block a user