Wait for pods to terminate when host is locked
When a host is locked, the VIM adds the NoExecute taint to the host, which causes the pods on the host to be terminated. The VIM currently does not wait for the pods to terminate, so if the host reboots before the pods are gone (e.g. if an unlock is done quickly after the lock), pod recovery can fail. With this change, the VIM waits up to 3 minutes for pods to terminate. If pods are still terminating at that point, the VIM either fails the lock (normal case) or allows the lock to proceed (force lock case).

Change-Id: I9f402ab6dc29ecc314bbd1a058ae3d79ea8728fe
Closes-Bug: 1846818
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
Parent: 6578b7a3b1
Commit: 3250993b82
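The change is easiest to picture as a bounded poll: keep asking Kubernetes whether any pods on the host still carry a deletion timestamp, and give up after three minutes. A minimal sketch of that behavior in plain Python follows; list_terminating_pods() is a hypothetical stand-in for the kubernetes_client helper added below, and the real VIM drives this from its event-driven state machine rather than a blocking loop.

import time

POD_TERMINATE_TIMEOUT_SECS = 180  # the 3-minute budget described above


def list_terminating_pods(node_name):
    # Hypothetical stand-in for kubernetes_client.get_terminating_pods():
    # return the names of pods on node_name whose metadata.deletion_timestamp
    # is set.
    return []


def wait_for_pods_to_terminate(node_name, force_lock, poll_interval=10):
    """Poll until no pods on the node are terminating, or time out."""
    deadline = time.monotonic() + POD_TERMINATE_TIMEOUT_SECS
    while time.monotonic() < deadline:
        if not list_terminating_pods(node_name):
            return True  # nothing terminating; the lock can proceed
        time.sleep(poll_interval)

    # Timed out with pods still terminating: a force lock passes anyway,
    # a normal lock fails.
    return force_lock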
@@ -1 +1 @@
-TIS_PATCH_VER=76
+TIS_PATCH_VER=77
@@ -175,3 +175,26 @@ def mark_all_pods_not_ready(node_name, reason):
                                       pod.metadata.namespace))
                     break
     return
+
+
+def get_terminating_pods(node_name):
+    """
+    Get all pods on a node that are terminating
+    """
+    # Get the client.
+    kube_client = get_client()
+
+    # Retrieve the pods on the specified node.
+    response = kube_client.list_namespaced_pod(
+        "", field_selector="spec.nodeName=%s" % node_name)
+
+    terminating_pods = list()
+    pods = response.items
+    if pods is not None:
+        for pod in pods:
+            # The presence of the deletion_timestamp indicates the pod is
+            # terminating.
+            if pod.metadata.deletion_timestamp is not None:
+                terminating_pods.append(pod.metadata.name)
+
+    return Result(','.join(terminating_pods))
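A short usage sketch for the helper above: it returns a Result whose result_data is a comma-separated string of terminating pod names (an empty string means nothing is terminating, as the unit tests below assert). The import path is assumed from the nfv_plugins package layout, not confirmed by this diff.

# Usage sketch, assuming a reachable cluster and the nfv_plugins package
# layout (import path is an assumption).
from nfv_plugins.nfvi_plugins.clients import kubernetes_client

result = kubernetes_client.get_terminating_pods("compute-0")
if result.result_data == '':
    print("no pods terminating on compute-0")
else:
    print("still terminating: %s" % result.result_data)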
@@ -2110,6 +2110,36 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
             callback.send(response)
             callback.close()

+    def get_terminating_pods(self, future, host_name, callback):
+        """
+        Get list of terminating pods on a host
+        """
+        response = dict()
+        response['completed'] = False
+        response['reason'] = ''
+
+        try:
+            future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
+
+            future.work(kubernetes_client.get_terminating_pods, host_name)
+            future.result = (yield)
+
+            if not future.result.is_complete():
+                DLOG.error("Kubernetes get_terminating_pods failed, operation "
+                           "did not complete, host_name=%s" % host_name)
+                return
+
+            response['result-data'] = future.result.data
+            response['completed'] = True
+
+        except Exception as e:
+            DLOG.exception("Caught exception while trying to get "
+                           "terminating pods on %s, error=%s." % (host_name, e))
+
+        finally:
+            callback.send(response)
+            callback.close()
+
     def host_rest_api_get_handler(self, request_dispatch):
         """
         Host Rest-API GET handler callback
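The plugin method above is a generator-based coroutine: it schedules the kubernetes_client call on a future, suspends at (yield), and is resumed with the result. A minimal, self-contained sketch of that pattern follows; Future and drive() are hypothetical stand-ins for the VIM's dispatcher, not its real API.

# Minimal sketch of the generator/future pattern used above.
class Future(object):
    def __init__(self):
        self.result = None
        self._pending = None

    def set_timeouts(self, timeouts):
        pass  # the real future applies per-call timeouts here

    def work(self, target, *args):
        self._pending = (target, args)  # schedule work for the dispatcher


def get_data(future, name, callback):
    future.work(str.upper, name)  # schedule work on the future
    future.result = (yield)       # suspend until resumed with the result
    callback(future.result)


def drive(method, future, *args, **kwargs):
    gen = method(future, *args, **kwargs)
    next(gen)                         # run up to the first (yield)
    target, call_args = future._pending
    try:
        gen.send(target(*call_args))  # do the work, resume the coroutine
    except StopIteration:
        pass


future = Future()
drive(get_data, future, "compute-0", print)  # prints "COMPUTE-0"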
@@ -310,3 +310,125 @@ class TestNFVPluginsK8SMarkAllPodsNotReady(testcase.NFVTestCase):
         self.mock_patch_namespaced_pod_status.assert_called_with(
             "test-pod-ready", "test-namespace-1", mock.ANY)
         self.mock_patch_namespaced_pod_status.assert_called_once()
+
+
+@mock.patch('kubernetes.config.load_kube_config', mock_load_kube_config)
+class TestNFVPluginsK8SGetTerminatingPods(testcase.NFVTestCase):
+
+    list_namespaced_pod_result = {
+        'test-node-1': kubernetes.client.V1PodList(
+            api_version="v1",
+            items=[
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-not-terminating",
+                        namespace="test-namespace-1",
+                        deletion_timestamp=None)
+                ),
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-terminating",
+                        namespace="test-namespace-1",
+                        deletion_timestamp="2019-10-03T16:54:25Z")
+                ),
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-not-terminating-2",
+                        namespace="test-namespace-1",
+                        deletion_timestamp=None)
+                )
+            ]
+        ),
+        'test-node-2': kubernetes.client.V1PodList(
+            api_version="v1",
+            items=[
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-not-terminating",
+                        namespace="test-namespace-1",
+                        deletion_timestamp=None)
+                ),
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-not-terminating-2",
+                        namespace="test-namespace-1",
+                        deletion_timestamp=None)
+                )
+            ]
+        ),
+        'test-node-3': kubernetes.client.V1PodList(
+            api_version="v1",
+            items=[
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-not-terminating",
+                        namespace="test-namespace-1",
+                        deletion_timestamp=None)
+                ),
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-terminating",
+                        namespace="test-namespace-1",
+                        deletion_timestamp="2019-10-03T16:54:25Z")
+                ),
+                kubernetes.client.V1Pod(
+                    api_version="v1",
+                    kind="Pod",
+                    metadata=kubernetes.client.V1ObjectMeta(
+                        name="test-pod-terminating-2",
+                        namespace="test-namespace-1",
+                        deletion_timestamp="2019-10-03T16:55:25Z")
+                )
+            ]
+        )
+    }
+
+    def setUp(self):
+        super(TestNFVPluginsK8SGetTerminatingPods, self).setUp()
+
+        def mock_list_namespaced_pod(obj, namespace, field_selector=""):
+            node_name = field_selector.split('spec.nodeName=', 1)[1]
+            return self.list_namespaced_pod_result[node_name]
+
+        self.mocked_list_namespaced_pod = mock.patch(
+            'kubernetes.client.CoreV1Api.list_namespaced_pod',
+            mock_list_namespaced_pod)
+        self.mocked_list_namespaced_pod.start()
+
+    def tearDown(self):
+        super(TestNFVPluginsK8SGetTerminatingPods, self).tearDown()
+
+        self.mocked_list_namespaced_pod.stop()
+
+    def test_get_terminating_with_terminating(self):
+
+        result = kubernetes_client.get_terminating_pods("test-node-1")
+
+        assert result.result_data == 'test-pod-terminating'
+
+    def test_get_terminating_no_terminating(self):
+
+        result = kubernetes_client.get_terminating_pods("test-node-2")
+
+        assert result.result_data == ''
+
+    def test_get_terminating_with_two_terminating(self):
+
+        result = kubernetes_client.get_terminating_pods("test-node-3")
+
+        assert result.result_data == \
+            'test-pod-terminating,test-pod-terminating-2'
@@ -981,6 +981,144 @@ class DisableHostServicesTaskWork(state_machine.StateTaskWork):
         return state_machine.STATE_TASK_WORK_RESULT.WAIT, empty_reason


+class WaitHostServicesDisabledTaskWork(state_machine.StateTaskWork):
+    """
+    Wait Host Services Disabled Task Work
+    """
+    def __init__(self, task, host, service):
+        super(WaitHostServicesDisabledTaskWork, self).__init__(
+            'wait-host-services-disabled_%s_%s' % (host.name, service), task,
+            timeout_in_secs=180)
+        self._host_reference = weakref.ref(host)
+        self._service = service
+        self._query_inprogress = False
+        self._start_timestamp = None
+
+    @property
+    def _host(self):
+        """
+        Returns the host
+        """
+        host = self._host_reference()
+        return host
+
+    def timeout(self):
+        """
+        Handle task work timeout
+        """
+        if self._host.is_force_lock():
+            DLOG.info("Wait-Host-Services-Disabled timeout for %s, "
+                      "force-locking, passing." % self._host.name)
+            return state_machine.STATE_TASK_WORK_RESULT.SUCCESS, empty_reason
+
+        if not self._host.has_reason():
+            self._host.update_failure_reason(
+                "Terminating pods on disabled host %s timed out." %
+                self._host.name)
+
+        return state_machine.STATE_TASK_WORK_RESULT.TIMED_OUT, empty_reason
+
+    @coroutine
+    def _get_callback(self):
+        """
+        Callback for get terminating pods
+        """
+        response = (yield)
+        self._query_inprogress = False
+
+        if self.task is not None:
+            DLOG.verbose("Get-Terminating-Pods callback for service: "
+                         "%s %s, response=%s." %
+                         (self._service, self._host.name, response))
+
+            if response['completed']:
+                if response['result-data'] != '':
+                    DLOG.info("Get-Terminating-Pods callback for %s, "
+                              "pods %s terminating" %
+                              (self._service, response['result-data']))
+                    return
+
+                # An empty response means no terminating pods exist.
+                DLOG.info("Get-Terminating-Pods callback for %s, "
+                          "no pods are terminating" % self._service)
+                self.task.task_work_complete(
+                    state_machine.STATE_TASK_WORK_RESULT.SUCCESS,
+                    empty_reason)
+            else:
+                DLOG.info("Get-Terminating-Pods callback for %s, "
+                          "failed" % self._host.name)
+
+    @coroutine
+    def _extend_callback(self):
+        """
+        Callback for host services disable extend
+        """
+        response = (yield)
+        if response['completed']:
+            DLOG.info("Extended host services disable timeout for host %s."
+                      % self._host.name)
+            self._host.disable_extend_timestamp = \
+                timers.get_monotonic_timestamp_in_ms()
+        else:
+            DLOG.error("Failed to extend host services disable timeout for "
+                       "host %s." % self._host.name)
+
+    def run(self):
+        """
+        Run wait host services disabled
+        """
+        from nfv_vim import objects
+
+        DLOG.verbose("Wait-Host-Services-Disabled for %s for service %s." %
+                     (self._host.name, self._service))
+
+        now_ms = timers.get_monotonic_timestamp_in_ms()
+        self._host.disable_extend_timestamp = now_ms
+        self._start_timestamp = now_ms
+
+        if self._service == objects.HOST_SERVICES.CONTAINER:
+            # We will wait for a bit before doing our first query to ensure
+            # kubernetes has had time to start terminating the pods.
+            return state_machine.STATE_TASK_WORK_RESULT.WAIT, empty_reason
+        else:
+            reason = ("Trying to wait for unknown host service %s" %
+                      self._service)
+            DLOG.error(reason)
+            self._host.update_failure_reason(reason)
+            return state_machine.STATE_TASK_WORK_RESULT.FAILED, reason
+
+    def handle_event(self, event, event_data=None):
+        """
+        Handle events while waiting for host services to be disabled
+        """
+        from nfv_vim import objects
+
+        handled = False
+        if HOST_EVENT.PERIODIC_TIMER == event:
+            if self._service == objects.HOST_SERVICES.CONTAINER:
+                if not self._query_inprogress:
+                    now_ms = timers.get_monotonic_timestamp_in_ms()
+                    elapsed_secs = (now_ms - self._start_timestamp) / 1000
+                    # Wait 10s before doing our first query
+                    if 10 <= elapsed_secs:
+                        DLOG.verbose("Wait-Host-Services-Disabled for %s for "
+                                     "service %s. Doing query." %
+                                     (self._host.name, self._service))
+                        self._query_inprogress = True
+                        nfvi.nfvi_get_terminating_pods(self._host.name,
+                                                       self._get_callback())
+            handled = True
+
+        elif HOST_EVENT.AUDIT == event:
+            now_ms = timers.get_monotonic_timestamp_in_ms()
+            elapsed_secs = (now_ms - self._host.disable_extend_timestamp) / 1000
+            if 120 <= elapsed_secs:
+                nfvi.nfvi_notify_host_services_disable_extend(
+                    self._host.uuid, self._host.name, self._extend_callback())
+
+        return handled
+
+
 class NotifyHostServicesEnabledTaskWork(state_machine.StateTaskWork):
     """
     Notify Host Services Enabled Task Work
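The task work above juggles three clocks: the 180 s task timeout, a 10 s grace period before the first terminating-pods query, and a 120 s audit interval after which it asks host services to extend the disable timeout. A small stdlib-only sketch of that elapsed-time bookkeeping, with monotonic timestamps in milliseconds in the style of timers.get_monotonic_timestamp_in_ms():

import time


def monotonic_ms():
    # Sketch of timers.get_monotonic_timestamp_in_ms() using the stdlib.
    return time.monotonic() * 1000


start_ms = monotonic_ms()
extend_ms = monotonic_ms()

# On each periodic-timer event: query only after a 10 s grace period,
# giving kubernetes time to start terminating pods (as run() notes).
if (monotonic_ms() - start_ms) / 1000 >= 10:
    print("issue get_terminating_pods query")

# On each audit event: after 120 s, ask host services to extend the
# disable timeout so the 180 s task budget is not cut short externally.
if (monotonic_ms() - extend_ms) / 1000 >= 120:
    print("send host services disable extend")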
@@ -28,6 +28,7 @@ from nfv_vim.host_fsm._host_task_work import NotifyInstancesHostDisabledTaskWork
 from nfv_vim.host_fsm._host_task_work import NotifyInstancesHostDisablingTaskWork
 from nfv_vim.host_fsm._host_task_work import QueryHypervisorTaskWork
 from nfv_vim.host_fsm._host_task_work import WaitHostServicesCreatedTaskWork
+from nfv_vim.host_fsm._host_task_work import WaitHostServicesDisabledTaskWork

 DLOG = debug.debug_get_logger('nfv_vim.state_machine.host_task')

@@ -245,6 +246,8 @@ class DisableHostTask(state_machine.StateTask):
             if not sw_mgmt_director.single_controller:
                 task_work_list.append(DisableHostServicesTaskWork(
                     self, host, objects.HOST_SERVICES.CONTAINER))
+            task_work_list.append(WaitHostServicesDisabledTaskWork(
+                self, host, objects.HOST_SERVICES.CONTAINER))
             task_work_list.append(notify_host_services_task(
                 self, host, force_pass=True))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@@ -101,6 +101,7 @@ from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_hosts  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_logs  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_system_info  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_system_state  # noqa: F401
+from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_terminating_pods  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_upgrade  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_lock_host  # noqa: F401
 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_notify_host_failed  # noqa: F401
@@ -278,6 +278,15 @@ def nfvi_get_alarm_history(start_period, end_period, callback):
     return cmd_id


+def nfvi_get_terminating_pods(host_name, callback):
+    """
+    Get terminating pods
+    """
+    cmd_id = _infrastructure_plugin.invoke_plugin('get_terminating_pods',
+                                                  host_name, callback=callback)
+    return cmd_id
+
+
 def nfvi_register_host_add_callback(callback):
     """
     Register for host add notifications
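A hedged sketch of how a caller consumes this entry point: the callback is a primed coroutine that receives the response dict built by the plugin (with 'completed' and 'result-data' fields, per the plugin code above). The coroutine decorator here is a stand-in written for this sketch, not the VIM's own helper.

def coroutine(func):
    # Stand-in for the VIM's coroutine decorator: create the generator
    # and advance it to the first (yield) so it is ready to receive.
    def wrapper(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)
        return gen
    return wrapper


@coroutine
def terminating_pods_callback():
    response = (yield)
    if response['completed'] and response['result-data'] == '':
        print("no pods terminating; lock can complete")
    else:
        print("still waiting: %s" % response.get('result-data'))


# Usage (sketch): nfvi_get_terminating_pods("compute-0",
#                                           terminating_pods_callback())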