Periodically checks for nodes being cleaned
This patch is adding a periodic task to check for nodes waiting for the ramdisk callback when cleaning is being executed. A new configuration option called "clean_callback_timeout" was added, its value is the number of seconds that the Ironic conductor will wait for the ramdisk to doing the cleaning to contact Ironic back. Defaults to 1800. Closes-Bug: #1483120 Change-Id: Id7f9e9018b5cb2389bbe556171e7a9d46425afba
This commit is contained in:
parent
3d45795abb
commit
ea1b012e93
@ -1,5 +1,33 @@
|
||||
[DEFAULT]
|
||||
|
||||
#
|
||||
# Options defined in oslo.service.periodic_task
|
||||
#
|
||||
|
||||
# Some periodic tasks can be run in a separate process. Should
|
||||
# we run them here? (boolean value)
|
||||
#run_external_periodic_tasks=true
|
||||
|
||||
|
||||
#
|
||||
# Options defined in oslo.service.service
|
||||
#
|
||||
|
||||
# Enable eventlet backdoor. Acceptable values are 0, <port>,
|
||||
# and <start>:<end>, where 0 results in listening on a random
|
||||
# tcp port number; <port> results in listening on the
|
||||
# specified port number (and not enabling backdoor if that
|
||||
# port is in use); and <start>:<end> results in listening on
|
||||
# the smallest unused port number within the specified range
|
||||
# of port numbers. The chosen port is displayed in the
|
||||
# service's log file. (string value)
|
||||
#backdoor_port=<None>
|
||||
|
||||
# Enables or disables logging values of all registered options
|
||||
# when starting a service (at DEBUG level). (boolean value)
|
||||
#log_options=true
|
||||
|
||||
|
||||
#
|
||||
# Options defined in oslo.messaging
|
||||
#
|
||||
@ -79,12 +107,12 @@
|
||||
#
|
||||
|
||||
# Print debugging output (set logging level to DEBUG instead
|
||||
# of default WARNING level). (boolean value)
|
||||
# of default INFO level). (boolean value)
|
||||
#debug=false
|
||||
|
||||
# Print more verbose output (set logging level to INFO instead
|
||||
# of default WARNING level). (boolean value)
|
||||
#verbose=false
|
||||
# If set to false, will disable INFO logging level, making
|
||||
# WARNING the default. (boolean value)
|
||||
#verbose=true
|
||||
|
||||
# The name of a logging configuration file. This file is
|
||||
# appended to any existing logging configuration files. For
|
||||
@ -121,8 +149,8 @@
|
||||
# (Optional) Enables or disables syslog rfc5424 format for
|
||||
# logging. If enabled, prefixes the MSG part of the syslog
|
||||
# message with APP-NAME (RFC5424). The format without the APP-
|
||||
# NAME is deprecated in K, and will be removed in M, along
|
||||
# with this option. (boolean value)
|
||||
# NAME is deprecated in Kilo, and will be removed in Mitaka,
|
||||
# along with this option. (boolean value)
|
||||
#use_syslog_rfc_format=true
|
||||
|
||||
# Syslog facility to receive log lines. (string value)
|
||||
@ -167,34 +195,6 @@
|
||||
#fatal_deprecations=false
|
||||
|
||||
|
||||
#
|
||||
# Options defined in oslo.service.service
|
||||
#
|
||||
|
||||
# Enable eventlet backdoor. Acceptable values are 0, <port>,
|
||||
# and <start>:<end>, where 0 results in listening on a random
|
||||
# tcp port number; <port> results in listening on the
|
||||
# specified port number (and not enabling backdoor if that
|
||||
# port is in use); and <start>:<end> results in listening on
|
||||
# the smallest unused port number within the specified range
|
||||
# of port numbers. The chosen port is displayed in the
|
||||
# service's log file. (string value)
|
||||
#backdoor_port=<None>
|
||||
|
||||
# Enables or disables logging values of all registered options
|
||||
# when starting a service (at DEBUG level). (boolean value)
|
||||
#log_options=true
|
||||
|
||||
|
||||
#
|
||||
# Options defined in oslo.service.periodic_task
|
||||
#
|
||||
|
||||
# Some periodic tasks can be run in a separate process. Should
|
||||
# we run them here? (boolean value)
|
||||
#run_external_periodic_tasks=true
|
||||
|
||||
|
||||
#
|
||||
# Options defined in ironic.netconf
|
||||
#
|
||||
@ -561,6 +561,12 @@
|
||||
# option could be safely disabled. (boolean value)
|
||||
#clean_nodes=true
|
||||
|
||||
# Timeout (seconds) to wait for a callback from the ramdisk
|
||||
# doing the cleaning. If the timeout is reached the node will
|
||||
# be put in the "clean failed" provision state. Set to 0 to
|
||||
# disable timeout. (integer value)
|
||||
#clean_callback_timeout=1800
|
||||
|
||||
|
||||
[console]
|
||||
|
||||
@ -1370,12 +1376,12 @@
|
||||
# the same time. This option provides such compatibility - it
|
||||
# defaults to False in Liberty and can be turned on for early
|
||||
# adopters with a new installations or for testing. Please
|
||||
# note, that this option will be removed in M release.
|
||||
# (boolean value)
|
||||
# note, that this option will be removed in the Mitaka
|
||||
# release. (boolean value)
|
||||
#send_single_reply=false
|
||||
|
||||
# Qpid broker hostname. (string value)
|
||||
#qpid_hostname=ironic
|
||||
#qpid_hostname=localhost
|
||||
|
||||
# Qpid broker port. (integer value)
|
||||
#qpid_port=5672
|
||||
@ -1438,8 +1444,8 @@
|
||||
# the same time. This option provides such compatibility - it
|
||||
# defaults to False in Liberty and can be turned on for early
|
||||
# adopters with a new installations or for testing. Please
|
||||
# note, that this option will be removed in M release.
|
||||
# (boolean value)
|
||||
# note, that this option will be removed in the Mitaka
|
||||
# release. (boolean value)
|
||||
#send_single_reply=false
|
||||
|
||||
# SSL version to use (valid only if SSL enabled). Valid values
|
||||
@ -1468,7 +1474,7 @@
|
||||
|
||||
# The RabbitMQ broker address where a single node is used.
|
||||
# (string value)
|
||||
#rabbit_host=ironic
|
||||
#rabbit_host=localhost
|
||||
|
||||
# The RabbitMQ broker port where a single node is used.
|
||||
# (integer value)
|
||||
|
@ -184,6 +184,12 @@ conductor_opts = [
|
||||
'longer. In an environment where all tenants are '
|
||||
'trusted (eg, because there is only one tenant), '
|
||||
'this option could be safely disabled.')),
|
||||
cfg.IntOpt('clean_callback_timeout',
|
||||
default=1800,
|
||||
help=_('Timeout (seconds) to wait for a callback from the '
|
||||
'ramdisk doing the cleaning. If the timeout is reached '
|
||||
'the node will be put in the "clean failed" provision '
|
||||
'state. Set to 0 to disable timeout.')),
|
||||
]
|
||||
CONF = cfg.CONF
|
||||
CONF.register_opts(conductor_opts, 'conductor')
|
||||
@ -1253,6 +1259,31 @@ class ConductorManager(periodic_task.PeriodicTasks):
|
||||
task.node.conductor_affinity = self.conductor.id
|
||||
task.node.save()
|
||||
|
||||
@periodic_task.periodic_task(
|
||||
spacing=CONF.conductor.check_provision_state_interval)
|
||||
def _check_cleanwait_timeouts(self, context):
|
||||
"""Periodically checks for nodes being cleaned.
|
||||
|
||||
If a node doing cleaning is unresponsive (detected when it stops
|
||||
heart beating), the operation should be aborted.
|
||||
|
||||
:param context: request context.
|
||||
"""
|
||||
callback_timeout = CONF.conductor.clean_callback_timeout
|
||||
if not callback_timeout:
|
||||
return
|
||||
|
||||
filters = {'reserved': False,
|
||||
'provision_state': states.CLEANWAIT,
|
||||
'maintenance': False,
|
||||
'provisioned_before': callback_timeout}
|
||||
last_error = _("Timeout reached while cleaning the node. Please "
|
||||
"check if the ramdisk responsible for the cleaning is "
|
||||
"running on the node.")
|
||||
self._fail_if_in_state(context, filters, states.CLEANWAIT,
|
||||
'provision_updated_at',
|
||||
last_error=last_error)
|
||||
|
||||
@periodic_task.periodic_task(
|
||||
spacing=CONF.conductor.sync_local_state_interval)
|
||||
def _sync_local_state(self, context):
|
||||
|
@ -267,19 +267,15 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
# with previous code, otherwise nodes in CLEANING when this
|
||||
# is deployed would fail. Should be removed once the Mitaka
|
||||
# release starts.
|
||||
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
|
||||
and not node.clean_step):
|
||||
# Agent booted from prepare_cleaning
|
||||
LOG.debug('Node %s just booted to start cleaning.', node.uuid)
|
||||
manager.set_node_cleaning_steps(task)
|
||||
self._notify_conductor_resume_clean(task)
|
||||
# TODO(lucasagomes): CLEANING here for backwards compat
|
||||
# with previous code, otherwise nodes in CLEANING when this
|
||||
# is deployed would fail. Should be removed once the Mitaka
|
||||
# release starts.
|
||||
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
|
||||
and node.clean_step):
|
||||
self.continue_cleaning(task, **kwargs)
|
||||
elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
|
||||
node.touch_provisioning()
|
||||
if not node.clean_step:
|
||||
LOG.debug('Node %s just booted to start cleaning.',
|
||||
node.uuid)
|
||||
manager.set_node_cleaning_steps(task)
|
||||
self._notify_conductor_resume_clean(task)
|
||||
else:
|
||||
self.continue_cleaning(task, **kwargs)
|
||||
|
||||
except Exception as e:
|
||||
err_info = {'node': node.uuid, 'msg': msg, 'e': e}
|
||||
|
@ -1321,6 +1321,22 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
|
||||
self.assertIsNotNone(node.last_error)
|
||||
mock_cleanup.assert_called_once_with(mock.ANY)
|
||||
|
||||
def test__check_cleanwait_timeouts(self):
|
||||
self._start_service()
|
||||
CONF.set_override('clean_callback_timeout', 1, group='conductor')
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANWAIT,
|
||||
target_provision_state=states.AVAILABLE,
|
||||
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
|
||||
|
||||
self.service._check_cleanwait_timeouts(self.context)
|
||||
self.service._worker_pool.waitall()
|
||||
node.refresh()
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(states.AVAILABLE, node.target_provision_state)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
|
||||
def test_do_node_tear_down_invalid_state(self):
|
||||
self._start_service()
|
||||
# test node.provision_state is incorrect for tear_down
|
||||
|
@ -322,10 +322,12 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
|
||||
'is done. exception: LlamaException')
|
||||
|
||||
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
|
||||
@mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
'_notify_conductor_resume_clean', autospec=True)
|
||||
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps):
|
||||
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps,
|
||||
mock_touch):
|
||||
kwargs = {
|
||||
'agent_url': 'http://127.0.0.1:9999/bar'
|
||||
}
|
||||
@ -337,14 +339,18 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
self.context, self.node.uuid, shared=True) as task:
|
||||
self.passthru.heartbeat(task, **kwargs)
|
||||
|
||||
mock_touch.assert_called_once_with(mock.ANY)
|
||||
mock_notify.assert_called_once_with(mock.ANY, task)
|
||||
mock_set_steps.assert_called_once_with(task)
|
||||
# Reset mocks for the next interaction
|
||||
mock_touch.reset_mock()
|
||||
mock_notify.reset_mock()
|
||||
mock_set_steps.reset_mock()
|
||||
|
||||
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
'continue_cleaning', autospec=True)
|
||||
def test_heartbeat_continue_cleaning(self, mock_continue):
|
||||
def test_heartbeat_continue_cleaning(self, mock_continue, mock_touch):
|
||||
kwargs = {
|
||||
'agent_url': 'http://127.0.0.1:9999/bar'
|
||||
}
|
||||
@ -361,7 +367,10 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
self.context, self.node.uuid, shared=True) as task:
|
||||
self.passthru.heartbeat(task, **kwargs)
|
||||
|
||||
mock_touch.assert_called_once_with(mock.ANY)
|
||||
mock_continue.assert_called_once_with(mock.ANY, task, **kwargs)
|
||||
# Reset mocks for the next interaction
|
||||
mock_touch.reset_mock()
|
||||
mock_continue.reset_mock()
|
||||
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy',
|
||||
|
Loading…
x
Reference in New Issue
Block a user