From ea1b012e93da66d28369eadf687cacf1c1d7d994 Mon Sep 17 00:00:00 2001 From: Lucas Alvares Gomes Date: Thu, 20 Aug 2015 09:54:02 +0100 Subject: [PATCH] Periodically checks for nodes being cleaned This patch is adding a periodic task to check for nodes waiting for the ramdisk callback when cleaning is being executed. A new configuration option called "clean_callback_timeout" was added, its value is the number of seconds that the Ironic conductor will wait for the ramdisk doing the cleaning to contact Ironic back. Defaults to 1800. Closes-Bug: #1483120 Change-Id: Id7f9e9018b5cb2389bbe556171e7a9d46425afba --- etc/ironic/ironic.conf.sample | 86 ++++++++++--------- ironic/conductor/manager.py | 31 +++++++ ironic/drivers/modules/agent_base_vendor.py | 22 ++--- ironic/tests/conductor/test_manager.py | 16 ++++ .../tests/drivers/test_agent_base_vendor.py | 13 ++- 5 files changed, 113 insertions(+), 55 deletions(-) diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index ca40572414..61ad724cca 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -1,5 +1,33 @@ [DEFAULT] +# +# Options defined in oslo.service.periodic_task +# + +# Some periodic tasks can be run in a separate process. Should +# we run them here? (boolean value) +#run_external_periodic_tasks=true + + +# +# Options defined in oslo.service.service +# + +# Enable eventlet backdoor. Acceptable values are 0, , +# and :, where 0 results in listening on a random +# tcp port number; results in listening on the +# specified port number (and not enabling backdoor if that +# port is in use); and : results in listening on +# the smallest unused port number within the specified range +# of port numbers. The chosen port is displayed in the +# service's log file. (string value) +#backdoor_port= + +# Enables or disables logging values of all registered options +# when starting a service (at DEBUG level). 
(boolean value) +#log_options=true + + # # Options defined in oslo.messaging # @@ -79,12 +107,12 @@ # # Print debugging output (set logging level to DEBUG instead -# of default WARNING level). (boolean value) +# of default INFO level). (boolean value) #debug=false -# Print more verbose output (set logging level to INFO instead -# of default WARNING level). (boolean value) -#verbose=false +# If set to false, will disable INFO logging level, making +# WARNING the default. (boolean value) +#verbose=true # The name of a logging configuration file. This file is # appended to any existing logging configuration files. For @@ -121,8 +149,8 @@ # (Optional) Enables or disables syslog rfc5424 format for # logging. If enabled, prefixes the MSG part of the syslog # message with APP-NAME (RFC5424). The format without the APP- -# NAME is deprecated in K, and will be removed in M, along -# with this option. (boolean value) +# NAME is deprecated in Kilo, and will be removed in Mitaka, +# along with this option. (boolean value) #use_syslog_rfc_format=true # Syslog facility to receive log lines. (string value) @@ -167,34 +195,6 @@ #fatal_deprecations=false -# -# Options defined in oslo.service.service -# - -# Enable eventlet backdoor. Acceptable values are 0, , -# and :, where 0 results in listening on a random -# tcp port number; results in listening on the -# specified port number (and not enabling backdoor if that -# port is in use); and : results in listening on -# the smallest unused port number within the specified range -# of port numbers. The chosen port is displayed in the -# service's log file. (string value) -#backdoor_port= - -# Enables or disables logging values of all registered options -# when starting a service (at DEBUG level). (boolean value) -#log_options=true - - -# -# Options defined in oslo.service.periodic_task -# - -# Some periodic tasks can be run in a separate process. Should -# we run them here? 
(boolean value) -#run_external_periodic_tasks=true - - # # Options defined in ironic.netconf # @@ -561,6 +561,12 @@ # option could be safely disabled. (boolean value) #clean_nodes=true +# Timeout (seconds) to wait for a callback from the ramdisk +# doing the cleaning. If the timeout is reached the node will +# be put in the "clean failed" provision state. Set to 0 to +# disable timeout. (integer value) +#clean_callback_timeout=1800 + [console] @@ -1370,12 +1376,12 @@ # the same time. This option provides such compatibility - it # defaults to False in Liberty and can be turned on for early # adopters with a new installations or for testing. Please -# note, that this option will be removed in M release. -# (boolean value) +# note, that this option will be removed in the Mitaka +# release. (boolean value) #send_single_reply=false # Qpid broker hostname. (string value) -#qpid_hostname=ironic +#qpid_hostname=localhost # Qpid broker port. (integer value) #qpid_port=5672 @@ -1438,8 +1444,8 @@ # the same time. This option provides such compatibility - it # defaults to False in Liberty and can be turned on for early # adopters with a new installations or for testing. Please -# note, that this option will be removed in M release. -# (boolean value) +# note, that this option will be removed in the Mitaka +# release. (boolean value) #send_single_reply=false # SSL version to use (valid only if SSL enabled). Valid values @@ -1468,7 +1474,7 @@ # The RabbitMQ broker address where a single node is used. # (string value) -#rabbit_host=ironic +#rabbit_host=localhost # The RabbitMQ broker port where a single node is used. # (integer value) diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 44d1ba8c77..1b3c40d7be 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -184,6 +184,12 @@ conductor_opts = [ 'longer. 
In an environment where all tenants are ' 'trusted (eg, because there is only one tenant), ' 'this option could be safely disabled.')), + cfg.IntOpt('clean_callback_timeout', + default=1800, + help=_('Timeout (seconds) to wait for a callback from the ' + 'ramdisk doing the cleaning. If the timeout is reached ' + 'the node will be put in the "clean failed" provision ' + 'state. Set to 0 to disable timeout.')), ] CONF = cfg.CONF CONF.register_opts(conductor_opts, 'conductor') @@ -1253,6 +1259,31 @@ class ConductorManager(periodic_task.PeriodicTasks): task.node.conductor_affinity = self.conductor.id task.node.save() + @periodic_task.periodic_task( + spacing=CONF.conductor.check_provision_state_interval) + def _check_cleanwait_timeouts(self, context): + """Periodically checks for nodes being cleaned. + + If a node doing cleaning is unresponsive (detected when it stops + heart beating), the operation should be aborted. + + :param context: request context. + """ + callback_timeout = CONF.conductor.clean_callback_timeout + if not callback_timeout: + return + + filters = {'reserved': False, + 'provision_state': states.CLEANWAIT, + 'maintenance': False, + 'provisioned_before': callback_timeout} + last_error = _("Timeout reached while cleaning the node. Please " + "check if the ramdisk responsible for the cleaning is " + "running on the node.") + self._fail_if_in_state(context, filters, states.CLEANWAIT, + 'provision_updated_at', + last_error=last_error) + @periodic_task.periodic_task( spacing=CONF.conductor.sync_local_state_interval) def _sync_local_state(self, context): diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index cb7f2a72e0..a3887b8baf 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -267,19 +267,15 @@ class BaseAgentVendor(base.VendorInterface): # with previous code, otherwise nodes in CLEANING when this # is deployed would fail. 
Should be removed once the Mitaka # release starts. - elif (node.provision_state in (states.CLEANWAIT, states.CLEANING) - and not node.clean_step): - # Agent booted from prepare_cleaning - LOG.debug('Node %s just booted to start cleaning.', node.uuid) - manager.set_node_cleaning_steps(task) - self._notify_conductor_resume_clean(task) - # TODO(lucasagomes): CLEANING here for backwards compat - # with previous code, otherwise nodes in CLEANING when this - # is deployed would fail. Should be removed once the Mitaka - # release starts. - elif (node.provision_state in (states.CLEANWAIT, states.CLEANING) - and node.clean_step): - self.continue_cleaning(task, **kwargs) + elif node.provision_state in (states.CLEANWAIT, states.CLEANING): + node.touch_provisioning() + if not node.clean_step: + LOG.debug('Node %s just booted to start cleaning.', + node.uuid) + manager.set_node_cleaning_steps(task) + self._notify_conductor_resume_clean(task) + else: + self.continue_cleaning(task, **kwargs) except Exception as e: err_info = {'node': node.uuid, 'msg': msg, 'e': e} diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py index 06f1a63302..47898d6aa3 100644 --- a/ironic/tests/conductor/test_manager.py +++ b/ironic/tests/conductor/test_manager.py @@ -1321,6 +1321,22 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin, self.assertIsNotNone(node.last_error) mock_cleanup.assert_called_once_with(mock.ANY) + def test__check_cleanwait_timeouts(self): + self._start_service() + CONF.set_override('clean_callback_timeout', 1, group='conductor') + node = obj_utils.create_test_node( + self.context, driver='fake', + provision_state=states.CLEANWAIT, + target_provision_state=states.AVAILABLE, + provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0)) + + self.service._check_cleanwait_timeouts(self.context) + self.service._worker_pool.waitall() + node.refresh() + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(states.AVAILABLE, 
node.target_provision_state) + self.assertIsNotNone(node.last_error) + def test_do_node_tear_down_invalid_state(self): self._start_service() # test node.provision_state is incorrect for tear_down diff --git a/ironic/tests/drivers/test_agent_base_vendor.py b/ironic/tests/drivers/test_agent_base_vendor.py index 6c64ec7d71..75873dfd3c 100644 --- a/ironic/tests/drivers/test_agent_base_vendor.py +++ b/ironic/tests/drivers/test_agent_base_vendor.py @@ -322,10 +322,12 @@ class TestBaseAgentVendor(db_base.DbTestCase): '1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy ' 'is done. exception: LlamaException') + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) @mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True) @mock.patch.object(agent_base_vendor.BaseAgentVendor, '_notify_conductor_resume_clean', autospec=True) - def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps): + def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps, + mock_touch): kwargs = { 'agent_url': 'http://127.0.0.1:9999/bar' } @@ -337,14 +339,18 @@ class TestBaseAgentVendor(db_base.DbTestCase): self.context, self.node.uuid, shared=True) as task: self.passthru.heartbeat(task, **kwargs) + mock_touch.assert_called_once_with(mock.ANY) mock_notify.assert_called_once_with(mock.ANY, task) mock_set_steps.assert_called_once_with(task) + # Reset mocks for the next interaction + mock_touch.reset_mock() mock_notify.reset_mock() mock_set_steps.reset_mock() + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) @mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_cleaning', autospec=True) - def test_heartbeat_continue_cleaning(self, mock_continue): + def test_heartbeat_continue_cleaning(self, mock_continue, mock_touch): kwargs = { 'agent_url': 'http://127.0.0.1:9999/bar' } @@ -361,7 +367,10 @@ class TestBaseAgentVendor(db_base.DbTestCase): self.context, self.node.uuid, shared=True) as task: 
self.passthru.heartbeat(task, **kwargs) + mock_touch.assert_called_once_with(mock.ANY) mock_continue.assert_called_once_with(mock.ANY, task, **kwargs) + # Reset mocks for the next interaction + mock_touch.reset_mock() mock_continue.reset_mock() @mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy',