Merge "Periodically checks for nodes being cleaned"

This commit is contained in:
Jenkins 2015-08-29 03:15:18 +00:00 committed by Gerrit Code Review
commit 55283a9cfa
5 changed files with 113 additions and 55 deletions

View File

@ -1,5 +1,33 @@
[DEFAULT]
#
# Options defined in oslo.service.periodic_task
#
# Some periodic tasks can be run in a separate process. Should
# we run them here? (boolean value)
#run_external_periodic_tasks=true
#
# Options defined in oslo.service.service
#
# Enable eventlet backdoor. Acceptable values are 0, <port>,
# and <start>:<end>, where 0 results in listening on a random
# tcp port number; <port> results in listening on the
# specified port number (and not enabling backdoor if that
# port is in use); and <start>:<end> results in listening on
# the smallest unused port number within the specified range
# of port numbers. The chosen port is displayed in the
# service's log file. (string value)
#backdoor_port=<None>
# Enables or disables logging values of all registered options
# when starting a service (at DEBUG level). (boolean value)
#log_options=true
#
# Options defined in oslo.messaging
#
@ -79,12 +107,12 @@
#
# Print debugging output (set logging level to DEBUG instead
# of default WARNING level). (boolean value)
# of default INFO level). (boolean value)
#debug=false
# Print more verbose output (set logging level to INFO instead
# of default WARNING level). (boolean value)
#verbose=false
# If set to false, will disable INFO logging level, making
# WARNING the default. (boolean value)
#verbose=true
# The name of a logging configuration file. This file is
# appended to any existing logging configuration files. For
@ -121,8 +149,8 @@
# (Optional) Enables or disables syslog rfc5424 format for
# logging. If enabled, prefixes the MSG part of the syslog
# message with APP-NAME (RFC5424). The format without the APP-
# NAME is deprecated in K, and will be removed in M, along
# with this option. (boolean value)
# NAME is deprecated in Kilo, and will be removed in Mitaka,
# along with this option. (boolean value)
#use_syslog_rfc_format=true
# Syslog facility to receive log lines. (string value)
@ -167,34 +195,6 @@
#fatal_deprecations=false
#
# Options defined in oslo.service.service
#
# Enable eventlet backdoor. Acceptable values are 0, <port>,
# and <start>:<end>, where 0 results in listening on a random
# tcp port number; <port> results in listening on the
# specified port number (and not enabling backdoor if that
# port is in use); and <start>:<end> results in listening on
# the smallest unused port number within the specified range
# of port numbers. The chosen port is displayed in the
# service's log file. (string value)
#backdoor_port=<None>
# Enables or disables logging values of all registered options
# when starting a service (at DEBUG level). (boolean value)
#log_options=true
#
# Options defined in oslo.service.periodic_task
#
# Some periodic tasks can be run in a separate process. Should
# we run them here? (boolean value)
#run_external_periodic_tasks=true
#
# Options defined in ironic.netconf
#
@ -561,6 +561,12 @@
# option could be safely disabled. (boolean value)
#clean_nodes=true
# Timeout (seconds) to wait for a callback from the ramdisk
# doing the cleaning. If the timeout is reached the node will
# be put in the "clean failed" provision state. Set to 0 to
# disable timeout. (integer value)
#clean_callback_timeout=1800
[console]
@ -1370,12 +1376,12 @@
# the same time. This option provides such compatibility - it
# defaults to False in Liberty and can be turned on for early
# adopters with a new installations or for testing. Please
# note, that this option will be removed in M release.
# (boolean value)
# note, that this option will be removed in the Mitaka
# release. (boolean value)
#send_single_reply=false
# Qpid broker hostname. (string value)
#qpid_hostname=ironic
#qpid_hostname=localhost
# Qpid broker port. (integer value)
#qpid_port=5672
@ -1438,8 +1444,8 @@
# the same time. This option provides such compatibility - it
# defaults to False in Liberty and can be turned on for early
# adopters with a new installations or for testing. Please
# note, that this option will be removed in M release.
# (boolean value)
# note, that this option will be removed in the Mitaka
# release. (boolean value)
#send_single_reply=false
# SSL version to use (valid only if SSL enabled). Valid values
@ -1468,7 +1474,7 @@
# The RabbitMQ broker address where a single node is used.
# (string value)
#rabbit_host=ironic
#rabbit_host=localhost
# The RabbitMQ broker port where a single node is used.
# (integer value)

View File

@ -184,6 +184,12 @@ conductor_opts = [
'longer. In an environment where all tenants are '
'trusted (eg, because there is only one tenant), '
'this option could be safely disabled.')),
cfg.IntOpt('clean_callback_timeout',
default=1800,
help=_('Timeout (seconds) to wait for a callback from the '
'ramdisk doing the cleaning. If the timeout is reached '
'the node will be put in the "clean failed" provision '
'state. Set to 0 to disable timeout.')),
]
CONF = cfg.CONF
CONF.register_opts(conductor_opts, 'conductor')
@ -1253,6 +1259,31 @@ class ConductorManager(periodic_task.PeriodicTasks):
task.node.conductor_affinity = self.conductor.id
task.node.save()
@periodic_task.periodic_task(
spacing=CONF.conductor.check_provision_state_interval)
def _check_cleanwait_timeouts(self, context):
"""Periodically checks for nodes being cleaned.
If a node doing cleaning is unresponsive (detected when it stops
heart beating), the operation should be aborted.
:param context: request context.
"""
callback_timeout = CONF.conductor.clean_callback_timeout
if not callback_timeout:
return
filters = {'reserved': False,
'provision_state': states.CLEANWAIT,
'maintenance': False,
'provisioned_before': callback_timeout}
last_error = _("Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node.")
self._fail_if_in_state(context, filters, states.CLEANWAIT,
'provision_updated_at',
last_error=last_error)
@periodic_task.periodic_task(
spacing=CONF.conductor.sync_local_state_interval)
def _sync_local_state(self, context):

View File

@ -267,19 +267,15 @@ class BaseAgentVendor(base.VendorInterface):
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the Mitaka
# release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
and not node.clean_step):
# Agent booted from prepare_cleaning
LOG.debug('Node %s just booted to start cleaning.', node.uuid)
manager.set_node_cleaning_steps(task)
self._notify_conductor_resume_clean(task)
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the Mitaka
# release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
and node.clean_step):
self.continue_cleaning(task, **kwargs)
elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
node.touch_provisioning()
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
manager.set_node_cleaning_steps(task)
self._notify_conductor_resume_clean(task)
else:
self.continue_cleaning(task, **kwargs)
except Exception as e:
err_info = {'node': node.uuid, 'msg': msg, 'e': e}

View File

@ -1321,6 +1321,22 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
self.assertIsNotNone(node.last_error)
mock_cleanup.assert_called_once_with(mock.ANY)
def test__check_cleanwait_timeouts(self):
self._start_service()
CONF.set_override('clean_callback_timeout', 1, group='conductor')
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANWAIT,
target_provision_state=states.AVAILABLE,
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
self.service._check_cleanwait_timeouts(self.context)
self.service._worker_pool.waitall()
node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(states.AVAILABLE, node.target_provision_state)
self.assertIsNotNone(node.last_error)
def test_do_node_tear_down_invalid_state(self):
self._start_service()
# test node.provision_state is incorrect for tear_down

View File

@ -322,10 +322,12 @@ class TestBaseAgentVendor(db_base.DbTestCase):
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
'is done. exception: LlamaException')
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'_notify_conductor_resume_clean', autospec=True)
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps):
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps,
mock_touch):
kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar'
}
@ -337,14 +339,18 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs)
mock_touch.assert_called_once_with(mock.ANY)
mock_notify.assert_called_once_with(mock.ANY, task)
mock_set_steps.assert_called_once_with(task)
# Reset mocks for the next interaction
mock_touch.reset_mock()
mock_notify.reset_mock()
mock_set_steps.reset_mock()
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'continue_cleaning', autospec=True)
def test_heartbeat_continue_cleaning(self, mock_continue):
def test_heartbeat_continue_cleaning(self, mock_continue, mock_touch):
kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar'
}
@ -361,7 +367,10 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs)
mock_touch.assert_called_once_with(mock.ANY)
mock_continue.assert_called_once_with(mock.ANY, task, **kwargs)
# Reset mocks for the next interaction
mock_touch.reset_mock()
mock_continue.reset_mock()
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy',