diff --git a/ironic/common/faults.py b/ironic/common/faults.py new file mode 100644 index 0000000000..dd7ab955c7 --- /dev/null +++ b/ironic/common/faults.py @@ -0,0 +1,25 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +"""Fault definitions.""" + +POWER_FAILURE = 'power failure' +""" Node is moved to maintenance due to power synchronization failure. """ + +CLEAN_FAILURE = 'clean failure' +""" Node is moved to maintenance due to failure of a cleaning + operation. """ + +RESCUE_ABORT_FAILURE = 'rescue abort failure' +""" Node is moved to maintenance due to failure of cleaning up during + rescue abort. """ diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 4118f39cd0..66750fe215 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -59,6 +59,7 @@ from six.moves import queue from ironic.common import driver_factory from ironic.common import exception +from ironic.common import faults from ironic.common.glance_service import service_utils as glance_utils from ironic.common.i18n import _ from ironic.common import images @@ -164,9 +165,11 @@ class ConductorManager(base_manager.BaseConductorManager): # NOTE(jroll) clear maintenance_reason if node.update sets # maintenance to False for backwards compatibility, for tools # not using the maintenance endpoint. + # NOTE(kaifeng) also clear fault when out of maintenance. delta = node_obj.obj_what_changed() if 'maintenance' in delta and not node_obj.maintenance: node_obj.maintenance_reason = None + node_obj.fault = None # TODO(dtantsur): reconsider allowing changing some (but not all) # interfaces for active nodes in the future. @@ -744,6 +747,7 @@ class ConductorManager(base_manager.BaseConductorManager): node.last_error = error_msg node.maintenance = True node.maintenance_reason = error_msg + node.fault = faults.RESCUE_ABORT_FAILURE node.save() return @@ -1564,6 +1568,96 @@ class ConductorManager(base_manager.BaseConductorManager): # Yield on every iteration eventlet.sleep(0) + @METRICS.timer('ConductorManager._power_failure_recovery') + @periodics.periodic(spacing=CONF.conductor.power_failure_recovery_interval, + enabled=bool( + CONF.conductor.power_failure_recovery_interval)) + def _power_failure_recovery(self, context): + """Periodic task to check power states for nodes in maintenance. + + Attempt to grab a lock and sync only if the following + conditions are met: + + 1) Node is mapped to this conductor. + 2) Node is in maintenance with maintenance type of power failure. + 3) Node is not reserved. + 4) Node is not in the ENROLL state. + """ + def should_sync_power_state_for_recovery(task): + """Check if ironic should sync power state for recovery.""" + + # NOTE(dtantsur): it's also pointless (and dangerous) to + # sync power state when a power action is in progress + if (task.node.provision_state == states.ENROLL or + not task.node.maintenance or + task.node.fault != faults.POWER_FAILURE or + task.node.target_power_state or + task.node.reservation): + return False + return True + + def handle_recovery(task, actual_power_state): + """Handle recovery when power sync is succeeded.""" + task.upgrade_lock() + node = task.node + # Update power state + old_power_state = node.power_state + node.power_state = actual_power_state + # Clear maintenance related fields + node.maintenance = False + node.maintenance_reason = None + node.fault = None + node.save() + LOG.info("Node %(node)s is recovered from power failure " + "with actual power state '%(state)s'.", + {'node': node.uuid, 'state': actual_power_state}) + if old_power_state != actual_power_state: + notify_utils.emit_power_state_corrected_notification( + task, old_power_state) + + # NOTE(kaifeng) To avoid conflicts with periodic task of the + # regular power state checking, maintenance is still a required + # condition. + filters = {'maintenance': True, + 'fault': faults.POWER_FAILURE} + node_iter = self.iter_nodes(fields=['id'], filters=filters) + for (node_uuid, driver, node_id) in node_iter: + try: + with task_manager.acquire(context, node_uuid, + purpose='power failure recovery', + shared=True) as task: + if not should_sync_power_state_for_recovery(task): + continue + try: + # Validate driver info in case of parameter changed + # in maintenance. + task.driver.power.validate(task) + # The driver may raise an exception, or may return + # ERROR. Handle both the same way. + power_state = task.driver.power.get_power_state(task) + if power_state == states.ERROR: + raise exception.PowerStateFailure( + _("Power driver returned ERROR state " + "while trying to get power state.")) + except Exception as e: + LOG.debug("During power_failure_recovery, could " + "not get power state for node %(node)s, " + "Error: %(err)s.", + {'node': task.node.uuid, 'err': e}) + else: + handle_recovery(task, power_state) + except exception.NodeNotFound: + LOG.info("During power_failure_recovery, node %(node)s was " + "not found and presumed deleted by another process.", + {'node': node_uuid}) + except exception.NodeLocked: + LOG.info("During power_failure_recovery, node %(node)s was " + "already locked by another process. Skip.", + {'node': node_uuid}) + finally: + # Yield on every iteration + eventlet.sleep(0) + @METRICS.timer('ConductorManager._check_deploy_timeouts') @periodics.periodic(spacing=CONF.conductor.check_provision_state_interval) def _check_deploy_timeouts(self, context): @@ -3391,6 +3485,7 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state, node.last_error = msg node.maintenance = True node.maintenance_reason = msg + node.fault = faults.POWER_FAILURE node.save() if old_power_state != actual_power_state: notify_utils.emit_power_state_corrected_notification( diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index de7c34ad14..36c9b8300f 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -19,6 +19,7 @@ from oslo_utils import excutils import six from ironic.common import exception +from ironic.common import faults from ironic.common.i18n import _ from ironic.common import network from ironic.common import states @@ -375,6 +376,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, node.last_error = msg node.maintenance = True node.maintenance_reason = msg + node.fault = faults.CLEAN_FAILURE node.save() if set_fail_state: diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 1d1175c31c..34a41f436e 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -169,6 +169,14 @@ opts = [ 'complete, i.e., so that a baremetal node is in the ' 'desired power state. If timed out, the power operation ' 'is considered a failure.')), + cfg.IntOpt('power_failure_recovery_interval', + min=0, default=300, + help=_('Interval (in seconds) between checking the power ' + 'state for nodes previously put into maintenance mode ' + 'due to power synchronization failure. A node is ' + 'automatically moved out of maintenance mode once its ' + 'power state is retrieved successfully. Set to 0 to ' + 'disable this check.')), ] diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 64ef8906a7..adc903639e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -477,9 +477,11 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): res = self.service.update_node(self.context, node) self.assertEqual({'test': 'two'}, res['extra']) - def test_update_node_clears_maintenance_reason(self): - node = obj_utils.create_test_node(self.context, driver='fake-hardware', + def test_update_node_maintenance_set_false(self): + node = obj_utils.create_test_node(self.context, + driver='fake-hardware', maintenance=True, + fault='clean failure', maintenance_reason='reason') # check that ManagerService.update_node actually updates the node @@ -487,6 +489,7 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): res = self.service.update_node(self.context, node) self.assertFalse(res['maintenance']) self.assertIsNone(res['maintenance_reason']) + self.assertIsNone(res['fault']) def test_update_node_already_locked(self): node = obj_utils.create_test_node(self.context, driver='fake-hardware', @@ -2110,6 +2113,7 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin, self.assertIsNotNone(task.node.last_error) self.assertIsNotNone(task.node.maintenance_reason) self.assertTrue(task.node.maintenance) + self.assertEqual('clean failure', task.node.fault) @mgr_utils.mock_record_keepalive @@ -3356,6 +3360,8 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, self.assertIsNotNone(task.node.last_error) self.assertIsNotNone(task.node.maintenance_reason) self.assertTrue(task.node.maintenance) + self.assertEqual('rescue abort failure', + task.node.fault) @mgr_utils.mock_record_keepalive @@ -5157,6 +5163,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase): self.service.power_state_sync_count[self.node.uuid]) self.assertTrue(self.node.maintenance) self.assertIsNotNone(self.node.maintenance_reason) + self.assertEqual('power failure', self.node.fault) def test_max_retries_exceeded2(self, node_power_action): self.config(force_power_state_during_sync=True, group='conductor') @@ -5176,6 +5183,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase): self.assertEqual(3, self.service.power_state_sync_count[self.node.uuid]) self.assertTrue(self.node.maintenance) + self.assertEqual('power failure', self.node.fault) @mock.patch('ironic.objects.node.NodeCorrectedPowerStateNotification') def test_max_retries_exceeded_notify(self, mock_notif, node_power_action): @@ -5505,6 +5513,158 @@ class ManagerSyncPowerStatesTestCase(mgr_utils.CommonMixIn, self.assertEqual(sync_calls, sync_mock.call_args_list) +@mock.patch.object(task_manager, 'acquire') +@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor') +@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list') +class ManagerPowerRecoveryTestCase(mgr_utils.CommonMixIn, + db_base.DbTestCase): + def setUp(self): + super(ManagerPowerRecoveryTestCase, self).setUp() + self.service = manager.ConductorManager('hostname', 'test-topic') + self.service.dbapi = self.dbapi + self.driver = mock.Mock(spec_set=drivers_base.BaseDriver) + self.power = self.driver.power + self.task = mock.Mock(spec_set=['context', 'driver', 'node', + 'upgrade_lock', 'shared']) + self.node = self._create_node(maintenance=True, + fault='power failure', + maintenance_reason='Unreachable BMC') + self.task.node = self.node + self.task.driver = self.driver + self.filters = {'maintenance': True, + 'fault': 'power failure'} + self.columns = ['uuid', 'driver', 'id'] + + def test_node_not_mapped(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = False + + self.service._power_failure_recovery(self.context) + + get_nodeinfo_mock.assert_called_once_with( + columns=self.columns, filters=self.filters) + mapped_mock.assert_called_once_with(self.node.uuid, + self.node.driver) + self.assertFalse(acquire_mock.called) + self.assertFalse(self.power.validate.called) + + def _power_failure_recovery(self, node_dict, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + + task = self._create_task(node_attrs=node_dict) + acquire_mock.side_effect = self._get_acquire_side_effect(task) + + self.service._power_failure_recovery(self.context) + + get_nodeinfo_mock.assert_called_once_with( + columns=self.columns, filters=self.filters) + mapped_mock.assert_called_once_with(self.node.uuid, + self.node.driver) + acquire_mock.assert_called_once_with(self.context, self.node.uuid, + purpose=mock.ANY, + shared=True) + self.assertFalse(self.power.validate.called) + + def test_node_locked_on_acquire(self, get_nodeinfo_mock, mapped_mock, + acquire_mock): + node_dict = dict(reservation='host1', uuid=self.node.uuid) + self._power_failure_recovery(node_dict, get_nodeinfo_mock, + mapped_mock, acquire_mock) + + def test_node_in_enroll_on_acquire(self, get_nodeinfo_mock, mapped_mock, + acquire_mock): + node_dict = dict(provision_state=states.ENROLL, + target_provision_state=states.NOSTATE, + maintenance=True, uuid=self.node.uuid) + self._power_failure_recovery(node_dict, get_nodeinfo_mock, + mapped_mock, acquire_mock) + + def test_node_in_power_transition_on_acquire(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + node_dict = dict(target_power_state=states.POWER_ON, + maintenance=True, uuid=self.node.uuid) + self._power_failure_recovery(node_dict, get_nodeinfo_mock, + mapped_mock, acquire_mock) + + def test_node_not_in_maintenance_on_acquire(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + node_dict = dict(maintenance=False, uuid=self.node.uuid) + self._power_failure_recovery(node_dict, get_nodeinfo_mock, + mapped_mock, acquire_mock) + + def test_node_disappears_on_acquire(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = exception.NodeNotFound(node=self.node.uuid, + host='fake') + + self.service._power_failure_recovery(self.context) + + get_nodeinfo_mock.assert_called_once_with( + columns=self.columns, filters=self.filters) + mapped_mock.assert_called_once_with(self.node.uuid, + self.node.driver) + acquire_mock.assert_called_once_with(self.context, self.node.uuid, + purpose=mock.ANY, + shared=True) + self.assertFalse(self.power.validate.called) + + @mock.patch.object(notification_utils, + 'emit_power_state_corrected_notification') + def test_node_recovery_success(self, notify_mock, get_nodeinfo_mock, + mapped_mock, acquire_mock): + self.node.power_state = states.POWER_ON + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect(self.task) + self.power.get_power_state.return_value = states.POWER_OFF + + self.service._power_failure_recovery(self.context) + + get_nodeinfo_mock.assert_called_once_with( + columns=self.columns, filters=self.filters) + mapped_mock.assert_called_once_with(self.node.uuid, + self.node.driver) + acquire_mock.assert_called_once_with(self.context, self.node.uuid, + purpose=mock.ANY, + shared=True) + self.power.validate.assert_called_once_with(self.task) + self.power.get_power_state.assert_called_once_with(self.task) + self.task.upgrade_lock.assert_called_once_with() + self.assertFalse(self.node.maintenance) + self.assertIsNone(self.node.fault) + self.assertIsNone(self.node.maintenance_reason) + self.assertEqual(states.POWER_OFF, self.node.power_state) + notify_mock.assert_called_once_with(self.task, states.POWER_ON) + + def test_node_recovery_failed(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect(self.task) + self.power.get_power_state.return_value = states.ERROR + + self.service._power_failure_recovery(self.context) + + get_nodeinfo_mock.assert_called_once_with( + columns=self.columns, filters=self.filters) + mapped_mock.assert_called_once_with(self.node.uuid, + self.node.driver) + acquire_mock.assert_called_once_with(self.context, self.node.uuid, + purpose=mock.ANY, + shared=True) + self.power.validate.assert_called_once_with(self.task) + self.power.get_power_state.assert_called_once_with(self.task) + self.assertFalse(self.task.upgrade_lock.called) + self.assertTrue(self.node.maintenance) + self.assertEqual('power failure', self.node.fault) + self.assertEqual('Unreachable BMC', self.node.maintenance_reason) + + @mock.patch.object(task_manager, 'acquire') @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor') @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list') diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index c4175d823c..13ca1f6df8 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -1182,6 +1182,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): target_state=None) self.assertTrue(self.node.maintenance) self.assertEqual(clean_error, self.node.maintenance_reason) + self.assertEqual('clean failure', self.node.fault) def test_cleaning_error_handler(self): self.node.provision_state = states.CLEANING @@ -1196,6 +1197,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertEqual(msg, self.node.last_error) self.assertTrue(self.node.maintenance) self.assertEqual(msg, self.node.maintenance_reason) + self.assertEqual('clean failure', self.node.fault) driver = self.task.driver.deploy driver.tear_down_cleaning.assert_called_once_with(self.task) self.task.process_event.assert_called_once_with('fail', @@ -1238,6 +1240,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertTrue(log_mock.exception.called) self.assertIn(msg, self.node.last_error) self.assertIn(msg, self.node.maintenance_reason) + self.assertEqual('clean failure', self.node.fault) def test_abort_on_conductor_take_over_cleaning(self): self.node.maintenance = False @@ -1246,6 +1249,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertTrue(self.node.maintenance) self.assertIn('take over', self.node.maintenance_reason) self.assertIn('take over', self.node.last_error) + self.assertEqual('clean failure', self.node.fault) self.task.driver.deploy.tear_down_cleaning.assert_called_once_with( self.task) self.node.save.assert_called_once_with() diff --git a/releasenotes/notes/power-fault-recovery-6e22f0114ceee203.yaml b/releasenotes/notes/power-fault-recovery-6e22f0114ceee203.yaml new file mode 100644 index 0000000000..a7184d7766 --- /dev/null +++ b/releasenotes/notes/power-fault-recovery-6e22f0114ceee203.yaml @@ -0,0 +1,18 @@ +--- +features: + - | + Adds power failure recovery to ironic. For nodes that ironic had put into + maintenance mode due to power failure, ironic periodically checks their + power state, and moves them out of maintenance mode when power state can + be retrieved. The interval of this check is configured via + ``[conductor]power_failure_recovery_interval`` configuration option, the + default value is 300 (seconds). Set to 0 to disable this behavior. +upgrade: + - | + Power failure recovery introduces a new configuration option + ``[conductor]power_failure_recovery_interval``, which is enabled and set + to 300 seconds by default. In case the default value is not suitable for + the needs or scale of a deployment, please make adjustment or turn it off + during upgrade. The power failure recovery does not apply to nodes that + were in maintenance mode due to power failure before upgrade, they have + to be manually moved out of maintenance mode. \ No newline at end of file