Power fault recovery: apply fault
This patch implements setting and using the fault field. For each case where maintenance is currently set to True, the fault is now set accordingly. A periodic task is added that checks the power state of nodes in maintenance due to power failure; maintenance is cleared once the power state of a node can be retrieved again. When a node is taken out of maintenance by a user, the fault (if any) is cleared.

Story: #1596107
Task: #10469

Change-Id: Ic4ab20af9022a2d06bdac567e7a098f3ba08570a
Partial-Bug: #1596107
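As context for the diff below, here is a standalone sketch of the fault lifecycle the commit message describes. The class and helper names are illustrative only (they are not ironic's API); the real fault constants and conductor logic are in the changed files shown further down.

# Standalone sketch (not ironic's actual API): how a fault is recorded when
# ironic forces a node into maintenance, and how it is cleared again when a
# user turns maintenance off.

POWER_FAILURE = 'power failure'


class FakeNode(object):
    """Illustrative stand-in for an ironic node object."""

    def __init__(self):
        self.maintenance = False
        self.maintenance_reason = None
        self.fault = None


def enter_maintenance(node, reason, fault):
    # Ironic-initiated maintenance records both a human-readable reason
    # and a machine-readable fault.
    node.maintenance = True
    node.maintenance_reason = reason
    node.fault = fault


def leave_maintenance(node):
    # Taking the node out of maintenance clears the reason and the fault.
    node.maintenance = False
    node.maintenance_reason = None
    node.fault = None


node = FakeNode()
enter_maintenance(node, 'power sync failed', POWER_FAILURE)
assert node.fault == POWER_FAILURE
leave_maintenance(node)
assert node.maintenance_reason is None and node.fault is None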
parent 96aeaa8d9b
commit 0a1b165ba5
25 ironic/common/faults.py Normal file
@@ -0,0 +1,25 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


"""Fault definitions."""

POWER_FAILURE = 'power failure'
""" Node is moved to maintenance due to power synchronization failure. """

CLEAN_FAILURE = 'clean failure'
""" Node is moved to maintenance due to failure of a cleaning
operation. """

RESCUE_ABORT_FAILURE = 'rescue abort failure'
""" Node is moved to maintenance due to failure of cleaning up during
rescue abort. """
@@ -59,6 +59,7 @@ from six.moves import queue

from ironic.common import driver_factory
from ironic.common import exception
from ironic.common import faults
from ironic.common.glance_service import service_utils as glance_utils
from ironic.common.i18n import _
from ironic.common import images
@@ -164,9 +165,11 @@ class ConductorManager(base_manager.BaseConductorManager):
        # NOTE(jroll) clear maintenance_reason if node.update sets
        # maintenance to False for backwards compatibility, for tools
        # not using the maintenance endpoint.
        # NOTE(kaifeng) also clear fault when out of maintenance.
        delta = node_obj.obj_what_changed()
        if 'maintenance' in delta and not node_obj.maintenance:
            node_obj.maintenance_reason = None
            node_obj.fault = None

        # TODO(dtantsur): reconsider allowing changing some (but not all)
        # interfaces for active nodes in the future.
@@ -744,6 +747,7 @@ class ConductorManager(base_manager.BaseConductorManager):
            node.last_error = error_msg
            node.maintenance = True
            node.maintenance_reason = error_msg
            node.fault = faults.RESCUE_ABORT_FAILURE
            node.save()
            return

@@ -1564,6 +1568,96 @@ class ConductorManager(base_manager.BaseConductorManager):
                # Yield on every iteration
                eventlet.sleep(0)

    @METRICS.timer('ConductorManager._power_failure_recovery')
    @periodics.periodic(spacing=CONF.conductor.power_failure_recovery_interval,
                        enabled=bool(
                            CONF.conductor.power_failure_recovery_interval))
    def _power_failure_recovery(self, context):
        """Periodic task to check power states for nodes in maintenance.

        Attempt to grab a lock and sync only if the following
        conditions are met:

        1) Node is mapped to this conductor.
        2) Node is in maintenance with maintenance type of power failure.
        3) Node is not reserved.
        4) Node is not in the ENROLL state.
        """
        def should_sync_power_state_for_recovery(task):
            """Check if ironic should sync power state for recovery."""

            # NOTE(dtantsur): it's also pointless (and dangerous) to
            # sync power state when a power action is in progress
            if (task.node.provision_state == states.ENROLL or
                    not task.node.maintenance or
                    task.node.fault != faults.POWER_FAILURE or
                    task.node.target_power_state or
                    task.node.reservation):
                return False
            return True

        def handle_recovery(task, actual_power_state):
            """Handle recovery when power sync is succeeded."""
            task.upgrade_lock()
            node = task.node
            # Update power state
            old_power_state = node.power_state
            node.power_state = actual_power_state
            # Clear maintenance related fields
            node.maintenance = False
            node.maintenance_reason = None
            node.fault = None
            node.save()
            LOG.info("Node %(node)s is recovered from power failure "
                     "with actual power state '%(state)s'.",
                     {'node': node.uuid, 'state': actual_power_state})
            if old_power_state != actual_power_state:
                notify_utils.emit_power_state_corrected_notification(
                    task, old_power_state)

        # NOTE(kaifeng) To avoid conflicts with periodic task of the
        # regular power state checking, maintenance is still a required
        # condition.
        filters = {'maintenance': True,
                   'fault': faults.POWER_FAILURE}
        node_iter = self.iter_nodes(fields=['id'], filters=filters)
        for (node_uuid, driver, node_id) in node_iter:
            try:
                with task_manager.acquire(context, node_uuid,
                                          purpose='power failure recovery',
                                          shared=True) as task:
                    if not should_sync_power_state_for_recovery(task):
                        continue
                    try:
                        # Validate driver info in case of parameter changed
                        # in maintenance.
                        task.driver.power.validate(task)
                        # The driver may raise an exception, or may return
                        # ERROR. Handle both the same way.
                        power_state = task.driver.power.get_power_state(task)
                        if power_state == states.ERROR:
                            raise exception.PowerStateFailure(
                                _("Power driver returned ERROR state "
                                  "while trying to get power state."))
                    except Exception as e:
                        LOG.debug("During power_failure_recovery, could "
                                  "not get power state for node %(node)s, "
                                  "Error: %(err)s.",
                                  {'node': task.node.uuid, 'err': e})
                    else:
                        handle_recovery(task, power_state)
            except exception.NodeNotFound:
                LOG.info("During power_failure_recovery, node %(node)s was "
                         "not found and presumed deleted by another process.",
                         {'node': node_uuid})
            except exception.NodeLocked:
                LOG.info("During power_failure_recovery, node %(node)s was "
                         "already locked by another process. Skip.",
                         {'node': node_uuid})
            finally:
                # Yield on every iteration
                eventlet.sleep(0)

    @METRICS.timer('ConductorManager._check_deploy_timeouts')
    @periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
    def _check_deploy_timeouts(self, context):
@@ -3391,6 +3485,7 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,
    node.last_error = msg
    node.maintenance = True
    node.maintenance_reason = msg
    node.fault = faults.POWER_FAILURE
    node.save()
    if old_power_state != actual_power_state:
        notify_utils.emit_power_state_corrected_notification(
@@ -19,6 +19,7 @@ from oslo_utils import excutils
import six

from ironic.common import exception
from ironic.common import faults
from ironic.common.i18n import _
from ironic.common import network
from ironic.common import states
@@ -375,6 +376,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
    node.last_error = msg
    node.maintenance = True
    node.maintenance_reason = msg
    node.fault = faults.CLEAN_FAILURE
    node.save()

    if set_fail_state:
@@ -169,6 +169,14 @@ opts = [
                      'complete, i.e., so that a baremetal node is in the '
                      'desired power state. If timed out, the power operation '
                      'is considered a failure.')),
    cfg.IntOpt('power_failure_recovery_interval',
               min=0, default=300,
               help=_('Interval (in seconds) between checking the power '
                      'state for nodes previously put into maintenance mode '
                      'due to power synchronization failure. A node is '
                      'automatically moved out of maintenance mode once its '
                      'power state is retrieved successfully. Set to 0 to '
                      'disable this check.')),
]

@@ -477,9 +477,11 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
        res = self.service.update_node(self.context, node)
        self.assertEqual({'test': 'two'}, res['extra'])

    def test_update_node_clears_maintenance_reason(self):
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
    def test_update_node_maintenance_set_false(self):
        node = obj_utils.create_test_node(self.context,
                                          driver='fake-hardware',
                                          maintenance=True,
                                          fault='clean failure',
                                          maintenance_reason='reason')

        # check that ManagerService.update_node actually updates the node
@@ -487,6 +489,7 @@ class UpdateNodeTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
        res = self.service.update_node(self.context, node)
        self.assertFalse(res['maintenance'])
        self.assertIsNone(res['maintenance_reason'])
        self.assertIsNone(res['fault'])

    def test_update_node_already_locked(self):
        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
@@ -2110,6 +2113,7 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
        self.assertIsNotNone(task.node.last_error)
        self.assertIsNotNone(task.node.maintenance_reason)
        self.assertTrue(task.node.maintenance)
        self.assertEqual('clean failure', task.node.fault)


@mgr_utils.mock_record_keepalive
@@ -3356,6 +3360,8 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
        self.assertIsNotNone(task.node.last_error)
        self.assertIsNotNone(task.node.maintenance_reason)
        self.assertTrue(task.node.maintenance)
        self.assertEqual('rescue abort failure',
                         task.node.fault)


@mgr_utils.mock_record_keepalive
@@ -5157,6 +5163,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase):
                         self.service.power_state_sync_count[self.node.uuid])
        self.assertTrue(self.node.maintenance)
        self.assertIsNotNone(self.node.maintenance_reason)
        self.assertEqual('power failure', self.node.fault)

    def test_max_retries_exceeded2(self, node_power_action):
        self.config(force_power_state_during_sync=True, group='conductor')
@@ -5176,6 +5183,7 @@ class ManagerDoSyncPowerStateTestCase(db_base.DbTestCase):
        self.assertEqual(3,
                         self.service.power_state_sync_count[self.node.uuid])
        self.assertTrue(self.node.maintenance)
        self.assertEqual('power failure', self.node.fault)

    @mock.patch('ironic.objects.node.NodeCorrectedPowerStateNotification')
    def test_max_retries_exceeded_notify(self, mock_notif, node_power_action):
@@ -5505,6 +5513,158 @@ class ManagerSyncPowerStatesTestCase(mgr_utils.CommonMixIn,
        self.assertEqual(sync_calls, sync_mock.call_args_list)


@mock.patch.object(task_manager, 'acquire')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
class ManagerPowerRecoveryTestCase(mgr_utils.CommonMixIn,
                                   db_base.DbTestCase):
    def setUp(self):
        super(ManagerPowerRecoveryTestCase, self).setUp()
        self.service = manager.ConductorManager('hostname', 'test-topic')
        self.service.dbapi = self.dbapi
        self.driver = mock.Mock(spec_set=drivers_base.BaseDriver)
        self.power = self.driver.power
        self.task = mock.Mock(spec_set=['context', 'driver', 'node',
                                        'upgrade_lock', 'shared'])
        self.node = self._create_node(maintenance=True,
                                      fault='power failure',
                                      maintenance_reason='Unreachable BMC')
        self.task.node = self.node
        self.task.driver = self.driver
        self.filters = {'maintenance': True,
                        'fault': 'power failure'}
        self.columns = ['uuid', 'driver', 'id']

    def test_node_not_mapped(self, get_nodeinfo_mock,
                             mapped_mock, acquire_mock):
        get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
        mapped_mock.return_value = False

        self.service._power_failure_recovery(self.context)

        get_nodeinfo_mock.assert_called_once_with(
            columns=self.columns, filters=self.filters)
        mapped_mock.assert_called_once_with(self.node.uuid,
                                            self.node.driver)
        self.assertFalse(acquire_mock.called)
        self.assertFalse(self.power.validate.called)

    def _power_failure_recovery(self, node_dict, get_nodeinfo_mock,
                                mapped_mock, acquire_mock):
        get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
        mapped_mock.return_value = True

        task = self._create_task(node_attrs=node_dict)
        acquire_mock.side_effect = self._get_acquire_side_effect(task)

        self.service._power_failure_recovery(self.context)

        get_nodeinfo_mock.assert_called_once_with(
            columns=self.columns, filters=self.filters)
        mapped_mock.assert_called_once_with(self.node.uuid,
                                            self.node.driver)
        acquire_mock.assert_called_once_with(self.context, self.node.uuid,
                                             purpose=mock.ANY,
                                             shared=True)
        self.assertFalse(self.power.validate.called)

    def test_node_locked_on_acquire(self, get_nodeinfo_mock, mapped_mock,
                                    acquire_mock):
        node_dict = dict(reservation='host1', uuid=self.node.uuid)
        self._power_failure_recovery(node_dict, get_nodeinfo_mock,
                                     mapped_mock, acquire_mock)

    def test_node_in_enroll_on_acquire(self, get_nodeinfo_mock, mapped_mock,
                                       acquire_mock):
        node_dict = dict(provision_state=states.ENROLL,
                         target_provision_state=states.NOSTATE,
                         maintenance=True, uuid=self.node.uuid)
        self._power_failure_recovery(node_dict, get_nodeinfo_mock,
                                     mapped_mock, acquire_mock)

    def test_node_in_power_transition_on_acquire(self, get_nodeinfo_mock,
                                                 mapped_mock, acquire_mock):
        node_dict = dict(target_power_state=states.POWER_ON,
                         maintenance=True, uuid=self.node.uuid)
        self._power_failure_recovery(node_dict, get_nodeinfo_mock,
                                     mapped_mock, acquire_mock)

    def test_node_not_in_maintenance_on_acquire(self, get_nodeinfo_mock,
                                                mapped_mock, acquire_mock):
        node_dict = dict(maintenance=False, uuid=self.node.uuid)
        self._power_failure_recovery(node_dict, get_nodeinfo_mock,
                                     mapped_mock, acquire_mock)

    def test_node_disappears_on_acquire(self, get_nodeinfo_mock,
                                        mapped_mock, acquire_mock):
        get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
        mapped_mock.return_value = True
        acquire_mock.side_effect = exception.NodeNotFound(node=self.node.uuid,
                                                          host='fake')

        self.service._power_failure_recovery(self.context)

        get_nodeinfo_mock.assert_called_once_with(
            columns=self.columns, filters=self.filters)
        mapped_mock.assert_called_once_with(self.node.uuid,
                                            self.node.driver)
        acquire_mock.assert_called_once_with(self.context, self.node.uuid,
                                             purpose=mock.ANY,
                                             shared=True)
        self.assertFalse(self.power.validate.called)

    @mock.patch.object(notification_utils,
                       'emit_power_state_corrected_notification')
    def test_node_recovery_success(self, notify_mock, get_nodeinfo_mock,
                                   mapped_mock, acquire_mock):
        self.node.power_state = states.POWER_ON
        get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
        mapped_mock.return_value = True
        acquire_mock.side_effect = self._get_acquire_side_effect(self.task)
        self.power.get_power_state.return_value = states.POWER_OFF

        self.service._power_failure_recovery(self.context)

        get_nodeinfo_mock.assert_called_once_with(
            columns=self.columns, filters=self.filters)
        mapped_mock.assert_called_once_with(self.node.uuid,
                                            self.node.driver)
        acquire_mock.assert_called_once_with(self.context, self.node.uuid,
                                             purpose=mock.ANY,
                                             shared=True)
        self.power.validate.assert_called_once_with(self.task)
        self.power.get_power_state.assert_called_once_with(self.task)
        self.task.upgrade_lock.assert_called_once_with()
        self.assertFalse(self.node.maintenance)
        self.assertIsNone(self.node.fault)
        self.assertIsNone(self.node.maintenance_reason)
        self.assertEqual(states.POWER_OFF, self.node.power_state)
        notify_mock.assert_called_once_with(self.task, states.POWER_ON)

    def test_node_recovery_failed(self, get_nodeinfo_mock,
                                  mapped_mock, acquire_mock):
        get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response()
        mapped_mock.return_value = True
        acquire_mock.side_effect = self._get_acquire_side_effect(self.task)
        self.power.get_power_state.return_value = states.ERROR

        self.service._power_failure_recovery(self.context)

        get_nodeinfo_mock.assert_called_once_with(
            columns=self.columns, filters=self.filters)
        mapped_mock.assert_called_once_with(self.node.uuid,
                                            self.node.driver)
        acquire_mock.assert_called_once_with(self.context, self.node.uuid,
                                             purpose=mock.ANY,
                                             shared=True)
        self.power.validate.assert_called_once_with(self.task)
        self.power.get_power_state.assert_called_once_with(self.task)
        self.assertFalse(self.task.upgrade_lock.called)
        self.assertTrue(self.node.maintenance)
        self.assertEqual('power failure', self.node.fault)
        self.assertEqual('Unreachable BMC', self.node.maintenance_reason)


@mock.patch.object(task_manager, 'acquire')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
@@ -1182,6 +1182,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
                                                        target_state=None)
        self.assertTrue(self.node.maintenance)
        self.assertEqual(clean_error, self.node.maintenance_reason)
        self.assertEqual('clean failure', self.node.fault)

    def test_cleaning_error_handler(self):
        self.node.provision_state = states.CLEANING
@@ -1196,6 +1197,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
        self.assertEqual(msg, self.node.last_error)
        self.assertTrue(self.node.maintenance)
        self.assertEqual(msg, self.node.maintenance_reason)
        self.assertEqual('clean failure', self.node.fault)
        driver = self.task.driver.deploy
        driver.tear_down_cleaning.assert_called_once_with(self.task)
        self.task.process_event.assert_called_once_with('fail',
@@ -1238,6 +1240,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
        self.assertTrue(log_mock.exception.called)
        self.assertIn(msg, self.node.last_error)
        self.assertIn(msg, self.node.maintenance_reason)
        self.assertEqual('clean failure', self.node.fault)

    def test_abort_on_conductor_take_over_cleaning(self):
        self.node.maintenance = False
@@ -1246,6 +1249,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
        self.assertTrue(self.node.maintenance)
        self.assertIn('take over', self.node.maintenance_reason)
        self.assertIn('take over', self.node.last_error)
        self.assertEqual('clean failure', self.node.fault)
        self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
            self.task)
        self.node.save.assert_called_once_with()

@@ -0,0 +1,18 @@
---
features:
  - |
    Adds power failure recovery to ironic. For nodes that ironic has put into
    maintenance mode due to power failure, ironic periodically checks their
    power state and moves them out of maintenance mode once the power state
    can be retrieved again. The interval of this check is configured via the
    ``[conductor]power_failure_recovery_interval`` configuration option; the
    default value is 300 seconds. Set it to 0 to disable this behavior.
upgrade:
  - |
    Power failure recovery introduces a new configuration option
    ``[conductor]power_failure_recovery_interval``, which is enabled and set
    to 300 seconds by default. If the default value does not suit the needs
    or scale of a deployment, adjust it or turn it off during the upgrade.
    Power failure recovery does not apply to nodes that were already in
    maintenance mode due to power failure before the upgrade; those nodes
    have to be moved out of maintenance mode manually.
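As a usage illustration (a sketch, not part of this change): an operator who wants a less aggressive recovery check could set the new option in ironic.conf, here with an arbitrary example value of 600 seconds.

[conductor]
# Check nodes held in maintenance due to power failure every 10 minutes;
# 0 disables the periodic recovery task entirely (default is 300).
power_failure_recovery_interval = 600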