diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index 9d72a46c71..0acef9952e 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -15,7 +15,7 @@ #rpc_conn_pool_size=30 # Qpid broker hostname. (string value) -#qpid_hostname=localhost +#qpid_hostname=ironic # Qpid broker port. (integer value) #qpid_port=5672 @@ -76,7 +76,7 @@ # The RabbitMQ broker address where a single node is used. # (string value) -#rabbit_host=localhost +#rabbit_host=ironic # The RabbitMQ broker port where a single node is used. # (integer value) @@ -604,6 +604,10 @@ # when configdrive_use_swift is True. (string value) #configdrive_swift_container=ironic_configdrive_container +# Timeout (seconds) for waiting for node inspection. 0 - +# unlimited. (integer value) (integer value) +#inspect_timeout=1800 + [console] @@ -957,7 +961,7 @@ # Timeout (in seconds) for iRMC operations (integer value) #client_timeout=60 -# Sensor Data retrieval method either "ipmitool" or "scci" +# Sensor data retrieval method, either "ipmitool" or "scci" # (string value) #sensor_method=ipmitool diff --git a/ironic/api/controllers/v1/__init__.py b/ironic/api/controllers/v1/__init__.py index 76713fab2d..a60435782d 100644 --- a/ironic/api/controllers/v1/__init__.py +++ b/ironic/api/controllers/v1/__init__.py @@ -60,7 +60,8 @@ MIN_VER_STR = '1.1' # v1.3: Add node.driver_internal_info # v1.4: Add MANAGEABLE state # v1.5: Add logical node names -MAX_VER_STR = '1.5' +# v1.6: Add INSPECT* states +MAX_VER_STR = '1.6' MIN_VER = base.Version({base.Version.string: MIN_VER_STR}, diff --git a/ironic/api/controllers/v1/node.py b/ironic/api/controllers/v1/node.py index f83a4bf357..0eee6cc67d 100644 --- a/ironic/api/controllers/v1/node.py +++ b/ironic/api/controllers/v1/node.py @@ -72,9 +72,11 @@ def hide_driver_internal_info(obj): def check_allow_management_verbs(verb): # v1.4 added the MANAGEABLE state and two verbs to move nodes into # and out of that state. Reject requests to do this in older versions - if (pecan.request.version.minor < 4 and - verb in [ir_states.VERBS['manage'], ir_states.VERBS['provide']]): - raise exception.NotAcceptable() + if ((pecan.request.version.minor < 4 and + verb in [ir_states.VERBS['manage'], ir_states.VERBS['provide']]) + or (pecan.request.version.minor < 6 and + verb == ir_states.VERBS['inspect'])): + raise exception.NotAcceptable() def allow_logical_names(): @@ -127,7 +129,8 @@ class NodePatchType(types.JsonPatchType): '/power_state', '/provision_state', '/reservation', '/target_power_state', '/target_provision_state', '/provision_updated_at', '/maintenance_reason', - '/driver_internal_info'] + '/driver_internal_info', '/inspection_finished_at', + '/inspection_started_at', ] @staticmethod def mandatory_attrs(): @@ -439,6 +442,9 @@ class NodeStatesController(rest.RestController): elif target == ir_states.DELETED: pecan.request.rpcapi.do_node_tear_down( pecan.request.context, rpc_node.uuid, topic) + elif target == ir_states.VERBS['inspect']: + pecan.request.rpcapi.inspect_hardware( + pecan.request.context, rpc_node.uuid, topic=topic) elif target in ( ir_states.VERBS['manage'], ir_states.VERBS['provide']): pecan.request.rpcapi.do_provisioning_action( @@ -511,6 +517,12 @@ class Node(base.APIBase): provision_updated_at = datetime.datetime """The UTC date and time of the last provision state change""" + inspection_finished_at = datetime.datetime + """The UTC date and time when the last inspection finished successfully.""" + + inspection_started_at = datetime.datetime + """The UTC date and time of the hardware when inspection was started""" + maintenance = types.boolean """Indicates whether the node is in maintenance mode.""" @@ -636,6 +648,7 @@ class Node(base.APIBase): 'cpus': '1'}, updated_at=time, created_at=time, provision_updated_at=time, instance_info={}, maintenance=False, maintenance_reason=None, + inspection_finished_at=None, inspection_started_at=time, console_enabled=False) # NOTE(matty_dubs): The chassis_uuid getter() is based on the # _chassis_uuid variable: diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 994ffe903e..cb3a90fe5f 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -163,8 +163,12 @@ conductor_opts = [ default='ironic_configdrive_container', help='Name of the Swift container to store config drive ' 'data. Used when configdrive_use_swift is True.'), -] + cfg.IntOpt('inspect_timeout', + default=1800, + help='Timeout (seconds) for waiting for node inspection. ' + '0 - unlimited. (integer value)'), +] CONF = cfg.CONF CONF.register_opts(conductor_opts, 'conductor') @@ -173,7 +177,7 @@ class ConductorManager(periodic_task.PeriodicTasks): """Ironic Conductor manager main class.""" # NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's. - RPC_API_VERSION = '1.23' + RPC_API_VERSION = '1.24' target = messaging.Target(version=RPC_API_VERSION) @@ -921,40 +925,11 @@ class ConductorManager(periodic_task.PeriodicTasks): 'provision_state': states.DEPLOYWAIT, 'maintenance': False, 'provisioned_before': callback_timeout} - columns = ['uuid', 'driver'] - node_list = self.dbapi.get_nodeinfo_list( - columns=columns, - filters=filters, - sort_key='provision_updated_at', - sort_dir='asc') - - workers_count = 0 - for node_uuid, driver in node_list: - if not self._mapped_to_this_conductor(node_uuid, driver): - continue - try: - with task_manager.acquire(context, node_uuid) as task: - # NOTE(comstud): Recheck maintenance and provision_state - # now that we have the lock. We don't need to re-check - # updated_at unless we expect the state to have flipped - # to something else and then back to DEPLOYWAIT between - # the call to get_nodeinfo_list and now. - if (task.node.maintenance or - task.node.provision_state != states.DEPLOYWAIT): - continue - # timeout has been reached - fail the deploy - task.process_event('fail', - callback=self._spawn_worker, - call_args=(utils.cleanup_after_timeout, - task), - err_handler=provisioning_error_handler) - except exception.NoFreeConductorWorker: - break - except (exception.NodeLocked, exception.NodeNotFound): - continue - workers_count += 1 - if workers_count == CONF.conductor.periodic_max_workers: - break + sort_key = 'provision_updated_at' + callback_method = utils.cleanup_after_timeout + err_handler = provisioning_error_handler + self._fail_if_timeout_reached(context, filters, states.DEPLOYWAIT, + sort_key, callback_method, err_handler) def _do_takeover(self, task): LOG.debug(('Conductor %(cdr)s taking over node %(node)s'), @@ -1425,6 +1400,135 @@ class ConductorManager(periodic_task.PeriodicTasks): driver=task.node.driver, extension='management') return task.driver.management.get_supported_boot_devices() + @messaging.expected_exceptions(exception.NoFreeConductorWorker, + exception.NodeLocked, + exception.HardwareInspectionFailure, + exception.UnsupportedDriverExtension) + def inspect_hardware(self, context, node_id): + """Inspect hardware to obtain hardware properties. + + Initiate the inspection of a node. Validations are done + synchronously and the actual inspection work is performed in + background (asynchronously). + + :param context: request context. + :param node_id: node id or uuid. + :raises: NodeLocked if node is locked by another conductor. + :raises: UnsupportedDriverExtension if the node's driver doesn't + support inspect. + :raises: NoFreeConductorWorker when there is no free worker to start + async task + :raises: HardwareInspectionFailure when unable to get + essential scheduling properties from hardware. + + """ + LOG.debug('RPC inspect_hardware called for node %s', node_id) + with task_manager.acquire(context, node_id, shared=False) as task: + node = task.node + if not getattr(task.driver, 'inspect', None): + raise exception.UnsupportedDriverExtension( + driver=task.node.driver, extension='inspect') + + try: + task.driver.power.validate(task) + task.driver.inspect.validate(task) + except (exception.InvalidParameterValue, + exception.MissingParameterValue) as e: + error = (_("RPC inspect_hardware failed to validate " + "inspection or power info. Error: %(msg)s") + % {'msg': e}) + raise exception.HardwareInspectionFailure(error=error) + + try: + task.process_event('inspect', + callback=self._spawn_worker, + call_args=(_do_inspect_hardware, task, + self.conductor.id), + err_handler=provisioning_error_handler) + + except exception.InvalidState: + error = (_("Inspection is not possible while node " + "%(node)s is in state %(state)s") + % {'node': node.uuid, + 'state': node.provision_state}) + raise exception.HardwareInspectionFailure(error=error) + + @periodic_task.periodic_task( + spacing=CONF.conductor.check_provision_state_interval) + def _check_inspect_timeouts(self, context): + """Periodically checks inspect_timeout and fails upon reaching it. + + :param: context: request context + + """ + callback_timeout = CONF.conductor.inspect_timeout + if not callback_timeout: + return + + filters = {'reserved': False, + 'provision_state': states.INSPECTING, + 'inspection_started_before': callback_timeout} + sort_key = 'inspection_started_at' + last_error = _("timeout reached while waiting for callback") + self._fail_if_timeout_reached(context, filters, states.INSPECTING, + sort_key, last_error=last_error) + + def _fail_if_timeout_reached(self, context, filters, provision_state, + sort_key, callback_method=None, + err_handler=None, last_error=None): + """Checks if the async(background) process has reached timeout. + + :param: context: request context + :param: filters: criteria (as a dictionary) to get the desired + list of nodes. + :param: provision_state: provision_state that the node is in, + for the provisioning activity to have failed. + :param: sort_key: the nodes are sorted based on this key. + :param: callback_method: the callback method to be invoked in a + spawned thread, for a failed node. This + method must take a :class:`TaskManager` as + the first (and only required) parameter. + :param: err_handler: the error handler to be invoked if an error. + occurs trying to spawn a thread for a failed node. + :param: last_error: the error message to be updated in node.last_error + + """ + columns = ['uuid', 'driver'] + node_list = self.dbapi.get_nodeinfo_list(columns=columns, + filters=filters, + sort_key=sort_key, + sort_dir='asc') + + workers_count = 0 + for node_uuid, driver in node_list: + if not self._mapped_to_this_conductor(node_uuid, driver): + continue + try: + with task_manager.acquire(context, node_uuid) as task: + if (task.node.maintenance or + task.node.provision_state != provision_state): + continue + + # timeout has been reached - process the event 'fail' + if callback_method: + task.process_event('fail', + callback=self._spawn_worker, + call_args=(callback_method, task), + err_handler=err_handler) + else: + if not last_error: + last_error = _("timeout reached while waiting " + "for callback") + task.node.last_error = last_error + task.process_event('fail') + except exception.NoFreeConductorWorker: + break + except (exception.NodeLocked, exception.NodeNotFound): + continue + workers_count += 1 + if workers_count >= CONF.conductor.periodic_max_workers: + break + def get_vendor_passthru_metadata(route_dict): d = {} @@ -1696,3 +1800,37 @@ def do_sync_power_state(task, count): node.save() return count + + +def _do_inspect_hardware(task, conductor_id): + """Prepare the environment and inspect a node.""" + node = task.node + + def handle_failure(e): + # NOTE(deva): there is no need to clear conductor_affinity + node.last_error = e + task.process_event('fail') + LOG.error(_LE("Failed to inspect node %(node)s: %(err)s"), + {'node': node.uuid, 'err': e}) + + try: + new_state = task.driver.inspect.inspect_hardware(task) + + except Exception as e: + with excutils.save_and_reraise_exception(): + error = str(e) + handle_failure(error) + + # Update conductor_affinity to reference this conductor's ID + # since there may be local persistent state + node.conductor_affinity = conductor_id + + if new_state == states.MANAGEABLE: + task.process_event('done') + LOG.info(_LI('Successfully inspected node %(node)s') + % {'node': node.uuid}) + elif new_state != states.INSPECTING: + error = (_("Driver returned unexpected state in inspection" + "%(state)s") % {'state': new_state}) + handle_failure(error) + raise exception.HardwareInspectionFailure(error=error) diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py index 9524123d46..173ea68929 100644 --- a/ironic/conductor/rpcapi.py +++ b/ironic/conductor/rpcapi.py @@ -66,11 +66,12 @@ class ConductorAPI(object): | get_driver_vendor_passthru_methods | 1.22 - Added configdrive parameter to do_node_deploy. | 1.23 - Added do_provisioning_action + | 1.24 - Added inspect_hardware method """ # NOTE(rloo): This must be in sync with manager.ConductorManager's. - RPC_API_VERSION = '1.23' + RPC_API_VERSION = '1.24' def __init__(self, topic=None): super(ConductorAPI, self).__init__() @@ -483,3 +484,20 @@ class ConductorAPI(object): cctxt = self.client.prepare(topic=topic or self.topic, version='1.17') return cctxt.call(context, 'get_supported_boot_devices', node_id=node_id) + + def inspect_hardware(self, context, node_id, topic=None): + """Signals the conductor service to perform hardware introspection. + + :param context: request context. + :param node_id: node id or uuid. + :param topic: RPC topic. Defaults to self.topic. + :raises: NodeLocked if node is locked by another conductor. + :raises: HardwareInspectionFailure + :raises: NoFreeConductorWorker when there is no free worker to start + async task. + :raises: UnsupportedDriverExtension if the node's driver doesn't + support inspection. + + """ + cctxt = self.client.prepare(topic=topic or self.topic, version='1.24') + return cctxt.call(context, 'inspect_hardware', node_id=node_id) diff --git a/ironic/db/sqlalchemy/alembic/versions/1e1d5ace7dc6_add_inspection_started_at_and_.py b/ironic/db/sqlalchemy/alembic/versions/1e1d5ace7dc6_add_inspection_started_at_and_.py new file mode 100644 index 0000000000..58d8954d33 --- /dev/null +++ b/ironic/db/sqlalchemy/alembic/versions/1e1d5ace7dc6_add_inspection_started_at_and_.py @@ -0,0 +1,40 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""add inspection_started_at and inspection_finished_at + +Revision ID: 1e1d5ace7dc6 +Revises: 3ae36a5f5131 +Create Date: 2015-02-26 10:46:46.861927 + +""" + +# revision identifiers, used by Alembic. +revision = '1e1d5ace7dc6' +down_revision = '3ae36a5f5131' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + op.add_column('nodes', sa.Column('inspection_started_at', + sa.DateTime(), + nullable=True)) + op.add_column('nodes', sa.Column('inspection_finished_at', + sa.DateTime(), + nullable=True)) + + +def downgrade(): + op.drop_column('nodes', 'inspection_started_at') + op.drop_column('nodes', 'inspection_finished_at') diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 9b1fbf81f0..5a529d518e 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -191,6 +191,11 @@ class Connection(api.Connection): limit = timeutils.utcnow() - datetime.timedelta( seconds=filters['provisioned_before']) query = query.filter(models.Node.provision_updated_at < limit) + if 'inspection_started_before' in filters: + limit = ((timeutils.utcnow()) - + (datetime.timedelta( + seconds=filters['inspection_started_before']))) + query = query.filter(models.Node.inspection_started_at < limit) return query @@ -371,6 +376,17 @@ class Connection(api.Connection): if 'provision_state' in values: values['provision_updated_at'] = timeutils.utcnow() + if values['provision_state'] == states.INSPECTING: + values['inspection_started_at'] = timeutils.utcnow() + values['inspection_finished_at'] = None + + if (ref.provision_state == states.INSPECTING and + values.get('provision_state') == states.MANAGEABLE): + values['inspection_finished_at'] = timeutils.utcnow() + values['inspection_started_at'] = None + elif (ref.provision_state == states.INSPECTING and + values.get('provision_state') == states.INSPECTFAIL): + values['inspection_started_at'] = None ref.update(values) return ref diff --git a/ironic/db/sqlalchemy/models.py b/ironic/db/sqlalchemy/models.py index 1f25dc30bd..6cbe5394cd 100644 --- a/ironic/db/sqlalchemy/models.py +++ b/ironic/db/sqlalchemy/models.py @@ -185,6 +185,8 @@ class Node(Base): maintenance = Column(Boolean, default=False) maintenance_reason = Column(Text, nullable=True) console_enabled = Column(Boolean, default=False) + inspection_finished_at = Column(DateTime, nullable=True) + inspection_started_at = Column(DateTime, nullable=True) extra = Column(JSONEncodedDict) diff --git a/ironic/drivers/modules/fake.py b/ironic/drivers/modules/fake.py index f256fc3174..ecd05418fa 100644 --- a/ironic/drivers/modules/fake.py +++ b/ironic/drivers/modules/fake.py @@ -181,4 +181,4 @@ class FakeInspect(base.InspectInterface): pass def inspect_hardware(self, task): - pass + return states.MANAGEABLE diff --git a/ironic/objects/node.py b/ironic/objects/node.py index 75ead2d3d0..f6b9e000ea 100644 --- a/ironic/objects/node.py +++ b/ironic/objects/node.py @@ -79,6 +79,9 @@ class Node(base.IronicObject): # that started but failed to finish. 'last_error': obj_utils.str_or_none, + 'inspection_finished_at': obj_utils.datetime_or_str_or_none, + 'inspection_started_at': obj_utils.datetime_or_str_or_none, + 'extra': obj_utils.dict_or_none, } diff --git a/ironic/tests/api/v1/test_nodes.py b/ironic/tests/api/v1/test_nodes.py index 790284e84b..9222c1ee1e 100644 --- a/ironic/tests/api/v1/test_nodes.py +++ b/ironic/tests/api/v1/test_nodes.py @@ -199,6 +199,8 @@ class TestListNodes(test_api_base.FunctionalTest): self.assertIn('reservation', data) self.assertIn('maintenance_reason', data) self.assertIn('name', data) + self.assertIn('inspection_finished_at', data) + self.assertIn('inspection_started_at', data) # never expose the chassis_id self.assertNotIn('chassis_id', data) @@ -219,6 +221,8 @@ class TestListNodes(test_api_base.FunctionalTest): self.assertIn('target_power_state', data['nodes'][0]) self.assertIn('target_provision_state', data['nodes'][0]) self.assertIn('provision_updated_at', data['nodes'][0]) + self.assertIn('inspection_finished_at', data['nodes'][0]) + self.assertIn('inspection_started_at', data['nodes'][0]) # never expose the chassis_id self.assertNotIn('chassis_id', data['nodes'][0]) @@ -1504,6 +1508,9 @@ class TestPut(test_api_base.FunctionalTest): p = mock.patch.object(rpcapi.ConductorAPI, 'do_node_tear_down') self.mock_dntd = p.start() self.addCleanup(p.stop) + p = mock.patch.object(rpcapi.ConductorAPI, 'inspect_hardware') + self.mock_dnih = p.start() + self.addCleanup(p.stop) def test_power_state(self): response = self.put_json('/nodes/%s/states/power' % self.node.uuid, @@ -1695,6 +1702,17 @@ class TestPut(test_api_base.FunctionalTest): states.VERBS['provide'], 'test-topic') + def test_inspect_already_in_progress(self): + node = self.node + node.provision_state = states.INSPECTING + node.target_provision_state = states.MANAGEABLE + node.reservation = 'fake-host' + node.save() + ret = self.put_json('/nodes/%s/states/provision' % node.uuid, + {'target': states.MANAGEABLE}, + expect_errors=True) + self.assertEqual(409, ret.status_code) # Conflict + @mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action') def test_manage_from_available(self, mock_dpa): self.node.provision_state = states.AVAILABLE diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py index 17c901d64b..0640f34805 100644 --- a/ironic/tests/conductor/test_manager.py +++ b/ironic/tests/conductor/test_manager.py @@ -3070,3 +3070,356 @@ class StoreConfigDriveTestCase(tests_base.TestCase): mock_swift.return_value.get_temp_url.assert_called_once_with( container_name, expected_obj_name, timeout) self.assertEqual(expected_instance_info, self.node.instance_info) + + +@_mock_record_keepalive +class NodeInspectHardware(_ServiceSetUpMixin, + tests_db_base.DbTestCase): + + @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') + def test_inspect_hardware_ok(self, mock_inspect): + self._start_service() + node = obj_utils.create_test_node(self.context, driver='fake', + provision_state=states.INSPECTING) + task = task_manager.TaskManager(self.context, node.uuid) + mock_inspect.return_value = states.MANAGEABLE + manager._do_inspect_hardware(task, self.service.conductor.id) + node.refresh() + self.assertEqual(states.MANAGEABLE, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertIsNone(node.last_error) + mock_inspect.assert_called_once_with(mock.ANY) + + @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') + def test_inspect_hardware_return_inspecting(self, mock_inspect): + self._start_service() + node = obj_utils.create_test_node(self.context, driver='fake', + provision_state=states.INSPECTING) + task = task_manager.TaskManager(self.context, node.uuid) + mock_inspect.return_value = states.INSPECTING + manager._do_inspect_hardware(task, self.service.conductor.id) + node.refresh() + self.assertEqual(states.INSPECTING, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertIsNone(node.last_error) + mock_inspect.assert_called_once_with(mock.ANY) + + @mock.patch.object(manager, 'LOG') + @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') + def test_inspect_hardware_return_other_state(self, mock_inspect, log_mock): + self._start_service() + node = obj_utils.create_test_node(self.context, driver='fake', + provision_state=states.INSPECTING) + task = task_manager.TaskManager(self.context, node.uuid) + mock_inspect.return_value = None + self.assertRaises(exception.HardwareInspectionFailure, + manager._do_inspect_hardware, task, + self.service.conductor.id) + node.refresh() + self.assertEqual(states.INSPECTFAIL, node.provision_state) + self.assertEqual(states.MANAGEABLE, node.target_provision_state) + self.assertIsNotNone(node.last_error) + mock_inspect.assert_called_once_with(mock.ANY) + self.assertTrue(log_mock.error.called) + + def test__check_inspect_timeouts(self): + self._start_service() + CONF.set_override('inspect_timeout', 1, group='conductor') + node = obj_utils.create_test_node(self.context, driver='fake', + provision_state=states.INSPECTING, + target_provision_state=states.MANAGEABLE, + provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0), + inspection_started_at=datetime.datetime(2000, 1, 1, 0, 0)) + + self.service._check_inspect_timeouts(self.context) + self.service._worker_pool.waitall() + node.refresh() + self.assertEqual(states.INSPECTFAIL, node.provision_state) + self.assertEqual(states.MANAGEABLE, node.target_provision_state) + self.assertIsNotNone(node.last_error) + + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker') + def test_inspect_hardware_worker_pool_full(self, mock_spawn): + prv_state = states.MANAGEABLE + tgt_prv_state = states.NOSTATE + node = obj_utils.create_test_node(self.context, + provision_state=prv_state, + target_provision_state=tgt_prv_state, + last_error=None, driver='fake') + self._start_service() + + mock_spawn.side_effect = exception.NoFreeConductorWorker() + + exc = self.assertRaises(messaging.rpc.ExpectedException, + self.service.inspect_hardware, + self.context, node.uuid) + # Compare true exception hidden by @messaging.expected_exceptions + self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0]) + self.service._worker_pool.waitall() + node.refresh() + # Make sure things were rolled back + self.assertEqual(prv_state, node.provision_state) + self.assertEqual(tgt_prv_state, node.target_provision_state) + self.assertIsNotNone(node.last_error) + # Verify reservation has been cleared. + self.assertIsNone(node.reservation) + + def _test_inspect_hardware_validate_fail(self, mock_validate): + mock_validate.side_effect = exception.InvalidParameterValue('error') + node = obj_utils.create_test_node(self.context, driver='fake') + exc = self.assertRaises(messaging.rpc.ExpectedException, + self.service.inspect_hardware, + self.context, node.uuid) + # Compare true exception hidden by @messaging.expected_exceptions + self.assertEqual(exception.HardwareInspectionFailure, exc.exc_info[0]) + # This is a sync operation last_error should be None. + self.assertIsNone(node.last_error) + # Verify reservation has been cleared. + self.assertIsNone(node.reservation) + + @mock.patch('ironic.drivers.modules.fake.FakeInspect.validate') + def test_inspect_hardware_validate_fail(self, mock_validate): + self._test_inspect_hardware_validate_fail(mock_validate) + + @mock.patch('ironic.drivers.modules.fake.FakePower.validate') + def test_inspect_hardware_power_validate_fail(self, mock_validate): + self._test_inspect_hardware_validate_fail(mock_validate) + + @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') + def test_inspect_hardware_raises_error(self, mock_inspect): + self._start_service() + mock_inspect.side_effect = exception.HardwareInspectionFailure('test') + state = states.MANAGEABLE + node = obj_utils.create_test_node(self.context, driver='fake', + provision_state=states.INSPECTING, + target_provision_state=state) + task = task_manager.TaskManager(self.context, node.uuid) + + self.assertRaises(exception.HardwareInspectionFailure, + manager._do_inspect_hardware, task, + self.service.conductor.id) + node.refresh() + self.assertEqual(states.INSPECTFAIL, node.provision_state) + self.assertEqual(states.MANAGEABLE, node.target_provision_state) + self.assertIsNotNone(node.last_error) + self.assertTrue(mock_inspect.called) + + +@mock.patch.object(task_manager, 'acquire') +@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor') +@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list') +class ManagerCheckInspectTimeoutsTestCase(_CommonMixIn, + tests_db_base.DbTestCase): + def setUp(self): + super(ManagerCheckInspectTimeoutsTestCase, self).setUp() + self.config(inspect_timeout=300, group='conductor') + self.service = manager.ConductorManager('hostname', 'test-topic') + self.service.dbapi = self.dbapi + + self.node = self._create_node(provision_state=states.INSPECTING, + target_provision_state=states.MANAGEABLE) + self.task = self._create_task(node=self.node) + + self.node2 = self._create_node(provision_state=states.INSPECTING, + target_provision_state=states.MANAGEABLE) + self.task2 = self._create_task(node=self.node2) + + self.filters = {'reserved': False, + 'inspection_started_before': 300, + 'provision_state': states.INSPECTING} + self.columns = ['uuid', 'driver'] + + def _assert_get_nodeinfo_args(self, get_nodeinfo_mock): + get_nodeinfo_mock.assert_called_once_with(sort_dir='asc', + columns=self.columns, filters=self.filters, + sort_key='inspection_started_at') + + def test__check_inspect_timeouts_disabled(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + self.config(inspect_timeout=0, group='conductor') + + self.service._check_inspect_timeouts(self.context) + + self.assertFalse(get_nodeinfo_mock.called) + self.assertFalse(mapped_mock.called) + self.assertFalse(acquire_mock.called) + + def test__check_inspect_timeouts_not_mapped(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = False + + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + mapped_mock.assert_called_once_with(self.node.uuid, self.node.driver) + self.assertFalse(acquire_mock.called) + + def test__check_inspect_timeout(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect(self.task) + + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + mapped_mock.assert_called_once_with(self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, self.node.uuid) + self.task.process_event.assert_called_with('fail') + + def test__check_inspect_timeouts_acquire_node_disappears(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = exception.NodeNotFound(node='fake') + + # Exception eaten + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + mapped_mock.assert_called_once_with( + self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, + self.node.uuid) + self.assertFalse(self.task.process_event.called) + + def test__check_inspect_timeouts_acquire_node_locked(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = exception.NodeLocked(node='fake', + host='fake') + + # Exception eaten + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + mapped_mock.assert_called_once_with( + self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, + self.node.uuid) + self.assertFalse(self.task.process_event.called) + + def test__check_inspect_timeouts_no_acquire_after_lock(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + task = self._create_task( + node_attrs=dict(provision_state=states.AVAILABLE, + uuid=self.node.uuid)) + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response() + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect(task) + + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + mapped_mock.assert_called_once_with( + self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, + self.node.uuid) + self.assertFalse(task.process_event.called) + + def test__check_inspect_timeouts_to_maintenance_after_lock(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + task = self._create_task( + node_attrs=dict(provision_state=states.INSPECTING, + target_provision_state=states.MANAGEABLE, + maintenance=True, + uuid=self.node.uuid)) + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response( + [task.node, self.node2]) + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect( + [task, self.task2]) + + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + self.assertEqual([mock.call(self.node.uuid, task.node.driver), + mock.call(self.node2.uuid, self.node2.driver)], + mapped_mock.call_args_list) + self.assertEqual([mock.call(self.context, self.node.uuid), + mock.call(self.context, self.node2.uuid)], + acquire_mock.call_args_list) + # First node skipped + self.assertFalse(task.process_event.called) + # Second node spawned + self.task2.process_event.assert_called_with('fail') + + def test__check_inspect_timeouts_exiting_no_worker_avail(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response( + [self.node, self.node2]) + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect( + [(self.task, exception.NoFreeConductorWorker()), self.task2]) + + # Exception should be nuked + self.service._check_inspect_timeouts(self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + # mapped should be only called for the first node as we should + # have exited the loop early due to NoFreeConductorWorker + mapped_mock.assert_called_once_with( + self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, + self.node.uuid) + self.task.process_event.assert_called_with('fail') + + def test__check_inspect_timeouts_exit_with_other_exception(self, + get_nodeinfo_mock, + mapped_mock, + acquire_mock): + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response( + [self.node, self.node2]) + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect( + [(self.task, exception.IronicException('foo')), self.task2]) + + # Should re-raise + self.assertRaises(exception.IronicException, + self.service._check_inspect_timeouts, + self.context) + + self._assert_get_nodeinfo_args(get_nodeinfo_mock) + # mapped should be only called for the first node as we should + # have exited the loop early due to unknown exception + mapped_mock.assert_called_once_with( + self.node.uuid, self.node.driver) + acquire_mock.assert_called_once_with(self.context, + self.node.uuid) + self.task.process_event.assert_called_with('fail') + + def test__check_inspect_timeouts_worker_limit(self, get_nodeinfo_mock, + mapped_mock, acquire_mock): + self.config(periodic_max_workers=2, group='conductor') + + # Use the same nodes/tasks to make life easier in the tests + # here + + get_nodeinfo_mock.return_value = self._get_nodeinfo_list_response( + [self.node] * 3) + mapped_mock.return_value = True + acquire_mock.side_effect = self._get_acquire_side_effect( + [self.task] * 3) + + self.service._check_inspect_timeouts(self.context) + + # Should only have ran 2. + self.assertEqual([mock.call(self.node.uuid, self.node.driver)] * 2, + mapped_mock.call_args_list) + self.assertEqual([mock.call(self.context, self.node.uuid)] * 2, + acquire_mock.call_args_list) + process_event_call = mock.call('fail') + self.assertEqual([process_event_call] * 2, + self.task.process_event.call_args_list) diff --git a/ironic/tests/conductor/test_rpcapi.py b/ironic/tests/conductor/test_rpcapi.py index 8d688f152a..c2ea6409cc 100644 --- a/ironic/tests/conductor/test_rpcapi.py +++ b/ironic/tests/conductor/test_rpcapi.py @@ -282,3 +282,9 @@ class RPCAPITestCase(base.DbTestCase): 'call', version='1.21', driver_name='fake-driver') + + def test_inspect_hardware(self): + self._test_rpcapi('inspect_hardware', + 'call', + version='1.24', + node_id=self.fake_node['uuid']) diff --git a/ironic/tests/db/test_nodes.py b/ironic/tests/db/test_nodes.py index 6fd256d009..35c542e5ed 100644 --- a/ironic/tests/db/test_nodes.py +++ b/ironic/tests/db/test_nodes.py @@ -164,6 +164,32 @@ class DbNodeTestCase(base.DbTestCase): states.DEPLOYWAIT}) self.assertEqual([node2.id], [r[0] for r in res]) + @mock.patch.object(timeutils, 'utcnow') + def test_get_nodeinfo_list_inspection(self, mock_utcnow): + past = datetime.datetime(2000, 1, 1, 0, 0) + next = past + datetime.timedelta(minutes=8) + present = past + datetime.timedelta(minutes=10) + mock_utcnow.return_value = past + + # node with provision_updated timeout + node1 = utils.create_test_node(uuid=uuidutils.generate_uuid(), + inspection_started_at=past) + # node with None in provision_updated_at + node2 = utils.create_test_node(uuid=uuidutils.generate_uuid(), + provision_state=states.INSPECTING) + # node without timeout + utils.create_test_node(uuid=uuidutils.generate_uuid(), + inspection_started_at=next) + + mock_utcnow.return_value = present + res = self.dbapi.get_nodeinfo_list( + filters={'inspection_started_before': 300}) + self.assertEqual([node1.id], [r[0] for r in res]) + + res = self.dbapi.get_nodeinfo_list(filters={'provision_state': + states.INSPECTING}) + self.assertEqual([node2.id], [r[0] for r in res]) + def test_get_node_list(self): uuids = [] for i in range(1, 6): @@ -355,6 +381,31 @@ class DbNodeTestCase(base.DbTestCase): node = utils.create_test_node() res = self.dbapi.update_node(node.id, {'extra': {'foo': 'bar'}}) self.assertIsNone(res['provision_updated_at']) + self.assertIsNone(res['inspection_started_at']) + + @mock.patch.object(timeutils, 'utcnow') + def test_update_node_inspection_started_at(self, mock_utcnow): + mocked_time = datetime.datetime(2000, 1, 1, 0, 0) + mock_utcnow.return_value = mocked_time + node = utils.create_test_node(uuid=uuidutils.generate_uuid(), + inspection_started_at=mocked_time) + res = self.dbapi.update_node(node.id, {'provision_state': 'fake'}) + result = res['inspection_started_at'] + self.assertEqual(mocked_time, + timeutils.normalize_time(result)) + self.assertIsNone(res['inspection_finished_at']) + + @mock.patch.object(timeutils, 'utcnow') + def test_update_node_inspection_finished_at(self, mock_utcnow): + mocked_time = datetime.datetime(2000, 1, 1, 0, 0) + mock_utcnow.return_value = mocked_time + node = utils.create_test_node(uuid=uuidutils.generate_uuid(), + inspection_finished_at=mocked_time) + res = self.dbapi.update_node(node.id, {'provision_state': 'fake'}) + result = res['inspection_finished_at'] + self.assertEqual(mocked_time, + timeutils.normalize_time(result)) + self.assertIsNone(res['inspection_started_at']) def test_reserve_node(self): node = utils.create_test_node() diff --git a/ironic/tests/db/utils.py b/ironic/tests/db/utils.py index f69e69458d..8af8907c37 100644 --- a/ironic/tests/db/utils.py +++ b/ironic/tests/db/utils.py @@ -203,6 +203,8 @@ def get_test_node(**kw): 'extra': kw.get('extra', {}), 'updated_at': kw.get('updated_at'), 'created_at': kw.get('created_at'), + 'inspection_finished_at': kw.get('inspection_finished_at'), + 'inspection_started_at': kw.get('inspection_started_at'), }