DRYD2 - MVP - Phase 2

Add Driver tasks for ConfigureHardware (equivalent to the commission stage in MaaS)

parent 3d2632156d
commit 6e342060df
@@ -22,6 +22,7 @@ import drydock_provisioner.objects.task as task_model
 from drydock_provisioner.drivers.node import NodeDriver
 from .api_client import MaasRequestFactory

 import drydock_provisioner.drivers.node.maasdriver.models.fabric as maas_fabric
 import drydock_provisioner.drivers.node.maasdriver.models.vlan as maas_vlan
 import drydock_provisioner.drivers.node.maasdriver.models.subnet as maas_subnet
+import drydock_provisioner.drivers.node.maasdriver.models.machine as maas_machine
@@ -133,7 +134,7 @@ class MaasNodeDriver(NodeDriver):
                 'retry': False,
                 'detail': 'MaaS Network creation timed-out'
             }
-            self.logger.warn("Thread for task %s timed out after 120s" % (subtask.get_id()))
+            self.logger.warning("Thread for task %s timed out after 120s" % (subtask.get_id()))
             self.orchestrator.task_field_update(task.get_id(),
                                                 status=hd_fields.TaskStatus.Complete,
                                                 result=hd_fields.ActionResult.Failure,
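
A note on the rename above: Logger.warn is a deprecated alias of Logger.warning in Python 3's logging module, so the new spelling is the supported one. A minimal sketch; the logger name is illustrative:

    import logging

    logger = logging.getLogger('drydock_demo')
    # warning() is the supported spelling; warn() is a deprecated alias.
    # Passing the argument lazily instead of %-interpolating is also idiomatic.
    logger.warning("Thread for task %s timed out after 120s", 'task-id')
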
@@ -153,7 +154,9 @@ class MaasNodeDriver(NodeDriver):
             subtasks = []

             result_detail = {
-                'detail': []
+                'detail': [],
+                'failed_nodes': [],
+                'successful_nodes': [],
             }

             for n in task.node_list:
@@ -183,25 +186,95 @@ class MaasNodeDriver(NodeDriver):
                     if subtask.status == hd_fields.TaskStatus.Complete:
                         self.logger.info("Task %s to identify node %s complete - status %s" %
                                          (subtask.get_id(), n, subtask.get_result()))
-                        result_detail['detail'].extend(subtask.result_detail['detail'])
                         running_subtasks = running_subtasks - 1

-                        if subtask.result in [hd_fields.ActionResult.Success,
-                                              hd_fields.ActionResult.PartialSuccess]:
+                        if subtask.result == hd_fields.ActionResult.Success:
+                            result_detail['successful_nodes'].extend(subtask.node_list)
                             worked = True
-                        elif subtask.result in [hd_fields.ActionResult.Failure,
-                                                hd_fields.ActionResult.PartialSuccess]:
+                        elif subtask.result == hd_fields.ActionResult.Failure:
+                            result_detail['failed_nodes'].extend(subtask.node_list)
                             failed = True
+                        elif subtask.result == hd_fields.ActionResult.PartialSuccess:
+                            worked = failed = True

                 time.sleep(1 * 60)
                 attempts = attempts + 1

             if running_subtasks > 0:
-                self.logger.warn("Time out for task %s before all subtask threads complete" % (task.get_id()))
+                self.logger.warning("Time out for task %s before all subtask threads complete" % (task.get_id()))
                 result = hd_fields.ActionResult.DependentFailure
                 result_detail['detail'].append('Some subtasks did not complete before the timeout threshold')
-            if worked and failed:
+            elif worked and failed:
+                result = hd_fields.ActionResult.PartialSuccess
+            elif worked:
+                result = hd_fields.ActionResult.Success
+            else:
+                result = hd_fields.ActionResult.Failure
+
+            self.orchestrator.task_field_update(task.get_id(),
+                                                status=hd_fields.TaskStatus.Complete,
+                                                result=result,
+                                                result_detail=result_detail)
+        elif task.action == hd_fields.OrchestratorAction.ConfigureHardware:
+            self.orchestrator.task_field_update(task.get_id(),
+                                                status=hd_fields.TaskStatus.Running)
+
+            self.logger.debug("Starting subtask to commission %s nodes." % (len(task.node_list)))
+
+            subtasks = []
+
+            result_detail = {
+                'detail': [],
+                'failed_nodes': [],
+                'successful_nodes': [],
+            }
+
+            for n in task.node_list:
+                subtask = self.orchestrator.create_task(task_model.DriverTask,
+                                                        parent_task_id=task.get_id(), design_id=design_id,
+                                                        action=hd_fields.OrchestratorAction.ConfigureHardware,
+                                                        site_name=task.site_name,
+                                                        task_scope={'site': task.site_name, 'node_names': [n]})
+                runner = MaasTaskRunner(state_manager=self.state_manager,
+                                        orchestrator=self.orchestrator,
+                                        task_id=subtask.get_id(), config=self.config)
+
+                self.logger.info("Starting thread for task %s to commission node %s" % (subtask.get_id(), n))
+
+                runner.start()
+                subtasks.append(subtask.get_id())
+
+            running_subtasks = len(subtasks)
+            attempts = 0
+            worked = failed = False
+
+            # TODO Add timeout to config
+            while running_subtasks > 0 and attempts < 20:
+                for t in subtasks:
+                    subtask = self.state_manager.get_task(t)
+
+                    if subtask.status == hd_fields.TaskStatus.Complete:
+                        self.logger.info("Task %s to commission node %s complete - status %s" %
+                                         (subtask.get_id(), n, subtask.get_result()))
+                        running_subtasks = running_subtasks - 1
+
+                        if subtask.result == hd_fields.ActionResult.Success:
+                            result_detail['successful_nodes'].extend(subtask.node_list)
+                            worked = True
+                        elif subtask.result == hd_fields.ActionResult.Failure:
+                            result_detail['failed_nodes'].extend(subtask.node_list)
+                            failed = True
+                        elif subtask.result == hd_fields.ActionResult.PartialSuccess:
+                            worked = failed = True
+
+                time.sleep(1 * 60)
+                attempts = attempts + 1
+
+            if running_subtasks > 0:
+                self.logger.warning("Time out for task %s before all subtask threads complete" % (task.get_id()))
+                result = hd_fields.ActionResult.DependentFailure
+                result_detail['detail'].append('Some subtasks did not complete before the timeout threshold')
+            elif worked and failed:
                 result = hd_fields.ActionResult.PartialSuccess
             elif worked:
                 result = hd_fields.ActionResult.Success
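
Both the IdentifyNode and the new ConfigureHardware branches fold per-node subtask results into one roll-up ActionResult with the same rule: any Success marks the task as having worked, any Failure as having failed, and PartialSuccess marks both. A minimal standalone sketch of that rule; the Enum stand-in and function name are illustrative, not part of the driver API:

    from enum import Enum

    class ActionResult(Enum):  # stand-in for hd_fields.ActionResult
        Success = 1
        PartialSuccess = 2
        Failure = 3
        DependentFailure = 4

    def rollup(subtask_results, timed_out=False):
        # Any Success or PartialSuccess means some node worked;
        # any Failure or PartialSuccess means some node failed.
        worked = any(r in (ActionResult.Success, ActionResult.PartialSuccess)
                     for r in subtask_results)
        failed = any(r in (ActionResult.Failure, ActionResult.PartialSuccess)
                     for r in subtask_results)
        if timed_out:
            return ActionResult.DependentFailure
        if worked and failed:
            return ActionResult.PartialSuccess
        if worked:
            return ActionResult.Success
        return ActionResult.Failure

    assert rollup([ActionResult.Success, ActionResult.Failure]) == ActionResult.PartialSuccess
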
@@ -440,8 +513,80 @@ class MaasTaskRunner(drivers.DriverTaskRunner):
                                                 status=hd_fields.TaskStatus.Complete,
                                                 result=result,
                                                 result_detail=result_detail)
+        elif task_action == hd_fields.OrchestratorAction.ConfigureHardware:
+            try:
+                machine_list = maas_machine.Machines(self.maas_client)
+                machine_list.refresh()
+            except:
+                self.orchestrator.task_field_update(self.task.get_id(),
+                                                    status=hd_fields.TaskStatus.Complete,
+                                                    result=hd_fields.ActionResult.Failure,
+                                                    result_detail={'detail': 'Error accessing MaaS Machines API', 'retry': True})
+                return
+
+            nodes = self.task.node_list
+
+            result_detail = {'detail': []}
+
+            worked = failed = False
+
+            # TODO Better way of representing the node statuses than static strings
+            for n in nodes:
+                try:
+                    self.logger.debug("Locating node %s for commissioning" % (n))
+                    node = site_design.get_baremetal_node(n)
+                    machine = machine_list.identify_baremetal_node(node, update_name=False)
+                    if machine is not None:
+                        if machine.status_name == 'New':
+                            self.logger.debug("Located node %s in MaaS, starting commissioning" % (n))
+                            machine.commission()
+
+                            # Poll machine status
+                            attempts = 0
+
+                            while attempts < 20 and machine.status_name != 'Ready':
+                                attempts = attempts + 1
+                                time.sleep(1 * 60)
+                                try:
+                                    machine.refresh()
+                                    self.logger.debug("Polling node %s status attempt %d: %s" % (n, attempts, machine.status_name))
+                                except:
+                                    self.logger.warning("Error updating node %s status during commissioning, will re-attempt." %
+                                                        (n))
+                            if machine.status_name == 'Ready':
+                                self.logger.info("Node %s commissioned." % (n))
+                                result_detail['detail'].append("Node %s commissioned" % (n))
+                                worked = True
+                        elif machine.status_name == 'Commissioning':
+                            self.logger.info("Located node %s in MaaS, node already being commissioned. Skipping..." % (n))
+                            result_detail['detail'].append("Located node %s in MaaS, node already being commissioned. Skipping..." % (n))
+                            worked = True
+                        elif machine.status_name == 'Ready':
+                            self.logger.info("Located node %s in MaaS, node commissioned. Skipping..." % (n))
+                            result_detail['detail'].append("Located node %s in MaaS, node commissioned. Skipping..." % (n))
+                            worked = True
+                        else:
+                            self.logger.warning("Located node %s in MaaS, unknown status %s. Skipping..." % (n, machine.status_name))
+                            result_detail['detail'].append("Located node %s in MaaS, unknown status %s. Skipping..." % (n, machine.status_name))
+                            failed = True
+                    else:
+                        self.logger.warning("Node %s not found in MaaS" % n)
+                        failed = True
+                        result_detail['detail'].append("Node %s not found in MaaS" % n)
+
+                except Exception as ex:
+                    failed = True
+                    result_detail['detail'].append("Error commissioning node %s: %s" % (n, str(ex)))
+
+            result = None
+            if worked and failed:
+                result = hd_fields.ActionResult.PartialSuccess
+            elif worked:
+                result = hd_fields.ActionResult.Success
+            elif failed:
+                result = hd_fields.ActionResult.Failure
+
+            self.orchestrator.task_field_update(self.task.get_id(),
+                                                status=hd_fields.TaskStatus.Complete,
+                                                result=result,
+                                                result_detail=result_detail)
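
The commissioning branch above polls machine.status_name on a fixed budget of 20 attempts at 60 seconds each, tolerating transient refresh failures. The same pattern extracted into a generic helper, as a sketch; the get_status/refresh callables are assumptions, not real MaaS client methods:

    import time

    def poll_until(get_status, refresh, target='Ready', max_attempts=20, interval=60):
        """Return True once target status is reached within the attempt budget."""
        for attempt in range(1, max_attempts + 1):
            time.sleep(interval)
            try:
                # A refresh may fail transiently; skip this pass and retry,
                # mirroring the warning-and-continue behavior in the driver.
                refresh()
            except Exception:
                continue
            if get_status() == target:
                return True
        return False

    # e.g. poll_until(lambda: machine.status_name, machine.refresh)
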
@@ -44,7 +44,7 @@ class ResourceBase(object):

         updated_fields = resp.json()

-        for f in self.json_fields:
+        for f in self.fields:
             if f in updated_fields.keys():
                 setattr(self, f, updated_fields.get(f))

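
This one-word fix matters for the Machine changes below: json_fields is only the writable subset serialized back to MaaS on update(), while fields is the full readable set, so refreshing over json_fields silently dropped read-only attributes such as the new status_name. A condensed sketch of the distinction; the class and field names are illustrative:

    class ResourceSketch:
        fields = ['resource_id', 'hostname', 'status_name']   # readable from the API
        json_fields = ['hostname']                            # writable subset sent on update

        def refresh_from(self, updated_fields):
            # Iterate the full readable set, as the corrected refresh() now does.
            for f in self.fields:
                if f in updated_fields:
                    setattr(self, f, updated_fields[f])

    r = ResourceSketch()
    r.refresh_from({'resource_id': 'abc123', 'status_name': 'Ready'})
    assert r.status_name == 'Ready'  # would be lost if iterating json_fields
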
@@ -21,7 +21,7 @@ class Machine(model_base.ResourceBase):

     resource_url = 'machines/{resource_id}/'
     fields = ['resource_id', 'hostname', 'power_type', 'power_state', 'power_parameters', 'interfaces',
-              'boot_interface', 'memory', 'cpu_count', 'tag_names']
+              'boot_interface', 'memory', 'cpu_count', 'tag_names', 'status_name']
     json_fields = ['hostname', 'power_type']

     def __init__(self, api_client, **kwargs):
@@ -31,6 +31,8 @@ class Machine(model_base.ResourceBase):
         if getattr(self, 'resource_id', None) is not None:
             self.interfaces = maas_interface.Interfaces(api_client, system_id=self.resource_id)
             self.interfaces.refresh()
+        else:
+            self.interfaces = None

     def get_power_params(self):
         url = self.interpolate_url()
@@ -54,6 +56,11 @@ class Machine(model_base.ResourceBase):
         if not resp.ok:
             raise Exception()

+    def get_network_interface(self, iface_name):
+        if self.interfaces is not None:
+            iface = self.interfaces.singleton({'name': iface_name})
+            return iface
+
     def get_details(self):
         url = self.interpolate_url()

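
A usage sketch for the new Machine surface area, assuming machine is a constructed Machine with a refreshed interfaces collection; the mac_address attribute and iface_name default are assumptions for illustration:

    def primary_iface_mac(machine, iface_name='eno1'):
        # get_network_interface() returns None when the machine has no
        # interfaces collection, and singleton() is assumed to return None
        # when no interface matches the name filter.
        iface = machine.get_network_interface(iface_name)
        if iface is None:
            return None
        return getattr(iface, 'mac_address', None)
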
@@ -142,6 +149,7 @@ class Machines(model_base.ResourceCollectionBase):
                 maas_node.hostname = node_model.name
                 maas_node.update()
                 self.logger.debug("Updated MaaS resource %s hostname to %s" % (maas_node.resource_id, node_model.name))
+
                 return maas_node

             except ValueError as ve:
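
The driver's new ConfigureHardware path (above) calls identify_baremetal_node(node, update_name=False), so the MaaS resource is looked up without the hostname rewrite that identification performs. A call-pattern sketch; the rename-by-default assumption about the signature is not confirmed by this diff:

    def locate_machine(machine_list, node, for_commissioning=False):
        # During IdentifyNode the driver lets the MaaS resource be renamed to
        # the design hostname; during ConfigureHardware it only looks it up.
        return machine_list.identify_baremetal_node(node,
                                                    update_name=not for_commissioning)
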
@@ -314,11 +314,35 @@ class Orchestrator(object):
                 elif node_identify_task.get_result() in [hd_fields.ActionResult.PartialSuccess,
                                                          hd_fields.ActionResult.Failure]:
                     # TODO This threshold should be a configurable default and tunable by task API
-                    if node_identify_attempts > 2:
+                    if node_identify_attempts > 10:
                         failed = True
                         break

-                    time.sleep(5 * 60)
+                    time.sleep(1 * 60)
+
+            # We can only commission nodes that were successfully identified in the provisioner
+            if len(node_identify_task.result_detail['successful_nodes']) > 0:
+                self.logger.info("Found %s successfully identified nodes, starting commissioning." %
+                                 (len(node_identify_task.result_detail['successful_nodes'])))
+                node_commission_task = self.create_task(tasks.DriverTask,
+                                                        parent_task_id=task.get_id(), design_id=design_id,
+                                                        action=hd_fields.OrchestratorAction.ConfigureHardware,
+                                                        task_scope={'site': task_site,
+                                                                    'node_names': node_identify_task.result_detail['successful_nodes']})
+
+                self.logger.info("Starting node driver task %s to commission nodes." % (node_commission_task.get_id()))
+                node_driver.execute_task(node_commission_task.get_id())
+
+                node_commission_task = self.state_manager.get_task(node_commission_task.get_id())
+
+                if node_commission_task.get_result() in [hd_fields.ActionResult.Success,
+                                                         hd_fields.ActionResult.PartialSuccess]:
+                    worked = True
+                if node_commission_task.get_result() in [hd_fields.ActionResult.Failure,
+                                                         hd_fields.ActionResult.PartialSuccess]:
+                    failed = True
+            else:
+                self.logger.warning("No nodes successfully identified, skipping commissioning subtask")
+
             final_result = None
             if worked and failed:
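
This hunk is where the stages chain together: the commissioning task's scope is exactly the node set the identification task reported in result_detail['successful_nodes']. A standalone sketch of that chaining rule; plain dicts stand in for DriverTask objects and the function name is illustrative, not the orchestrator API:

    def next_stage_scope(site, identify_result_detail):
        # Only nodes the previous stage reported successful are carried forward.
        successful = identify_result_detail.get('successful_nodes', [])
        if not successful:
            return None  # the orchestrator skips commissioning entirely
        return {'site': site, 'node_names': successful}

    scope = next_stage_scope('site1', {'successful_nodes': ['n1', 'n3'],
                                       'failed_nodes': ['n2']})
    assert scope == {'site': 'site1', 'node_names': ['n1', 'n3']}
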
@@ -328,6 +352,7 @@ class Orchestrator(object):
             else:
                 final_result = hd_fields.ActionResult.Failure

+
             self.task_field_update(task_id,
                                    status=hd_fields.TaskStatus.Complete,
                                    result=final_result)
Loading…
x
Reference in New Issue
Block a user