Merge "Added code to support Destroy Node"
This commit is contained in:
commit
a67efe071e
@ -46,6 +46,7 @@ class DrydockConfig(object):
|
||||
options = [
|
||||
cfg.IntOpt(
|
||||
'poll_interval',
|
||||
min=1,
|
||||
default=10,
|
||||
help=
|
||||
'Polling interval in seconds for checking subtask or downstream status'
|
||||
@ -196,6 +197,11 @@ class DrydockConfig(object):
|
||||
help=
|
||||
'Timeout in minutes between deployment completion and the all boot actions reporting status'
|
||||
),
|
||||
cfg.IntOpt(
|
||||
'destroy_node',
|
||||
default=30,
|
||||
help='Timeout in minutes for releasing a node',
|
||||
),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
|
@ -217,9 +217,168 @@ class InterrogateNode(BaseMaasAction):
|
||||
class DestroyNode(BaseMaasAction):
    """Action to remove node from MaaS in preparation for redeploy."""

    # Machine statuses from which this action asks MaaS to release a node
    # (chosen by the original author from the MaaS API reference). A release
    # with disk erase is expected to end with the machine in a "Ready*"
    # status. Machines in any other status get their storage configuration
    # reset instead of being released.
    actionable_node_statuses = (
        "Allocated",
        "Deployed",
        "Deploying",
        "Failed deployment",
        "Releasing failed",
        "Failed disk erasing",
    )

    def start(self):
        """Release (where possible) and then delete every node in scope.

        For each node selected by the task's node filter:

        * if no matching MaaS machine is found, the node is marked
          successful and skipped;
        * if the machine status is in ``actionable_node_statuses`` the
          machine is released with a quick disk erase and polled until it
          reports ``Ready*``/``Failed*`` or the ``destroy_node`` timeout
          elapses;
        * otherwise the machine's storage configuration is reset;
        * finally the machine is deleted from MaaS, so it is unknown to MaaS
          and has to be enlisted again as a new node.

        Failures are recorded per node on the task; the task itself is always
        set to Complete before returning.

        :return: None
        """
        try:
            machine_list = maas_machine.Machines(self.maas_client)
            machine_list.refresh()
        except Exception as ex:
            self.logger.warning("Error accessing the MaaS API.", exc_info=ex)
            self.task.set_status(hd_fields.TaskStatus.Complete)
            self.task.failure()
            self.task.add_status_msg(
                msg='Error accessing MaaS Machines API: {}'.format(str(ex)),
                error=True,
                ctx='NA',
                ctx_type='NA')
            self.task.save()
            return

        self.task.set_status(hd_fields.TaskStatus.Running)
        self.task.save()

        try:
            site_design = self._load_site_design()
        except errors.OrchestratorError:
            self.task.add_status_msg(
                msg="Error loading site design.",
                error=True,
                ctx='NA',
                ctx_type='NA')
            self.task.set_status(hd_fields.TaskStatus.Complete)
            self.task.failure()
            self.task.save()
            return

        nodes = self.orchestrator.process_node_filter(self.task.node_filter,
                                                      site_design)
        for n in nodes:
            try:
                machine = machine_list.identify_baremetal_node(
                    n, update_name=False)

                if machine is None:
                    msg = "Could not locate machine for node {}".format(n.name)
                    self.logger.info(msg)
                    self.task.add_status_msg(
                        msg=msg, error=False, ctx=n.name, ctx_type='node')
                    self.task.success(focus=n.get_id())
                    continue

                # First release the node and erase its disks, if MaaS API
                # allows it from the machine's current status.
                if machine.status_name in self.actionable_node_statuses:
                    if not self._release_node(n, machine):
                        # The release request itself was rejected; the
                        # failure is already recorded, so skip the delete.
                        continue
                else:
                    # Node is in a state that cannot be released from MaaS
                    # API. Reset the storage instead.
                    msg = "Destroy node {} in status: {}, resetting storage.".format(
                        n.name, machine.status_name)
                    self.logger.info(msg)
                    machine.reset_storage_config()
                    self.task.add_status_msg(
                        msg=msg, error=False, ctx=n.name, ctx_type='node')

                # For both cases above delete the node to force
                # re-commissioning. Before deleting, reset its power type in
                # MaaS if the node is powered through virsh.
                self._reset_virsh_power_parameters(n, machine)

                machine.delete()
                msg = "Deleted Node: {} in status: {}.".format(
                    n.name, machine.status_name)
                self.logger.info(msg)
                self.task.add_status_msg(
                    msg=msg, error=False, ctx=n.name, ctx_type='node')
                self.task.success(focus=n.get_id())

            except errors.DriverError as dex:
                msg = "Driver error, while destroying node {}, skipping".format(
                    n.name)
                self.logger.warning(msg, exc_info=dex)
                self.task.add_status_msg(
                    msg=msg, error=True, ctx=n.name, ctx_type='node')
                self.task.failure(focus=n.get_id())
                continue

        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.save()
        return

    def _release_node(self, node, machine):
        """Release ``machine`` with disk erase and wait for the outcome.

        The per-node success or failure of the release is recorded on the
        task here.

        :param node: design node being destroyed (provides ``name`` and
                     ``get_id()``)
        :param machine: MaaS machine model matching ``node``
        :return: False if the release request raised ``DriverError`` (the
                 caller should skip this node); True otherwise, including
                 when the release did not reach ``Ready`` in time.
        """
        msg = "Releasing node {}, and erasing storage.".format(node.name)
        self.logger.info(msg)

        try:
            machine.release(erase_disk=True, quick_erase=True)
        except errors.DriverError:
            msg = "Error Releasing node {}, skipping".format(node.name)
            self.logger.warning(msg)
            self.task.add_status_msg(
                msg=msg, error=True, ctx=node.name, ctx_type='node')
            self.task.failure(focus=node.get_id())
            return False

        # Node release with disk erase takes some time; poll until MaaS
        # reports a terminal status or the configured timeout is used up.
        poll_interval = config.config_mgr.conf.maasdriver.poll_interval
        max_attempts = (
            config.config_mgr.conf.timeouts.destroy_node * 60) // poll_interval
        attempts = 0

        while (attempts < max_attempts
               and not machine.status_name.startswith(('Ready', 'Failed'))):
            attempts = attempts + 1
            time.sleep(poll_interval)
            try:
                machine.refresh()
                self.logger.debug(
                    "Polling node {} status attempt {:d} of {:d}: {}".format(
                        node.name, attempts, max_attempts,
                        machine.status_name))
            except Exception:
                # A failed refresh is not fatal; the next poll retries it.
                self.logger.warning(
                    "Error updating node {} status during release node, will re-attempt.".format(node.name))

        if machine.status_name.startswith('Ready'):
            msg = "Node {} released and disk erased.".format(node.name)
            self.logger.info(msg)
            self.task.add_status_msg(
                msg=msg, error=False, ctx=node.name, ctx_type='node')
            self.task.success(focus=node.get_id())
        else:
            msg = "Node {} release timed out".format(node.name)
            self.logger.warning(msg)
            self.task.add_status_msg(
                msg=msg, error=True, ctx=node.name, ctx_type='node')
            self.task.failure(focus=node.get_id())

        return True

    def _reset_virsh_power_parameters(self, node, machine):
        """Reset MaaS power parameters for libvirt-managed nodes (best effort).

        :param node: design node; only nodes whose ``oob_type`` is
                     ``'libvirt'`` are touched
        :param machine: MaaS machine model matching ``node``
        """
        try:
            if node.oob_type == 'libvirt':
                self.logger.info(
                    'Resetting MaaS virsh power parameters for node {}.'.format(
                        node.name))
                # Resetting the power type removes the virsh power
                # parameters from MaaS before the machine is deleted.
                machine.reset_power_parameters()
        except AttributeError as attr_err:
            # Still best effort (the delete proceeds regardless), but leave a
            # trace instead of silently discarding the error.
            self.logger.debug(
                "Skipped resetting power parameters for node {}.".format(
                    node.name),
                exc_info=attr_err)
|
||||
|
||||
@ -970,9 +1129,8 @@ class ConfigureHardware(BaseMaasAction):
|
||||
|
||||
# Poll machine status
|
||||
attempts = 0
|
||||
max_attempts = config.config_mgr.conf.timeouts.configure_hardware * (
|
||||
60 //
|
||||
config.config_mgr.conf.maasdriver.poll_interval)
|
||||
max_attempts = (config.config_mgr.conf.timeouts.configure_hardware
|
||||
* 60) // config.config_mgr.conf.maasdriver.poll_interval
|
||||
|
||||
while (attempts < max_attempts and
|
||||
(machine.status_name != 'Ready' and
|
||||
@ -2139,8 +2297,8 @@ class DeployNode(BaseMaasAction):
|
||||
continue
|
||||
|
||||
attempts = 0
|
||||
max_attempts = config.config_mgr.conf.timeouts.deploy_node * (
|
||||
60 // config.config_mgr.conf.maasdriver.poll_interval)
|
||||
max_attempts = (config.config_mgr.conf.timeouts.deploy_node
|
||||
* 60) // config.config_mgr.conf.maasdriver.poll_interval
|
||||
|
||||
while (attempts < max_attempts
|
||||
and (not machine.status_name.startswith('Deployed')
|
||||
|
@ -207,14 +207,39 @@ class Machine(model_base.ResourceBase):
|
||||
self.logger.error(msg)
|
||||
raise errors.DriverError(msg)
|
||||
|
||||
def release(self, erase_disk=False):
|
||||
def release(self, erase_disk=False, secure_erase=False, quick_erase=False):
|
||||
"""Release a node so it can be redeployed.
|
||||
Release is opposite of acquire/allocate. After a successful release, the node
|
||||
will be in Ready state.
|
||||
|
||||
:param erase_disk: If true, the local disks on the machine will be quick wiped
|
||||
:param erase_disk: If true, the local disks on the machine will be erased.
|
||||
:param secure_erase: If erase_disk and secure_erase are set to True, and
|
||||
quick_erase is not specified (default to False), MaaS
|
||||
will try secure_erase first. If the drive does not
|
||||
support secure erase, MaaS will overwrite the entire
|
||||
drive with null bytes.
|
||||
:param quick_erase: If erase_disk and quick_erase are true, 1MB at the
|
||||
start and at the end of the drive will be erased to make
|
||||
data recovery inconvenient.
|
||||
If all three parameters are True and the drive supports
|
||||
secure erase, secure_erase will have precedence.
|
||||
If the all three parameters are true, but the disk drive
|
||||
does not support secure erase, MaaS will do quick erase.
|
||||
But, if the disk drive supports neither secure nor
|
||||
quick erase, the disk will be re-written with null bytes.
|
||||
If erase_disk is true, but both secure_erase and quick_erase
|
||||
are False (default), MAAS will overwrite the whole disk
|
||||
with null bytes.
|
||||
If erase_disk is false, MaaS will not erase the drive, before
|
||||
releasing the node.
|
||||
"""
|
||||
url = self.interpolate_url()
|
||||
|
||||
options = {'erase': erase_disk}
|
||||
options = {
|
||||
'erase': erase_disk,
|
||||
'secure_erase': secure_erase,
|
||||
'quick_erase': quick_erase,
|
||||
}
|
||||
|
||||
resp = self.api_client.post(url, op='release', files=options)
|
||||
|
||||
@ -225,6 +250,26 @@ class Machine(model_base.ResourceBase):
|
||||
self.logger.debug("MaaS response: %s" % resp.text)
|
||||
raise errors.DriverError(brief_msg)
|
||||
|
||||
def delete(self):
    """Delete this machine resource from MaaS.

    Sends a delete request to this resource's URL through the API client.
    According to the original author's notes, the MaaS delete call only
    removes the machine from the MaaS resource list; afterwards the machine
    has to be powered on manually to be re-enlisted in MaaS as a new node.

    :raises errors.DriverError: if MaaS does not answer with an OK response
    """
    resource_url = self.interpolate_url()
    response = self.api_client.delete(resource_url)

    # Nothing more to do when MaaS accepted the delete.
    if response.ok:
        return

    brief_msg = ("Error deleting node, received HTTP %s from MaaS" %
                 response.status_code)
    self.logger.error(brief_msg)
    self.logger.debug("MaaS response: %s" % response.text)
    raise errors.DriverError(brief_msg)
|
||||
|
||||
def commission(self, debug=False):
|
||||
"""Start the MaaS commissioning process.
|
||||
|
||||
@ -370,6 +415,29 @@ class Machine(model_base.ResourceBase):
|
||||
"Failed updating power parameters MAAS url %s - return code %s\n%s"
|
||||
% (url, resp.status_code.resp.text))
|
||||
|
||||
def reset_power_parameters(self):
    """Reset power type and parameters for this node to manual.

    This is done to address the MaaS api issue detecting multiple BMC NIC
    after a node delete.

    Only available after the node has been added to MAAS.

    :return: True when MaaS answers the update with HTTP 200
    :raises errors.DriverError: on any other response; the message carries
                                the URL, the status code and the response
                                body
    """
    url = self.interpolate_url()

    self.logger.debug("Resetting node power type for machine {}".format(
        self.resource_id))
    self.power_type = 'manual'
    power_params = {'power_type': 'manual'}
    resp = self.api_client.put(url, files=power_params)

    if resp.status_code == 200:
        return True

    # status_code and text are two separate format arguments; the previous
    # ``resp.status_code.resp.text`` raised AttributeError on the int status
    # code and so masked the intended DriverError.
    raise errors.DriverError(
        "Failed updating power parameters MAAS url {} - return code {}\n{}".format(
            url, resp.status_code, resp.text))
|
||||
|
||||
def to_dict(self):
|
||||
"""Serialize this resource instance into a dict.
|
||||
|
||||
|
@ -202,10 +202,66 @@ class DestroyNodes(BaseAction):
|
||||
|
||||
def start(self):
    """Start executing this action.

    Marks the task Running, then hands the node destruction to the enabled
    node driver through a ``DestroyNode`` subtask. The subtask is re-run
    while its ``retry_task(max_attempts=3)`` asks for a retry. When done,
    subtask results are bubbled up into this task and the task is set to
    Complete.

    The task ends early (set to Complete) when no node driver is enabled
    (recorded as a failure) or when no nodes are in scope (recorded as a
    success).

    :return: None
    """
    # The task must only be marked Running here. Marking it Complete and
    # failed before any work ran (the old placeholder behaviour) recorded a
    # failure on every run.
    self.task.set_status(hd_fields.TaskStatus.Running)
    self.task.save()

    node_driver = self._get_driver('node')

    if node_driver is None:
        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.add_status_msg(
            msg="No node driver enabled, ending task.",
            error=True,
            ctx=str(self.task.get_id()),
            ctx_type='task')
        self.task.result.set_message("No NodeDriver enabled.")
        self.task.result.set_reason("Bad Configuration.")
        self.task.failure()
        self.task.save()
        return

    target_nodes = self.orchestrator.get_target_nodes(self.task)

    if not target_nodes:
        self.task.add_status_msg(
            msg="No nodes in scope, no work to to do.",
            error=False,
            ctx='NA',
            ctx_type='NA')
        self.task.success()
        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.save()
        return

    node_release_task = None
    while True:
        # The subtask is created once and re-executed on each retry.
        if node_release_task is None:
            node_release_task = self.orchestrator.create_task(
                design_ref=self.task.design_ref,
                action=hd_fields.OrchestratorAction.DestroyNode,
                node_filter=self.task.node_filter)
            self.task.register_subtask(node_release_task)

        self.logger.info(
            "Starting node driver task %s to Release nodes." %
            (node_release_task.get_id()))
        node_driver.execute_task(node_release_task.get_id())

        # Reload the subtask so the retry decision sees the driver's result.
        node_release_task = self.state_manager.get_task(
            node_release_task.get_id())

        try:
            if not node_release_task.retry_task(max_attempts=3):
                break
        except errors.MaxRetriesReached:
            self.task.failure()
            break

    self.task.set_status(hd_fields.TaskStatus.Complete)
    self.task.bubble_results(
        action_filter=hd_fields.OrchestratorAction.DestroyNode)
    self.task.align_result()
    self.task.save()
    return
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user