Merge "Added code to support Destroy Node"
This commit is contained in:
commit
a67efe071e
@ -46,6 +46,7 @@ class DrydockConfig(object):
|
|||||||
options = [
|
options = [
|
||||||
cfg.IntOpt(
|
cfg.IntOpt(
|
||||||
'poll_interval',
|
'poll_interval',
|
||||||
|
min=1,
|
||||||
default=10,
|
default=10,
|
||||||
help=
|
help=
|
||||||
'Polling interval in seconds for checking subtask or downstream status'
|
'Polling interval in seconds for checking subtask or downstream status'
|
||||||
@ -196,6 +197,11 @@ class DrydockConfig(object):
|
|||||||
help=
|
help=
|
||||||
'Timeout in minutes between deployment completion and the all boot actions reporting status'
|
'Timeout in minutes between deployment completion and the all boot actions reporting status'
|
||||||
),
|
),
|
||||||
|
cfg.IntOpt(
|
||||||
|
'destroy_node',
|
||||||
|
default=30,
|
||||||
|
help='Timeout in minutes for releasing a node',
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -217,9 +217,168 @@ class InterrogateNode(BaseMaasAction):
|
|||||||
class DestroyNode(BaseMaasAction):
    """Action to remove node from MaaS in preparation for redeploy."""

    # Node statuses from which the MaaS server allows releasing a node,
    # based on the MaaS API reference. The disk of the released machine is
    # erased, and the machine will end up in "Ready" state in MaaS after
    # release.
    actionable_node_statuses = (
        "Allocated",
        "Deployed",
        "Deploying",
        "Failed deployment",
        "Releasing failed",
        "Failed disk erasing",
    )

    def start(self):
        """Release (when MaaS allows it) and delete every node in scope.

        Destroy Node erases the storage, releases the BM node in MaaS, and
        finally deletes the BM node as a resource from the MaaS database.
        After successful completion of this action, the destroyed nodes are
        removed from the MaaS list of resources and will be unknown to MaaS.
        These nodes have to go through the enlistment process and be
        detected by MaaS as new nodes.

        Nodes whose status is listed in ``actionable_node_statuses`` are
        released with a quick disk erase and polled until they report a
        ``Ready``/``Failed*`` status or the ``destroy_node`` timeout is
        exhausted. Nodes in any other status only get their storage
        configuration reset. In both cases the machine is then deleted.

        Per-node outcomes are recorded on ``self.task``; the task is always
        left in ``Complete`` status and saved before returning.

        :return: None
        """
        try:
            machine_list = maas_machine.Machines(self.maas_client)
            machine_list.refresh()
        except Exception as ex:
            self.logger.warning("Error accessing the MaaS API.", exc_info=ex)
            self.task.set_status(hd_fields.TaskStatus.Complete)
            self.task.failure()
            self.task.add_status_msg(
                msg='Error accessing MaaS Machines API: {}'.format(str(ex)),
                error=True,
                ctx='NA',
                ctx_type='NA')
            self.task.save()
            return

        self.task.set_status(hd_fields.TaskStatus.Running)
        self.task.save()

        try:
            site_design = self._load_site_design()
        except errors.OrchestratorError:
            self.task.add_status_msg(
                msg="Error loading site design.",
                error=True,
                ctx='NA',
                ctx_type='NA')
            self.task.set_status(hd_fields.TaskStatus.Complete)
            self.task.failure()
            self.task.save()
            return

        nodes = self.orchestrator.process_node_filter(self.task.node_filter,
                                                      site_design)
        for n in nodes:
            try:
                machine = machine_list.identify_baremetal_node(
                    n, update_name=False)

                if machine is None:
                    # Nothing to destroy in MaaS; this is reported as success.
                    msg = "Could not locate machine for node {}".format(n.name)
                    self.logger.info(msg)
                    self.task.add_status_msg(
                        msg=msg, error=False, ctx=n.name, ctx_type='node')
                    self.task.success(focus=n.get_id())
                    continue

                # First release the node and erase its disks, if MaaS API allows
                if machine.status_name in self.actionable_node_statuses:
                    msg = "Releasing node {}, and erasing storage.".format(
                        n.name)
                    self.logger.info(msg)

                    try:
                        machine.release(erase_disk=True, quick_erase=True)
                    except errors.DriverError:
                        msg = "Error Releasing node {}, skipping".format(n.name)
                        self.logger.warning(msg)
                        self.task.add_status_msg(
                            msg=msg, error=True, ctx=n.name, ctx_type='node')
                        self.task.failure(focus=n.get_id())
                        continue

                    # Release with disk erase takes a while; monitor it.
                    self._wait_for_release(machine, n.name)

                    if machine.status_name.startswith('Ready'):
                        msg = "Node {} released and disk erased.".format(
                            n.name)
                        self.logger.info(msg)
                        self.task.add_status_msg(
                            msg=msg, error=False, ctx=n.name, ctx_type='node')
                        self.task.success(focus=n.get_id())
                    else:
                        # NOTE(review): this branch is also taken when the
                        # status starts with 'Failed', and the node is still
                        # deleted and marked successful below - confirm that
                        # recording both failure and success is intended.
                        msg = "Node {} release timed out".format(n.name)
                        self.logger.warning(msg)
                        self.task.add_status_msg(
                            msg=msg, error=True, ctx=n.name, ctx_type='node')
                        self.task.failure(focus=n.get_id())
                else:
                    # Node is in a state that cannot be released from MaaS API.
                    # Reset the storage instead
                    msg = "Destroy node {} in status: {}, resetting storage.".format(
                        n.name, machine.status_name)
                    self.logger.info(msg)
                    machine.reset_storage_config()
                    self.task.add_status_msg(
                        msg=msg, error=False, ctx=n.name, ctx_type='node')

                # For both cases above delete the node to force
                # re-commissioning. But, before deleting the node, reset its
                # power type in MaaS if the node power type should be virsh.
                try:
                    if n.oob_type == 'libvirt':
                        self.logger.info(
                            'Resetting MaaS virsh power parameters for node {}.'.format(
                                n.name))
                        machine.reset_power_parameters()
                except AttributeError:
                    # Best effort: the power reset must not block the delete,
                    # but leave a trace instead of swallowing it silently.
                    self.logger.debug(
                        "Unable to reset MaaS power parameters for node %s, "
                        "continuing with delete.",
                        n.name,
                        exc_info=True)

                machine.delete()
                msg = "Deleted Node: {} in status: {}.".format(
                    n.name, machine.status_name)
                self.logger.info(msg)
                self.task.add_status_msg(
                    msg=msg, error=False, ctx=n.name, ctx_type='node')
                self.task.success(focus=n.get_id())

            except errors.DriverError as dex:
                msg = "Driver error, while destroying node {}, skipping".format(
                    n.name)
                self.logger.warning(msg, exc_info=dex)
                self.task.add_status_msg(
                    msg=msg, error=True, ctx=n.name, ctx_type='node')
                self.task.failure(focus=n.get_id())
                continue

        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.save()
        return

    def _wait_for_release(self, machine, node_name):
        """Poll ``machine`` until it is Ready/Failed or the timeout expires.

        The number of attempts is the ``destroy_node`` timeout (minutes)
        converted to seconds and divided by the MaaS driver poll interval.
        Errors raised while refreshing the machine are logged and the poll is
        retried on the next attempt.

        :param machine: the MaaS machine being released
        :param node_name: node name used in log messages
        """
        poll_interval = config.config_mgr.conf.maasdriver.poll_interval
        max_attempts = (config.config_mgr.conf.timeouts.destroy_node *
                        60) // poll_interval
        attempts = 0

        while (attempts < max_attempts
               and not machine.status_name.startswith(('Ready', 'Failed'))):
            attempts += 1
            time.sleep(poll_interval)
            try:
                machine.refresh()
                self.logger.debug(
                    "Polling node {} status attempt {:d} of {:d}: {}".format(
                        node_name, attempts, max_attempts,
                        machine.status_name))
            except Exception:
                self.logger.warning(
                    "Error updating node {} status during release node, will re-attempt.".format(
                        node_name))
|
||||||
|
|
||||||
@ -970,9 +1129,8 @@ class ConfigureHardware(BaseMaasAction):
|
|||||||
|
|
||||||
# Poll machine status
|
# Poll machine status
|
||||||
attempts = 0
|
attempts = 0
|
||||||
max_attempts = config.config_mgr.conf.timeouts.configure_hardware * (
|
max_attempts = (config.config_mgr.conf.timeouts.configure_hardware
|
||||||
60 //
|
* 60) // config.config_mgr.conf.maasdriver.poll_interval
|
||||||
config.config_mgr.conf.maasdriver.poll_interval)
|
|
||||||
|
|
||||||
while (attempts < max_attempts and
|
while (attempts < max_attempts and
|
||||||
(machine.status_name != 'Ready' and
|
(machine.status_name != 'Ready' and
|
||||||
@ -2139,8 +2297,8 @@ class DeployNode(BaseMaasAction):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
max_attempts = config.config_mgr.conf.timeouts.deploy_node * (
|
max_attempts = (config.config_mgr.conf.timeouts.deploy_node
|
||||||
60 // config.config_mgr.conf.maasdriver.poll_interval)
|
* 60) // config.config_mgr.conf.maasdriver.poll_interval
|
||||||
|
|
||||||
while (attempts < max_attempts
|
while (attempts < max_attempts
|
||||||
and (not machine.status_name.startswith('Deployed')
|
and (not machine.status_name.startswith('Deployed')
|
||||||
|
@ -207,14 +207,39 @@ class Machine(model_base.ResourceBase):
|
|||||||
self.logger.error(msg)
|
self.logger.error(msg)
|
||||||
raise errors.DriverError(msg)
|
raise errors.DriverError(msg)
|
||||||
|
|
||||||
def release(self, erase_disk=False):
|
def release(self, erase_disk=False, secure_erase=False, quick_erase=False):
|
||||||
"""Release a node so it can be redeployed.
|
"""Release a node so it can be redeployed.
|
||||||
|
Release is opposite of acquire/allocate. After a successful release, the node
|
||||||
|
will be in Ready state.
|
||||||
|
|
||||||
:param erase_disk: If true, the local disks on the machine will be quick wiped
|
:param erase_disk: If true, the local disks on the machine will be erased.
|
||||||
|
:param secure_erase: If erase_disk and secure_erase are set to True, and
|
||||||
|
quick_erase is not specified (default to False), MaaS
|
||||||
|
will try secure_erase first. If the drive does not
|
||||||
|
support secure erase, MaaS will overwrite the entire
|
||||||
|
drive with null bytes.
|
||||||
|
:param quick_erase: If erase_disk and quick_erase are true, 1MB at the
|
||||||
|
start and at the end of the drive will be erased to make
|
||||||
|
data recovery inconvenient.
|
||||||
|
If all three parameters are True and the drive supports
|
||||||
|
secure erase, secure_erase will have precedence.
|
||||||
|
If the all three parameters are true, but the disk drive
|
||||||
|
does not support secure erase, MaaS will do quick erase.
|
||||||
|
But, if the disk drive supports neither secure nor
|
||||||
|
quick erase, the disk will be re-written with null bytes.
|
||||||
|
If erase_disk is true, but both secure_erase and quick_erase
|
||||||
|
are False (default), MaaS will overwrite the whole disk
|
||||||
|
with null bytes.
|
||||||
|
If erase_disk is false, MaaS will not erase the drive, before
|
||||||
|
releasing the node.
|
||||||
"""
|
"""
|
||||||
url = self.interpolate_url()
|
url = self.interpolate_url()
|
||||||
|
|
||||||
options = {'erase': erase_disk}
|
options = {
|
||||||
|
'erase': erase_disk,
|
||||||
|
'secure_erase': secure_erase,
|
||||||
|
'quick_erase': quick_erase,
|
||||||
|
}
|
||||||
|
|
||||||
resp = self.api_client.post(url, op='release', files=options)
|
resp = self.api_client.post(url, op='release', files=options)
|
||||||
|
|
||||||
@ -225,6 +250,26 @@ class Machine(model_base.ResourceBase):
|
|||||||
self.logger.debug("MaaS response: %s" % resp.text)
|
self.logger.debug("MaaS response: %s" % resp.text)
|
||||||
raise errors.DriverError(brief_msg)
|
raise errors.DriverError(brief_msg)
|
||||||
|
|
||||||
|
def delete(self):
    """Delete this machine from MaaS.

    The MaaS API machine delete call only removes the machine from the MaaS
    resource list; this method does not reset or erase the node storage.
    After deletion, the machine needs to be manually powered on to be
    re-enlisted in MaaS as a New node.

    :raises errors.DriverError: if MaaS answers the delete request with a
                                non-OK HTTP response
    """
    url = self.interpolate_url()
    resp = self.api_client.delete(url)

    if not resp.ok:
        brief_msg = ("Error deleting node, received HTTP %s from MaaS" %
                     resp.status_code)
        self.logger.error(brief_msg)
        # The full response body is only of interest when debugging.
        self.logger.debug("MaaS response: %s", resp.text)
        raise errors.DriverError(brief_msg)
|
||||||
|
|
||||||
def commission(self, debug=False):
|
def commission(self, debug=False):
|
||||||
"""Start the MaaS commissioning process.
|
"""Start the MaaS commissioning process.
|
||||||
|
|
||||||
@ -370,6 +415,29 @@ class Machine(model_base.ResourceBase):
|
|||||||
"Failed updating power parameters MAAS url %s - return code %s\n%s"
|
"Failed updating power parameters MAAS url %s - return code %s\n%s"
|
||||||
% (url, resp.status_code.resp.text))
|
% (url, resp.status_code.resp.text))
|
||||||
|
|
||||||
|
def reset_power_parameters(self):
    """Reset power type and parameters for this node to manual.

    This is done to address the MaaS API issue detecting multiple BMC NICs
    after a node delete.

    Only available after the node has been added to MaaS.

    :return: True when MaaS answers the update with HTTP 200
    :raises errors.DriverError: if MaaS answers with any other status code
    """
    url = self.interpolate_url()

    self.logger.debug("Resetting node power type for machine {}".format(
        self.resource_id))
    # The local attribute is updated before the request is sent, so it reads
    # 'manual' even if the PUT below fails.
    self.power_type = 'manual'
    power_params = {'power_type': 'manual'}
    resp = self.api_client.put(url, files=power_params)

    if resp.status_code == 200:
        return True

    # status_code and text are separate format arguments; the previous
    # `resp.status_code.resp.text` raised AttributeError instead of the
    # intended DriverError.
    raise errors.DriverError(
        "Failed updating power parameters MAAS url {} - return code {}\n{}".format(
            url, resp.status_code, resp.text))
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
"""Serialize this resource instance into a dict.
|
"""Serialize this resource instance into a dict.
|
||||||
|
|
||||||
|
@ -202,10 +202,66 @@ class DestroyNodes(BaseAction):
|
|||||||
|
|
||||||
def start(self):
    """Run the destroy-nodes action through the enabled node driver.

    Marks the task as running, then hands a DestroyNode subtask to the node
    driver and re-executes it for as long as the subtask asks to be retried
    (at most three attempts). The subtask results are bubbled up into this
    task, which is completed and saved before returning.

    The task ends early when no node driver is enabled (failure) or when the
    node filter selects no nodes (success).
    """
    self.task.set_status(hd_fields.TaskStatus.Running)
    self.task.save()

    node_driver = self._get_driver('node')

    # Without a node driver there is nothing that can carry out the work.
    if node_driver is None:
        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.add_status_msg(
            msg="No node driver enabled, ending task.",
            error=True,
            ctx=str(self.task.get_id()),
            ctx_type='task')
        self.task.result.set_message("No NodeDriver enabled.")
        self.task.result.set_reason("Bad Configuration.")
        self.task.failure()
        self.task.save()
        return

    target_nodes = self.orchestrator.get_target_nodes(self.task)

    # An empty selection is not an error; report it and finish successfully.
    if not target_nodes:
        self.task.add_status_msg(
            msg="No nodes in scope, no work to to do.",
            error=False,
            ctx='NA',
            ctx_type='NA')
        self.task.success()
        self.task.set_status(hd_fields.TaskStatus.Complete)
        self.task.save()
        return

    # The subtask is created once; each retry re-executes the same task.
    release_task = self.orchestrator.create_task(
        design_ref=self.task.design_ref,
        action=hd_fields.OrchestratorAction.DestroyNode,
        node_filter=self.task.node_filter)
    self.task.register_subtask(release_task)

    while True:
        release_task_id = release_task.get_id()
        self.logger.info(
            "Starting node driver task %s to Release nodes." %
            release_task_id)
        node_driver.execute_task(release_task_id)

        # Reload the subtask to pick up the state written by the driver.
        release_task = self.state_manager.get_task(release_task_id)

        try:
            needs_retry = release_task.retry_task(max_attempts=3)
        except errors.MaxRetriesReached:
            self.task.failure()
            break

        if not needs_retry:
            break

    self.task.set_status(hd_fields.TaskStatus.Complete)
    self.task.bubble_results(
        action_filter=hd_fields.OrchestratorAction.DestroyNode)
    self.task.align_result()
    self.task.save()
    return
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user