Extend install_bootloader command timeout
Adds a new command_timeout_factor argument, with a default value of 1, to the agent client's _command method. The install_bootloader command now passes a factor of 2, which doubles the command timeout so that the agent has time to complete its work before the conductor gives up on installing the bootloader and automatically retries. Change-Id: Iab72903c3d30b0ae5b80a5115fc635179869eeee Story: 2007483 Task: 39201
This commit is contained in:
parent
281634dc4e
commit
4829df2966
@ -104,7 +104,11 @@ opts = [
|
|||||||
'service.')),
|
'service.')),
|
||||||
cfg.IntOpt('command_timeout',
|
cfg.IntOpt('command_timeout',
|
||||||
default=60,
|
default=60,
|
||||||
help=_('Timeout (in seconds) for IPA commands.')),
|
help=_('Timeout (in seconds) for IPA commands. '
|
||||||
|
'Please note, the bootloader installation command '
|
||||||
|
'to the agent is permitted a timeout of twice the '
|
||||||
|
'value set here as these are IO heavy operations '
|
||||||
|
'depending on the configuration of the instance.')),
|
||||||
cfg.IntOpt('max_command_attempts',
|
cfg.IntOpt('max_command_attempts',
|
||||||
default=3,
|
default=3,
|
||||||
help=_('This is the maximum number of attempts that will be '
|
help=_('This is the maximum number of attempts that will be '
|
||||||
|
@ -61,7 +61,8 @@ class AgentClient(object):
|
|||||||
retry_on_exception=(
|
retry_on_exception=(
|
||||||
lambda e: isinstance(e, exception.AgentConnectionFailed)),
|
lambda e: isinstance(e, exception.AgentConnectionFailed)),
|
||||||
stop_max_attempt_number=CONF.agent.max_command_attempts)
|
stop_max_attempt_number=CONF.agent.max_command_attempts)
|
||||||
def _command(self, node, method, params, wait=False):
|
def _command(self, node, method, params, wait=False,
|
||||||
|
command_timeout_factor=1):
|
||||||
"""Sends command to agent.
|
"""Sends command to agent.
|
||||||
|
|
||||||
:param node: A Node object.
|
:param node: A Node object.
|
||||||
@ -71,6 +72,13 @@ class AgentClient(object):
|
|||||||
body.
|
body.
|
||||||
:param wait: True to wait for the command to finish executing, False
|
:param wait: True to wait for the command to finish executing, False
|
||||||
otherwise.
|
otherwise.
|
||||||
|
:param command_timeout_factor: An integer, default 1, by which to
|
||||||
|
multiply the [agent]command_timeout
|
||||||
|
value. This is intended for use with
|
||||||
|
extremely long running commands to
|
||||||
|
the agent ramdisk where a general
|
||||||
|
timeout value should not be extended
|
||||||
|
in all cases.
|
||||||
:raises: IronicException when failed to issue the request or there was
|
:raises: IronicException when failed to issue the request or there was
|
||||||
a malformed response from the agent.
|
a malformed response from the agent.
|
||||||
:raises: AgentAPIError when agent failed to execute specified command.
|
:raises: AgentAPIError when agent failed to execute specified command.
|
||||||
@ -89,8 +97,9 @@ class AgentClient(object):
|
|||||||
{'node': node.uuid, 'method': method})
|
{'node': node.uuid, 'method': method})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = self.session.post(url, params=request_params, data=body,
|
response = self.session.post(
|
||||||
timeout=CONF.agent.command_timeout)
|
url, params=request_params, data=body,
|
||||||
|
timeout=CONF.agent.command_timeout * command_timeout_factor)
|
||||||
except (requests.ConnectionError, requests.Timeout) as e:
|
except (requests.ConnectionError, requests.Timeout) as e:
|
||||||
msg = (_('Failed to connect to the agent running on node %(node)s '
|
msg = (_('Failed to connect to the agent running on node %(node)s '
|
||||||
'for invoking command %(method)s. Error: %(error)s') %
|
'for invoking command %(method)s. Error: %(error)s') %
|
||||||
@ -271,10 +280,20 @@ class AgentClient(object):
|
|||||||
params = {'root_uuid': root_uuid,
|
params = {'root_uuid': root_uuid,
|
||||||
'efi_system_part_uuid': efi_system_part_uuid,
|
'efi_system_part_uuid': efi_system_part_uuid,
|
||||||
'prep_boot_part_uuid': prep_boot_part_uuid}
|
'prep_boot_part_uuid': prep_boot_part_uuid}
|
||||||
|
|
||||||
|
# NOTE(TheJulia): This command explicitly sends a larger timeout
|
||||||
|
# factor to the _command call such that the agent ramdisk has enough
|
||||||
|
# time to perform its work.
|
||||||
|
# TODO(TheJulia): We should likely split install_bootloader into many
|
||||||
|
# commands at some point, even though that would not be backwards
|
||||||
|
# compatible. We could at least begin to delineate the commands apart
|
||||||
|
# over the next cycle or two so we don't need a command timeout
|
||||||
|
# extension factor.
|
||||||
return self._command(node=node,
|
return self._command(node=node,
|
||||||
method='image.install_bootloader',
|
method='image.install_bootloader',
|
||||||
params=params,
|
params=params,
|
||||||
wait=True)
|
wait=True,
|
||||||
|
command_timeout_factor=2)
|
||||||
|
|
||||||
@METRICS.timer('AgentClient.get_clean_steps')
|
@METRICS.timer('AgentClient.get_clean_steps')
|
||||||
def get_clean_steps(self, node, ports):
|
def get_clean_steps(self, node, ports):
|
||||||
|
@ -282,7 +282,8 @@ class TestAgentClient(base.TestCase):
|
|||||||
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
|
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
|
||||||
prep_boot_part_uuid=prep_boot_part_uuid)
|
prep_boot_part_uuid=prep_boot_part_uuid)
|
||||||
self.client._command.assert_called_once_with(
|
self.client._command.assert_called_once_with(
|
||||||
node=self.node, method='image.install_bootloader', params=params,
|
command_timeout_factor=2, node=self.node,
|
||||||
|
method='image.install_bootloader', params=params,
|
||||||
wait=True)
|
wait=True)
|
||||||
|
|
||||||
def test_install_bootloader(self):
|
def test_install_bootloader(self):
|
||||||
|
@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes an agent command issue in the bootloader installation process that
|
||||||
|
can present itself as a connection timeout under heavy IO load conditions.
|
||||||
|
The bootloader installation command now has an internal timeout which is double the
|
||||||
|
conductor-wide ``[agent]command_timeout``. For more information, see
|
||||||
|
bug `2007483 <https://storyboard.openstack.org/#!/story/2007483>`_.
|
Loading…
Reference in New Issue
Block a user