Extend install_bootloader command timeout
Adds a new command_timeout_factor argument to the agent client _command method with a default value of 1, and sets the install_bootloader command to send a value of 2 to extend the timeout factor to give the agent time to complete its work before giving up on installing the bootloader and automatically retrying. Change-Id: Iab72903c3d30b0ae5b80a5115fc635179869eeee Story: 2007483 Task: 39201
This commit is contained in:
parent
281634dc4e
commit
4829df2966
@ -104,7 +104,11 @@ opts = [
|
||||
'service.')),
|
||||
cfg.IntOpt('command_timeout',
|
||||
default=60,
|
||||
help=_('Timeout (in seconds) for IPA commands.')),
|
||||
help=_('Timeout (in seconds) for IPA commands. '
|
||||
'Please note, the bootloader installation command '
|
||||
'to the agent is permitted a timeout of twice the '
|
||||
'value set here as these are IO heavy operations '
|
||||
'depending on the configuration of the instance.')),
|
||||
cfg.IntOpt('max_command_attempts',
|
||||
default=3,
|
||||
help=_('This is the maximum number of attempts that will be '
|
||||
|
@ -61,7 +61,8 @@ class AgentClient(object):
|
||||
retry_on_exception=(
|
||||
lambda e: isinstance(e, exception.AgentConnectionFailed)),
|
||||
stop_max_attempt_number=CONF.agent.max_command_attempts)
|
||||
def _command(self, node, method, params, wait=False):
|
||||
def _command(self, node, method, params, wait=False,
|
||||
command_timeout_factor=1):
|
||||
"""Sends command to agent.
|
||||
|
||||
:param node: A Node object.
|
||||
@ -71,6 +72,13 @@ class AgentClient(object):
|
||||
body.
|
||||
:param wait: True to wait for the command to finish executing, False
|
||||
otherwise.
|
||||
:param command_timeout_factor: An integer, default 1, by which to
|
||||
multiply the [agent]command_timeout
|
||||
value. This is intended for use with
|
||||
extremely long running commands to
|
||||
the agent ramdisk where a general
|
||||
timeout value should not be extended
|
||||
in all cases.
|
||||
:raises: IronicException when failed to issue the request or there was
|
||||
a malformed response from the agent.
|
||||
:raises: AgentAPIError when agent failed to execute specified command.
|
||||
@ -89,8 +97,9 @@ class AgentClient(object):
|
||||
{'node': node.uuid, 'method': method})
|
||||
|
||||
try:
|
||||
response = self.session.post(url, params=request_params, data=body,
|
||||
timeout=CONF.agent.command_timeout)
|
||||
response = self.session.post(
|
||||
url, params=request_params, data=body,
|
||||
timeout=CONF.agent.command_timeout * command_timeout_factor)
|
||||
except (requests.ConnectionError, requests.Timeout) as e:
|
||||
msg = (_('Failed to connect to the agent running on node %(node)s '
|
||||
'for invoking command %(method)s. Error: %(error)s') %
|
||||
@ -271,10 +280,20 @@ class AgentClient(object):
|
||||
params = {'root_uuid': root_uuid,
|
||||
'efi_system_part_uuid': efi_system_part_uuid,
|
||||
'prep_boot_part_uuid': prep_boot_part_uuid}
|
||||
|
||||
# NOTE(TheJulia): This command explicitly sends a larger timeout
|
||||
# factor to the _command call such that the agent ramdisk has enough
|
||||
# time to perform its work.
|
||||
# TODO(TheJulia): We should likely split install_bootloader into many
|
||||
# commands at some point, even though that would not be backwards
|
||||
# compatible. We could at least begin to delineate the commands apart
|
||||
# over the next cycle or two so we don't need a command timeout
|
||||
# extension factor.
|
||||
return self._command(node=node,
|
||||
method='image.install_bootloader',
|
||||
params=params,
|
||||
wait=True)
|
||||
wait=True,
|
||||
command_timeout_factor=2)
|
||||
|
||||
@METRICS.timer('AgentClient.get_clean_steps')
|
||||
def get_clean_steps(self, node, ports):
|
||||
|
@ -282,7 +282,8 @@ class TestAgentClient(base.TestCase):
|
||||
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
|
||||
prep_boot_part_uuid=prep_boot_part_uuid)
|
||||
self.client._command.assert_called_once_with(
|
||||
node=self.node, method='image.install_bootloader', params=params,
|
||||
command_timeout_factor=2, node=self.node,
|
||||
method='image.install_bootloader', params=params,
|
||||
wait=True)
|
||||
|
||||
def test_install_bootloader(self):
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes an agent command issue in the bootloader installation process that
|
||||
can present itself as a connection timeout under heavy IO load conditions.
|
||||
Now installation commands have an internal timeout which is double the
|
||||
conductor wide ``[agent]command_timeout``. For more information, see
|
||||
bug `2007483 <https://storyboard.openstack.org/#!/story/2007483>`_.
|
Loading…
Reference in New Issue
Block a user