From 578f01678c6e1b919bf1b0b48868a1c3204ca04f Mon Sep 17 00:00:00 2001 From: Ruby Loo Date: Tue, 11 Jul 2017 12:43:08 -0400 Subject: [PATCH] Follow-up to fix for power action failure This is a follow-up patch to the patch so that the power status is not retried if a power action fails: ee5d4942a1c33736ffe05ec01619142be400c2f4 It addresses the comments as well as adds more clarification and updates the documentation to refer to the new [ipmi]command_retry_timeout config option. Change-Id: Ib21544da260565ae399e2d07b32af9bd8b810280 Related-Bug: #1692895 --- doc/source/install/configure-ipmi.rst | 4 +-- etc/ironic/ironic.conf.sample | 31 ++++++++++--------- ironic/conductor/utils.py | 9 ++++++ ironic/conf/conductor.py | 5 +-- ironic/conf/ipmi.py | 22 +++++++------ ironic/drivers/modules/ipmitool.py | 4 +-- .../fix-bug-1675529-479357c217819420.yaml | 25 +++++++++------ 7 files changed, 61 insertions(+), 39 deletions(-) diff --git a/doc/source/install/configure-ipmi.rst b/doc/source/install/configure-ipmi.rst index 0265650804..ea7c3e946a 100644 --- a/doc/source/install/configure-ipmi.rst +++ b/doc/source/install/configure-ipmi.rst @@ -48,8 +48,8 @@ IPMI configuration ~~~~~~~~~~~~~~~~~~ If there are slow or unresponsive BMCs in the environment, the -``retry_timeout`` configuration option in the ``[ipmi]`` section may need -to be lowered. The default is fairly conservative, as setting this timeout +``command_retry_timeout`` configuration option in the ``[ipmi]`` section may +need to be lowered. The default is fairly conservative, as setting this timeout too low can cause older BMCs to crash and require a hard-reset. Collecting sensor data diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index dd31ebdadb..5676c04d51 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -1140,9 +1140,10 @@ # Minimum value: 1 #soft_power_off_timeout = 600 -# Number of seconds to wait for power operations to complete -# on the baremetal node before declaring the power operation -# has failed (integer value) +# Number of seconds to wait for power operations to complete, +# i.e., so that a baremetal node is in the desired power +# state. If timed out, the power operation is considered a +# failure. (integer value) # Minimum value: 2 #power_state_change_timeout = 30 @@ -1900,13 +1901,14 @@ # From ironic # -# Maximum time in seconds to retry, retryable IPMI operations. -# For example if the requested action fails because the BMC is -# busy. There is a tradeoff when setting this value. Setting -# this too low may cause older BMCs to crash and require a -# hard reset. However, setting too high can cause the sync -# power state periodic task to hang when there are slow or -# unresponsive BMCs. (integer value) +# Maximum time in seconds to retry retryable IPMI operations. +# (An operation is retryable, for example, if the requested +# operation fails because the BMC is busy.) There is a +# tradeoff when setting this value. Setting this too low may +# cause older BMCs to crash and require a hard reset. However, +# setting too high can cause the sync power state periodic +# task to hang when there are slow or unresponsive BMCs. +# (integer value) #command_retry_timeout = 60 # DEPRECATED: Maximum time in seconds to retry IPMI @@ -1917,10 +1919,11 @@ # slow or unresponsive BMCs. (integer value) # This option is deprecated for removal. # Its value may be silently ignored in the future. -# Reason: Option ipmi.command_retry_timeout should be used to -# define IPMI command retries and option -# conductor.power_state_change_timeout should be use to define -# timeout value for waiting for power operations to complete +# Reason: Use option [ipmi]/command_retry_timeout to specify +# the timeout value for IPMI command retries, and use option +# [conductor]/power_state_change_timeout to specify the +# timeout value for waiting for a power operation to complete +# so that a baremetal node reaches the desired power state. #retry_timeout = # Minimum time, in seconds, between IPMI operations sent to a diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 3ab85628dc..48b11bbfc8 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -70,6 +70,15 @@ def node_set_boot_device(task, device, persistent=False): def node_wait_for_power_state(task, new_state, timeout=None): + """Wait for node to be in new power state. + + :param task: a TaskManager instance. + :param new_state: the desired new power state, one of the power states + in :mod:`ironic.common.states`. + :param timeout: number of seconds to wait before giving up. If not + specified, uses the conductor.power_state_change_timeout config value. + :raises: PowerStateFailure if timed out + """ retry_timeout = (timeout or CONF.conductor.power_state_change_timeout) def _wait(): diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 36ab5a6531..e8d2d73b20 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -148,8 +148,9 @@ opts = [ cfg.IntOpt('power_state_change_timeout', min=2, default=30, help=_('Number of seconds to wait for power operations to ' - 'complete on the baremetal node before declaring the ' - 'power operation has failed')), + 'complete, i.e., so that a baremetal node is in the ' + 'desired power state. If timed out, the power operation ' + 'is considered a failure.')), ] diff --git a/ironic/conf/ipmi.py b/ironic/conf/ipmi.py index e79daf5bd5..c62b71dbba 100644 --- a/ironic/conf/ipmi.py +++ b/ironic/conf/ipmi.py @@ -22,9 +22,10 @@ from ironic.common.i18n import _ opts = [ cfg.IntOpt('command_retry_timeout', default=60, - help=_('Maximum time in seconds to retry, retryable IPMI ' - 'operations. For example if the requested action fails ' - 'because the BMC is busy. There is a tradeoff when ' + help=_('Maximum time in seconds to retry retryable IPMI ' + 'operations. (An operation is retryable, for ' + 'example, if the requested operation fails ' + 'because the BMC is busy.) There is a tradeoff when ' 'setting this value. Setting this too low may cause ' 'older BMCs to crash and require a hard reset. However, ' 'setting too high can cause the sync power state ' @@ -38,13 +39,14 @@ opts = [ 'sync power state periodic task to hang when there are ' 'slow or unresponsive BMCs.'), deprecated_for_removal=True, - deprecated_reason=_('Option ipmi.command_retry_timeout should ' - 'be used to define IPMI command retries ' - 'and option ' - 'conductor.power_state_change_timeout ' - 'should be use to define timeout value for ' - 'waiting for power operations to ' - 'complete')), + deprecated_reason=_('Use option [ipmi]/command_retry_timeout ' + 'to specify the timeout value for ' + 'IPMI command retries, and use option ' + '[conductor]/power_state_change_timeout to ' + 'specify the timeout value for waiting for ' + 'a power operation to complete so that a ' + 'baremetal node reaches the desired ' + 'power state.')), cfg.IntOpt('min_command_interval', default=5, help=_('Minimum time, in seconds, between IPMI operations ' diff --git a/ironic/drivers/modules/ipmitool.py b/ironic/drivers/modules/ipmitool.py index a630d8c3e5..18c2e281bd 100644 --- a/ironic/drivers/modules/ipmitool.py +++ b/ironic/drivers/modules/ipmitool.py @@ -396,7 +396,7 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None): args.append(option) args.append(driver_info[name]) - # TODO(sambetts) Remove useage of ipmi.retry_timeout in Queens + # TODO(sambetts) Remove usage of ipmi.retry_timeout in Queens timeout = CONF.ipmi.retry_timeout or CONF.ipmi.command_retry_timeout # specify retry timing more precisely, if supported @@ -473,7 +473,7 @@ def _set_and_wait(task, power_action, driver_info, timeout=None): :returns: one of ironic.common.states """ - # TODO(sambetts) Remove useage of ipmi.retry_timeout in Queens + # TODO(sambetts) Remove usage of ipmi.retry_timeout in Queens default_timeout = CONF.ipmi.retry_timeout if power_action == states.POWER_ON: diff --git a/releasenotes/notes/fix-bug-1675529-479357c217819420.yaml b/releasenotes/notes/fix-bug-1675529-479357c217819420.yaml index 6e56caa791..952ccd2155 100644 --- a/releasenotes/notes/fix-bug-1675529-479357c217819420.yaml +++ b/releasenotes/notes/fix-bug-1675529-479357c217819420.yaml @@ -1,14 +1,21 @@ --- deprecations: - | - Configuration option IPMI.retry_timeout is deprecated in favor of new - options IPMI.command_retry_timeout, and - CONDUCTOR.power_state_change_timeout + Configuration option ``[ipmi]/retry_timeout`` is deprecated in favor of + these new options: + + * ``[ipmi]/command_retry_timeout``: timeout value to wait for an IPMI + command to complete (be acknowledged by the baremetal node) + * ``[conductor]/power_state_change_timeout``: timeout value to wait for + a power operation to complete, so that the baremetal node is in the + desired new power state fixes: - | - Prevents the IPMI driver from needlessly checking status if the power - change action fails. Additionally stop retrying power actions and power - status polls if we receive a non-retryable error from ipmitool. - https//bugs.launchpad.net/ironic/+bug/1675529. New configuration option - `power_state_change_timeout` added to define how many seconds to wait for a - server to change power state when a power action is requested. + Prevents the IPMI driver from needlessly checking status of the baremetal + node if a power change action fails. Additionally, stops retrying power + actions and power status polls on receipt of a non-retryable error from + ipmitool. For more information, see + https//bugs.launchpad.net/ironic/+bug/1675529. A new configuration option + ``[conductor]/power_state_change_timeout`` can be used to specify how many + seconds to wait for a baremetal node to change power state when a power + action is requested.