Improve maintenance power/reset control command retry handling

This update improves on and drives consistency into the
maintenance power on/off and reset handling in terms of
retries and use of graceful and immediate commands.

This update maintains the 10 retries for both power-on
and power-off commands and increases the number of retries
for the reset command from 5 to 10 to line up with the
power operation commands.

This update also ensures that the first 5 retries are done
with the graceful action command while the last 5 are done
with the immediate action command.

This update also removes a power-on handling case that could
have led to a stuck state. This case was virtually impossible
to hit, given the required sequence of intermittent command
failures, but the scenario handling was fixed up anyway.

Issues have been seen with the power-off handling on some servers.
It is suspected that those servers need more time to power off, so
this update introduces a 30 second delay following a power-off
command before issuing the power status query. This gives the server
some time to power off before the power-off command is retried.

Test Plan: Both IPMI and Redfish

PASS: Verify power on/off and reset handling support up to 10 retries
PASS: Verify graceful command is used for the first power on/off
      or reset try and the first 5 retries
PASS: Verify immediate command is used for the final 5 retries
PASS: Verify reset handling with/without retries (none/mid/max)
PASS: Verify power-on  handling with/without retries (none/mid/max)
PASS: Verify power-off handling with/without retries (none/mid/max)
PASS: Verify power status command failure handling for power on/off
NOTE: FIT (fault insertion testing) was used to create retry scenarios

PASS: Verify power-off inter retry delay feature
PASS: Verify 30 second power-off to power query delay
PASS: Verify redfish power/reset commands used are logged by default
PASS: Verify power-off/on and reset logging

Regression:

PASS: Verify power-on/off and reset handling without retries
PASS: Verify power-off handling when power is already off
PASS: Verify power-on handling when power is already on

Closes-Bug: 2031945
Signed-off-by: Eric Macdonald <eric.macdonald@windriver.com>
Change-Id: Ie39326bcb205702df48ff9dd090f461c7110dd36
This commit is contained in:
Eric Macdonald 2023-09-18 18:48:56 +00:00 committed by Eric MacDonald
parent c5b611f510
commit 50dc29f6c0
7 changed files with 164 additions and 169 deletions

View File

@ -187,6 +187,7 @@ typedef enum
#define DEFAULT_GOENABLE_TIMEOUT (300) #define DEFAULT_GOENABLE_TIMEOUT (300)
#define DEFAULT_DOR_MODE_TIMEOUT (20) #define DEFAULT_DOR_MODE_TIMEOUT (20)
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) #define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
/** TODO: Convert names to omit JSON part */ /** TODO: Convert names to omit JSON part */
#define MTC_JSON_INV_LABEL "ihosts" #define MTC_JSON_INV_LABEL "ihosts"
@ -323,9 +324,14 @@ typedef enum
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */ #define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */
#define COMMAND_DELAY (2) /* from sshUtil.h */ #define COMMAND_DELAY (2) /* from sshUtil.h */
#define MTC_POWER_ACTION_RETRY_DELAY (20) /* Define Reset and Power Action retry controls ; delay, count and switch threshold */
#define MTC_POWER_ACTION_RETRY_COUNT (10) #define MTC_POWER_ACTION_QUERY_WAIT (30)
#define MTC_RESET_ACTION_RETRY_COUNT (5) #define MTC_POWER_ACTION_RETRY_DELAY (20)
#define MTC_POWER_ACTION_RETRY_COUNT (10)
#define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2)
#define MTC_RESET_ACTION_RETRY_DELAY (20)
#define MTC_RESET_ACTION_RETRY_COUNT (10)
#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2)
/* number of calls to the bmc_handler while bm_access is not confirmed */ /* number of calls to the bmc_handler while bm_access is not confirmed */
#define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5) #define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5)

View File

@ -2,10 +2,10 @@
#define __INCLUDE_NODETIMERS_HH__ #define __INCLUDE_NODETIMERS_HH__
/* /*
* Copyright (c) 2013-2016 Wind River Systems, Inc. * Copyright (c) 2013-2023 Wind River Systems, Inc.
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
* *
*/ */
/** /**
@ -94,6 +94,9 @@
#define MTC_AGENT_TIMEOUT_EXTENSION (5) #define MTC_AGENT_TIMEOUT_EXTENSION (5)
#define MTC_LOCK_CEPH_DELAY (90) #define MTC_LOCK_CEPH_DELAY (90)
#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT)
#define MTC_RECV_WAIT (MTC_RETRY_WAIT)
/** Host must stay enabled for this long for the /** Host must stay enabled for this long for the
* failed_recovery_counter to get cleared */ * failed_recovery_counter to get cleared */
#define MTC_ENABLED_TIMER (5) #define MTC_ENABLED_TIMER (5)

View File

@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass()
memory_used = 0 ; memory_used = 0 ;
hosts = 0 ; hosts = 0 ;
host_deleted = false ; host_deleted = false ;
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
/* Init the base level pulse info and pointers for all interfaces */ /* Init the base level pulse info and pointers for all interfaces */
pulse_ptr = NULL ; pulse_ptr = NULL ;

View File

@ -1508,6 +1508,9 @@ public:
/** Host has been deleted */ /** Host has been deleted */
bool host_deleted ; bool host_deleted ;
/** seconds to wait between power-off retries */
int power_off_retry_wait ;
/** Host Administrative State Change public member function */ /** Host Administrative State Change public member function */
int admin_state_change ( string hostname, int admin_state_change ( string hostname,
string newAdminState ); string newAdminState );

View File

@ -100,16 +100,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_RESET: case BMC_THREAD_CMD__POWER_RESET:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) &&
( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.reset.graceful.empty() ) else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
else else
{ {
@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_ON: case BMC_THREAD_CMD__POWER_ON:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) &&
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() ) else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
else else
{ {
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str()); elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
@ -141,16 +144,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
} }
case BMC_THREAD_CMD__POWER_OFF: case BMC_THREAD_CMD__POWER_OFF:
{ {
/* use immediate for all retries if server supports an immediate command */ /* Use graceful for the first half of the retry countdown
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() )) * and immediate for the remaining retries. */
if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) &&
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
/* unfaulted graceful if it exists */ /* Unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
/* unfaulted immediate if graceful does not exist */ /* Unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
else else
{ {
@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
{ {
want_fit = true ; want_fit = true ;
} }
else if (( command == BMC_THREAD_CMD__POWER_ON ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
{
/* Just change the command to query status */
command = BMC_THREAD_CMD__POWER_STATUS ;
}
else if (( command == BMC_THREAD_CMD__POWER_OFF ) && else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true )) ( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true ))
{ {
want_fit = true ; want_fit = true ;
} }
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
{
/* Just change the command to query status */
command = BMC_THREAD_CMD__POWER_STATUS ;
}
else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) && else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true )) ( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true ))
{ {

View File

@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
} }
case MTC_RESET__REQ_SEND: case MTC_RESET__REQ_SEND:
{ {
node_ptr->power_action_retries--;
/* Handle loss of connectivity over retries */ /* Handle loss of connectivity over retries */
if ( node_ptr->bmc_provisioned == false ) if ( node_ptr->bmc_provisioned == false )
@ -4022,18 +4021,17 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
{ {
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n", wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY); MTC_RESET_ACTION_RETRY_DELAY);
mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
resetStageChange ( node_ptr , MTC_RESET__QUEUE ); resetStageChange ( node_ptr , MTC_RESET__QUEUE );
break ; break ;
} }
else else
{ {
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
if ( rc ) if ( rc )
{ {
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc ); wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
blog ("%s Reset requested\n", node_ptr->hostname.c_str()); blog ("%s Reset requested\n", node_ptr->hostname.c_str());
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT ); resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
} }
break ; break ;
} }
@ -4053,17 +4051,16 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
rc = bmc_command_recv ( node_ptr ); rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ; break ;
} }
else if ( rc )
if ( rc )
{ {
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc ); elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
resetStageChange ( node_ptr, MTC_RESET__QUEUE ); resetStageChange ( node_ptr, MTC_RESET__QUEUE );
} }
else else
@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
node_ptr->mtcTimer.ring = false ; node_ptr->mtcTimer.ring = false ;
if ( node_ptr->power_action_retries > 0 ) if ( --node_ptr->power_action_retries >= 0 )
{ {
char buffer[64] ; char buffer[64] ;
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
@ -4455,7 +4452,8 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_REINSTALL__POWEROFF: case MTC_REINSTALL__POWEROFF:
{ {
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); mtcTimer_reset ( node_ptr->mtcTimer ) ;
powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
break ; break ;
} }
@ -4975,54 +4973,56 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->bm_ip.c_str(), node_ptr->bm_ip.c_str(),
rc ); rc );
} }
else
{
;
}
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
MTCE_FALLTHROUGH; /* don't allow a timeout of zero to be passed in */
if ( power_off_retry_wait == 0 )
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
ilog ("%s power off retry wait is %d seconds",
node_ptr->hostname.c_str(), power_off_retry_wait);
mtcTimer_reset ( node_ptr->mtcTimer ) ;
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
break ;
} }
case MTC_POWEROFF__REQ_SEND: case MTC_POWEROFF__REQ_SEND:
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
/* Handle loss of connectivity over retries */
if ( node_ptr->bmc_provisioned == false )
{ {
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str()); /* Handle loss of connectivity over retries */
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV ); if ( node_ptr->bmc_provisioned == false )
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
break ;
}
if ( node_ptr->bmc_accessible == false )
{
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY);
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
break ;
}
else
{
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
if ( rc )
{ {
node_ptr->power_action_retries--; elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str());
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc ); mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
break ;
} }
if ( node_ptr->bmc_accessible == false )
{
wlog ("%s Power Off request rejected ; BMC not accessible",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
break ;
}
else else
{ {
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str()); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); if ( rc )
{
wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else
{
ilog ("%s Power-Off requested", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
} }
break ; break ;
} }
@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_recv ( node_ptr ); rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
break ; break ;
} }
else if ( rc ) else if ( rc )
{ {
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
// Need to handle retries in this case since we don't mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
// go through the QUEUE stage.
if ( --node_ptr->power_action_retries > 0 )
{
char buffer[255] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
mtcInvApi_update_task ( node_ptr, buffer);
// The power off command can fail due to connectivity
// issue or if the server is now already powered off.
// The latter could occur if the previous power off
// command failed 'in response' but actually did end up
// powering off. In that case, if we continue to just
// retry the power off when the power is already off
// then that will just fail again since most redfish
// implementations fail rather than wave-on a power off
// request while the power is already off. In this case
// its better to switch to power query power status
// again and allow that result to put this power off
// FSM into the correct state to continue/retry the
// quest for power off.
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
}
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
}
} }
else else
{ {
@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
plog ("%s is now offline\n", node_ptr->hostname.c_str()); plog ("%s is now offline\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
} }
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
@ -5101,27 +5075,31 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
} }
case MTC_POWEROFF__POWERQRY: case MTC_POWEROFF__POWERQRY:
{ {
if ( node_ptr->bmc_thread_ctrl.done ) /* give the power off action some time to complete */
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
/* Query Host Power Status */ if ( node_ptr->bmc_thread_ctrl.done )
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
{ {
elog ("%s '%s' send failed\n", /* Query Host Power Status */
node_ptr->hostname.c_str(), if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
bmcUtil_getCmd_str( {
node_ptr->bmc_thread_info.command).c_str()); elog ("%s '%s' send failed",
pingUtil_restart ( node_ptr->bm_ping_info ); node_ptr->hostname.c_str(),
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
pingUtil_restart ( node_ptr->bm_ping_info );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
} }
else else
{ {
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
else
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
} }
break ; break ;
} }
@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
int rc = bmc_command_recv ( node_ptr ) ; int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
break ; break ;
} }
else if ( rc != PASS ) else if ( rc != PASS )
@ -5183,37 +5161,36 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
} }
} }
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
} }
break ; break ;
} }
case MTC_POWEROFF__QUEUE: case MTC_POWEROFF__QUEUE:
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( --node_ptr->power_action_retries >= 0 )
{ {
if ( --node_ptr->power_action_retries > 0 ) char buffer[255] ;
{ int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
char buffer[255] ; snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; mtcInvApi_update_task ( node_ptr, buffer);
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
mtcInvApi_update_task ( node_ptr, buffer);
/* Check the thread error status if there is one. Skip the /* Check the thread error status if there is one. Skip the
* typical system call log which just floods the log file. * typical system call log which just floods the log file.
* The failure is reported in the update task log above. */ * The failure is reported in the update task log above. */
if (( node_ptr->bmc_thread_info.status ) && if (( node_ptr->bmc_thread_info.status ) &&
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
{
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
node_ptr->bmc_thread_info.status_string.c_str(),
node_ptr->bmc_thread_info.status );
}
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
}
else
{ {
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); wlog ("%s ... %s (rc:%d)", node_ptr->hostname.c_str(),
node_ptr->bmc_thread_info.status_string.c_str(),
node_ptr->bmc_thread_info.status );
} }
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
ilog ("%s waiting %d seconds before next power off retry",
node_ptr->hostname.c_str(), power_off_retry_wait);
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait );
}
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
} }
break ; break ;
} }
@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY); MTC_POWER_ACTION_RETRY_DELAY);
node_ptr->power_action_retries-- ;
mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ;
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries-- ;
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
} }
else else
@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
} }
} }
/* failure path handling */
else if ( node_ptr->power_action_retries <= 0 )
{
wlog ("%s current power state query failed ; "
"proceeding with power-on",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
}
else else
{ {
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); wlog ("%s power state query failed",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
} }
} }
break ; break ;
@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false ) if ( node_ptr->bmc_accessible == false )
{ {
node_ptr->power_action_retries--;
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n", wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries--;
wlog ("%s Power-On request failed (%d)\n", wlog ("%s Power-On request failed (%d)\n",
node_ptr->hostname.c_str(), rc ); node_ptr->hostname.c_str(), rc );
@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( rc ) if ( rc )
{ {
node_ptr->power_action_retries--;
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str()); elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
node_ptr->mtcTimer.ring = false ; node_ptr->mtcTimer.ring = false ;
if ( node_ptr->power_action_retries > 0 ) if ( --node_ptr->power_action_retries >= 0 )
{ {
char buffer[64] ; char buffer[64] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;

View File

@ -1,9 +1,9 @@
/* /*
* Copyright (c) 2016-2017 Wind River Systems, Inc. * Copyright (c) 2016-2023 Wind River Systems, Inc.
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
* *
*/ */
/** /**
@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg )
{ {
string chopped_request = bmcUtil_chop_system_req(request); string chopped_request = bmcUtil_chop_system_req(request);
daemon_remove_file ( datafile.data() ) ; daemon_remove_file ( datafile.data() ) ;
blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
/****** Make the system call ******/ /****** Make the system call ******/
rc = rc =