Merge "Improve maintenance power/reset control command retry handling"
This commit is contained in:
commit
25bb6a1dbf
@ -187,6 +187,7 @@ typedef enum
|
||||
#define DEFAULT_GOENABLE_TIMEOUT (300)
|
||||
#define DEFAULT_DOR_MODE_TIMEOUT (20)
|
||||
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
|
||||
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
|
||||
|
||||
/** TODO: Convert names to omit JSON part */
|
||||
#define MTC_JSON_INV_LABEL "ihosts"
|
||||
@ -323,9 +324,14 @@ typedef enum
|
||||
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */
|
||||
#define COMMAND_DELAY (2) /* from sshUtil.h */
|
||||
|
||||
#define MTC_POWER_ACTION_RETRY_DELAY (20)
|
||||
#define MTC_POWER_ACTION_RETRY_COUNT (10)
|
||||
#define MTC_RESET_ACTION_RETRY_COUNT (5)
|
||||
/* Define Reset and Power Action retry controls ; delay, count and switch threshold */
|
||||
#define MTC_POWER_ACTION_QUERY_WAIT (30)
|
||||
#define MTC_POWER_ACTION_RETRY_DELAY (20)
|
||||
#define MTC_POWER_ACTION_RETRY_COUNT (10)
|
||||
#define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2)
|
||||
#define MTC_RESET_ACTION_RETRY_DELAY (20)
|
||||
#define MTC_RESET_ACTION_RETRY_COUNT (10)
|
||||
#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2)
|
||||
|
||||
/* number of calls to the bmc_handler while bm_access is not confirmed */
|
||||
#define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5)
|
||||
|
@ -2,10 +2,10 @@
|
||||
#define __INCLUDE_NODETIMERS_HH__
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013-2016 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Copyright (c) 2013-2023 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -94,6 +94,9 @@
|
||||
#define MTC_AGENT_TIMEOUT_EXTENSION (5)
|
||||
#define MTC_LOCK_CEPH_DELAY (90)
|
||||
|
||||
#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT)
|
||||
#define MTC_RECV_WAIT (MTC_RETRY_WAIT)
|
||||
|
||||
/** Host must stay enabled for this long for the
|
||||
* failed_recovery_counter to get cleared */
|
||||
#define MTC_ENABLED_TIMER (5)
|
||||
|
@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass()
|
||||
memory_used = 0 ;
|
||||
hosts = 0 ;
|
||||
host_deleted = false ;
|
||||
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
|
||||
|
||||
/* Init the base level pulse info and pointers for all interfaces */
|
||||
pulse_ptr = NULL ;
|
||||
|
@ -1508,6 +1508,9 @@ public:
|
||||
/** Host has been deleted */
|
||||
bool host_deleted ;
|
||||
|
||||
/** seconds to wait between power-off retries */
|
||||
int power_off_retry_wait ;
|
||||
|
||||
/** Host Administrative State Change public member function */
|
||||
int admin_state_change ( string hostname,
|
||||
string newAdminState );
|
||||
|
@ -100,16 +100,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
||||
}
|
||||
case BMC_THREAD_CMD__POWER_RESET:
|
||||
{
|
||||
/* use immediate for all retries if server supports an immediate command */
|
||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() ))
|
||||
/* Use graceful for the first half of the retry countdown
|
||||
* and immediate for the remaining retries. */
|
||||
if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) &&
|
||||
( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD))
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
||||
|
||||
/* unfaulted graceful if it exists */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.reset.graceful.empty() )
|
||||
/* Unfaulted graceful if it exists */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty())
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
|
||||
|
||||
/* unfaulted immediate if graceful does not exist */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
|
||||
/* Unfaulted immediate if graceful does not exist */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
||||
else
|
||||
{
|
||||
@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
||||
}
|
||||
case BMC_THREAD_CMD__POWER_ON:
|
||||
{
|
||||
/* use immediate for all retries if server supports an immediate command */
|
||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() ))
|
||||
/* Use graceful for the first half of the retry countdown
|
||||
* and immediate for the remaining retries. */
|
||||
if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) &&
|
||||
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
||||
|
||||
/* unfaulted graceful if it exists */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() )
|
||||
/* Unfaulted graceful if it exists */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty())
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
|
||||
|
||||
/* unfaulted immediate if graceful does not exist */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
|
||||
/* Unfaulted immediate if graceful does not exist */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
||||
|
||||
else
|
||||
{
|
||||
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
|
||||
@ -141,16 +144,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
||||
}
|
||||
case BMC_THREAD_CMD__POWER_OFF:
|
||||
{
|
||||
/* use immediate for all retries if server supports an immediate command */
|
||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ))
|
||||
/* Use graceful for the first half of the retry countdown
|
||||
* and immediate for the remaining retries. */
|
||||
if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) &&
|
||||
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
||||
|
||||
/* unfaulted graceful if it exists */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
|
||||
/* Unfaulted graceful if it exists */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
|
||||
|
||||
/* unfaulted immediate if graceful does not exist */
|
||||
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
|
||||
/* Unfaulted immediate if graceful does not exist */
|
||||
else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
|
||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
||||
else
|
||||
{
|
||||
@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
||||
{
|
||||
want_fit = true ;
|
||||
}
|
||||
else if (( command == BMC_THREAD_CMD__POWER_ON ) &&
|
||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
|
||||
{
|
||||
/* Just change the command to query status */
|
||||
command = BMC_THREAD_CMD__POWER_STATUS ;
|
||||
}
|
||||
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
|
||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true ))
|
||||
{
|
||||
want_fit = true ;
|
||||
}
|
||||
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
|
||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
|
||||
{
|
||||
/* Just change the command to query status */
|
||||
command = BMC_THREAD_CMD__POWER_STATUS ;
|
||||
}
|
||||
else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) &&
|
||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true ))
|
||||
{
|
||||
|
@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_RESET__REQ_SEND:
|
||||
{
|
||||
node_ptr->power_action_retries--;
|
||||
|
||||
/* Handle loss of connectivity over retries */
|
||||
if ( node_ptr->bmc_provisioned == false )
|
||||
@ -4022,18 +4021,17 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_POWER_ACTION_RETRY_DELAY);
|
||||
MTC_RESET_ACTION_RETRY_DELAY);
|
||||
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
||||
break ;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
|
||||
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
|
||||
if ( rc )
|
||||
{
|
||||
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
||||
@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
blog ("%s Reset requested\n", node_ptr->hostname.c_str());
|
||||
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -4053,17 +4051,16 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
|
||||
if ( rc )
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
else if ( rc )
|
||||
{
|
||||
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||
resetStageChange ( node_ptr, MTC_RESET__QUEUE );
|
||||
}
|
||||
else
|
||||
@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
if ( node_ptr->power_action_retries > 0 )
|
||||
if ( --node_ptr->power_action_retries >= 0 )
|
||||
{
|
||||
char buffer[64] ;
|
||||
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||
@ -4455,7 +4452,8 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
case MTC_REINSTALL__POWEROFF:
|
||||
{
|
||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
||||
powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
|
||||
break ;
|
||||
}
|
||||
@ -4975,54 +4973,56 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->bm_ip.c_str(),
|
||||
rc );
|
||||
}
|
||||
else
|
||||
{
|
||||
;
|
||||
}
|
||||
|
||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
||||
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
|
||||
MTCE_FALLTHROUGH;
|
||||
|
||||
/* don't allow a timeout of zero to be passed in */
|
||||
if ( power_off_retry_wait == 0 )
|
||||
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
|
||||
|
||||
ilog ("%s power off retry wait is %d seconds",
|
||||
node_ptr->hostname.c_str(), power_off_retry_wait);
|
||||
|
||||
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||
break ;
|
||||
}
|
||||
case MTC_POWEROFF__REQ_SEND:
|
||||
{
|
||||
|
||||
/* Handle loss of connectivity over retries */
|
||||
if ( node_ptr->bmc_provisioned == false )
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||
break ;
|
||||
}
|
||||
|
||||
if ( node_ptr->bmc_accessible == false )
|
||||
{
|
||||
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_POWER_ACTION_RETRY_DELAY);
|
||||
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
break ;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
||||
if ( rc )
|
||||
/* Handle loss of connectivity over retries */
|
||||
if ( node_ptr->bmc_provisioned == false )
|
||||
{
|
||||
node_ptr->power_action_retries--;
|
||||
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||
break ;
|
||||
}
|
||||
|
||||
if ( node_ptr->bmc_accessible == false )
|
||||
{
|
||||
wlog ("%s Power Off request rejected ; BMC not accessible",
|
||||
node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
break ;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
||||
if ( rc )
|
||||
{
|
||||
wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s Power-Off requested", node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
else if ( rc )
|
||||
{
|
||||
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
||||
|
||||
// Need to handle retries in this case since we don't
|
||||
// go through the QUEUE stage.
|
||||
if ( --node_ptr->power_action_retries > 0 )
|
||||
{
|
||||
char buffer[255] ;
|
||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
||||
mtcInvApi_update_task ( node_ptr, buffer);
|
||||
|
||||
// The power off command can fail due to a connectivity
|
||||
// issue or if the server is now already powered off.
|
||||
// The latter could occur if the previous power off
|
||||
// command failed 'in response' but actually did end up
|
||||
// powering off. In that case, if we continue to just
|
||||
// retry the power off when the power is already off
|
||||
// then that will just fail again since most redfish
|
||||
// implementations fail rather than wave-on a power off
|
||||
// request while the power is already off. In this case
|
||||
// it's better to switch to querying the power status
|
||||
// again and allow that result to put this power off
|
||||
// FSM into the correct state to continue/retry the
|
||||
// quest for power off.
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
||||
}
|
||||
else
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||
}
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
plog ("%s is now offline\n", node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
|
||||
}
|
||||
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
@ -5101,27 +5075,31 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_POWEROFF__POWERQRY:
|
||||
{
|
||||
if ( node_ptr->bmc_thread_ctrl.done )
|
||||
/* give the power off action some time to complete */
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
/* Query Host Power Status */
|
||||
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
|
||||
if ( node_ptr->bmc_thread_ctrl.done )
|
||||
{
|
||||
elog ("%s '%s' send failed\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
pingUtil_restart ( node_ptr->bm_ping_info );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
/* Query Host Power Status */
|
||||
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
|
||||
{
|
||||
elog ("%s '%s' send failed",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
pingUtil_restart ( node_ptr->bm_ping_info );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
}
|
||||
else
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
|
||||
}
|
||||
else
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
|
||||
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
}
|
||||
else
|
||||
{
|
||||
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
int rc = bmc_command_recv ( node_ptr ) ;
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
@ -5183,37 +5161,36 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||
}
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_POWEROFF__QUEUE:
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
if ( --node_ptr->power_action_retries >= 0 )
|
||||
{
|
||||
if ( --node_ptr->power_action_retries > 0 )
|
||||
{
|
||||
char buffer[255] ;
|
||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
||||
mtcInvApi_update_task ( node_ptr, buffer);
|
||||
char buffer[255] ;
|
||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
||||
mtcInvApi_update_task ( node_ptr, buffer);
|
||||
|
||||
/* Check the thread error status if there is one. Skip the
|
||||
* typical system call log which just floods the log file.
|
||||
* The failure is reported in the update task log above. */
|
||||
if (( node_ptr->bmc_thread_info.status ) &&
|
||||
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
|
||||
{
|
||||
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
||||
node_ptr->bmc_thread_info.status_string.c_str(),
|
||||
node_ptr->bmc_thread_info.status );
|
||||
}
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||
}
|
||||
else
|
||||
/* Check the thread error status if there is one. Skip the
|
||||
* typical system call log which just floods the log file.
|
||||
* The failure is reported in the update task log above. */
|
||||
if (( node_ptr->bmc_thread_info.status ) &&
|
||||
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||
wlog ("%s ... %s (rc:%d)", node_ptr->hostname.c_str(),
|
||||
node_ptr->bmc_thread_info.status_string.c_str(),
|
||||
node_ptr->bmc_thread_info.status );
|
||||
}
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||
ilog ("%s waiting %d seconds before next power off retry",
|
||||
node_ptr->hostname.c_str(), power_off_retry_wait);
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait );
|
||||
}
|
||||
else
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_POWER_ACTION_RETRY_DELAY);
|
||||
|
||||
node_ptr->power_action_retries-- ;
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||
@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ;
|
||||
if ( rc )
|
||||
{
|
||||
node_ptr->power_action_retries-- ;
|
||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||
}
|
||||
else
|
||||
@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
||||
}
|
||||
}
|
||||
/* failure path handling */
|
||||
else if ( node_ptr->power_action_retries <= 0 )
|
||||
{
|
||||
wlog ("%s current power state query failed ; "
|
||||
"proceeding with power-on",
|
||||
node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
||||
}
|
||||
else
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
||||
wlog ("%s power state query failed",
|
||||
node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
if ( node_ptr->bmc_accessible == false )
|
||||
{
|
||||
node_ptr->power_action_retries--;
|
||||
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
|
||||
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
|
||||
|
||||
@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
|
||||
if ( rc )
|
||||
{
|
||||
node_ptr->power_action_retries--;
|
||||
wlog ("%s Power-On request failed (%d)\n",
|
||||
node_ptr->hostname.c_str(), rc );
|
||||
|
||||
@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
if ( rc )
|
||||
{
|
||||
node_ptr->power_action_retries--;
|
||||
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||
@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
if ( node_ptr->power_action_retries > 0 )
|
||||
if ( --node_ptr->power_action_retries >= 0 )
|
||||
{
|
||||
char buffer[64] ;
|
||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||
|
@ -1,9 +1,9 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2016-2017 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Copyright (c) 2016-2023 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg )
|
||||
{
|
||||
string chopped_request = bmcUtil_chop_system_req(request);
|
||||
daemon_remove_file ( datafile.data() ) ;
|
||||
blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
|
||||
ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
|
||||
|
||||
/****** Make the system call ******/
|
||||
rc =
|
||||
|
Loading…
Reference in New Issue
Block a user