Merge "Improve maintenance power/reset control command retry handling"
This commit is contained in:
commit
25bb6a1dbf
@ -187,6 +187,7 @@ typedef enum
|
|||||||
#define DEFAULT_GOENABLE_TIMEOUT (300)
|
#define DEFAULT_GOENABLE_TIMEOUT (300)
|
||||||
#define DEFAULT_DOR_MODE_TIMEOUT (20)
|
#define DEFAULT_DOR_MODE_TIMEOUT (20)
|
||||||
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
|
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
|
||||||
|
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
|
||||||
|
|
||||||
/** TODO: Convert names to omit JSON part */
|
/** TODO: Convert names to omit JSON part */
|
||||||
#define MTC_JSON_INV_LABEL "ihosts"
|
#define MTC_JSON_INV_LABEL "ihosts"
|
||||||
@ -323,9 +324,14 @@ typedef enum
|
|||||||
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */
|
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */
|
||||||
#define COMMAND_DELAY (2) /* from sshUtil.h */
|
#define COMMAND_DELAY (2) /* from sshUtil.h */
|
||||||
|
|
||||||
|
/* Define Reset and Power Action retry controls ; delay, count and switch threshold */
|
||||||
|
#define MTC_POWER_ACTION_QUERY_WAIT (30)
|
||||||
#define MTC_POWER_ACTION_RETRY_DELAY (20)
|
#define MTC_POWER_ACTION_RETRY_DELAY (20)
|
||||||
#define MTC_POWER_ACTION_RETRY_COUNT (10)
|
#define MTC_POWER_ACTION_RETRY_COUNT (10)
|
||||||
#define MTC_RESET_ACTION_RETRY_COUNT (5)
|
#define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2)
|
||||||
|
#define MTC_RESET_ACTION_RETRY_DELAY (20)
|
||||||
|
#define MTC_RESET_ACTION_RETRY_COUNT (10)
|
||||||
|
#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2)
|
||||||
|
|
||||||
/* number of calls to the bmc_handler while bm_access is not confirmed */
|
/* number of calls to the bmc_handler while bm_access is not confirmed */
|
||||||
#define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5)
|
#define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5)
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
#define __INCLUDE_NODETIMERS_HH__
|
#define __INCLUDE_NODETIMERS_HH__
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2016 Wind River Systems, Inc.
|
* Copyright (c) 2013-2023 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -94,6 +94,9 @@
|
|||||||
#define MTC_AGENT_TIMEOUT_EXTENSION (5)
|
#define MTC_AGENT_TIMEOUT_EXTENSION (5)
|
||||||
#define MTC_LOCK_CEPH_DELAY (90)
|
#define MTC_LOCK_CEPH_DELAY (90)
|
||||||
|
|
||||||
|
#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT)
|
||||||
|
#define MTC_RECV_WAIT (MTC_RETRY_WAIT)
|
||||||
|
|
||||||
/** Host must stay enabled for this long for the
|
/** Host must stay enabled for this long for the
|
||||||
* failed_recovery_counter to get cleared */
|
* failed_recovery_counter to get cleared */
|
||||||
#define MTC_ENABLED_TIMER (5)
|
#define MTC_ENABLED_TIMER (5)
|
||||||
|
@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass()
|
|||||||
memory_used = 0 ;
|
memory_used = 0 ;
|
||||||
hosts = 0 ;
|
hosts = 0 ;
|
||||||
host_deleted = false ;
|
host_deleted = false ;
|
||||||
|
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
|
||||||
|
|
||||||
/* Init the base level pulse info and pointers for all interfaces */
|
/* Init the base level pulse info and pointers for all interfaces */
|
||||||
pulse_ptr = NULL ;
|
pulse_ptr = NULL ;
|
||||||
|
@ -1508,6 +1508,9 @@ public:
|
|||||||
/** Host has been deleted */
|
/** Host has been deleted */
|
||||||
bool host_deleted ;
|
bool host_deleted ;
|
||||||
|
|
||||||
|
/** seconds to wait between power-off retries */
|
||||||
|
int power_off_retry_wait ;
|
||||||
|
|
||||||
/** Host Administrative State Change public member function */
|
/** Host Administrative State Change public member function */
|
||||||
int admin_state_change ( string hostname,
|
int admin_state_change ( string hostname,
|
||||||
string newAdminState );
|
string newAdminState );
|
||||||
|
@ -100,15 +100,17 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
|||||||
}
|
}
|
||||||
case BMC_THREAD_CMD__POWER_RESET:
|
case BMC_THREAD_CMD__POWER_RESET:
|
||||||
{
|
{
|
||||||
/* use immediate for all retries if server supports an immediate command */
|
/* Use graceful for the first half of the retry countdown
|
||||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() ))
|
* and immediate for the remaining retries. */
|
||||||
|
if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) &&
|
||||||
|
( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD))
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
||||||
|
|
||||||
/* unfaulted graceful if it exists */
|
/* Unfaulted graceful if it exists */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty())
|
else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty())
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
|
||||||
|
|
||||||
/* unfaulted immediate if graceful does not exist */
|
/* Unfaulted immediate if graceful does not exist */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
|
else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
|
||||||
else
|
else
|
||||||
@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
|||||||
}
|
}
|
||||||
case BMC_THREAD_CMD__POWER_ON:
|
case BMC_THREAD_CMD__POWER_ON:
|
||||||
{
|
{
|
||||||
/* use immediate for all retries if server supports an immediate command */
|
/* Use graceful for the first half of the retry countdown
|
||||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() ))
|
* and immediate for the remaining retries. */
|
||||||
|
if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) &&
|
||||||
|
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
||||||
|
|
||||||
/* unfaulted graceful if it exists */
|
/* Unfaulted graceful if it exists */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty())
|
else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty())
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
|
||||||
|
|
||||||
/* unfaulted immediate if graceful does not exist */
|
/* Unfaulted immediate if graceful does not exist */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
|
else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
|
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
|
||||||
@ -141,15 +144,17 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
|||||||
}
|
}
|
||||||
case BMC_THREAD_CMD__POWER_OFF:
|
case BMC_THREAD_CMD__POWER_OFF:
|
||||||
{
|
{
|
||||||
/* use immediate for all retries if server supports an immediate command */
|
/* Use graceful for the first half of the retry countdown
|
||||||
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ))
|
* and immediate for the remaining retries. */
|
||||||
|
if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) &&
|
||||||
|
( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD))
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
||||||
|
|
||||||
/* unfaulted graceful if it exists */
|
/* Unfaulted graceful if it exists */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
|
else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
|
||||||
|
|
||||||
/* unfaulted immediate if graceful does not exist */
|
/* Unfaulted immediate if graceful does not exist */
|
||||||
else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
|
else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
|
||||||
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
|
||||||
else
|
else
|
||||||
@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
|||||||
{
|
{
|
||||||
want_fit = true ;
|
want_fit = true ;
|
||||||
}
|
}
|
||||||
|
else if (( command == BMC_THREAD_CMD__POWER_ON ) &&
|
||||||
|
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
|
||||||
|
{
|
||||||
|
/* Just change the command to query status */
|
||||||
|
command = BMC_THREAD_CMD__POWER_STATUS ;
|
||||||
|
}
|
||||||
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
|
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
|
||||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true ))
|
( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true ))
|
||||||
{
|
{
|
||||||
want_fit = true ;
|
want_fit = true ;
|
||||||
}
|
}
|
||||||
|
else if (( command == BMC_THREAD_CMD__POWER_OFF ) &&
|
||||||
|
( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true ))
|
||||||
|
{
|
||||||
|
/* Just change the command to query status */
|
||||||
|
command = BMC_THREAD_CMD__POWER_STATUS ;
|
||||||
|
}
|
||||||
else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) &&
|
else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) &&
|
||||||
( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true ))
|
( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true ))
|
||||||
{
|
{
|
||||||
|
@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
}
|
}
|
||||||
case MTC_RESET__REQ_SEND:
|
case MTC_RESET__REQ_SEND:
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries--;
|
|
||||||
|
|
||||||
/* Handle loss of connectivity over retries */
|
/* Handle loss of connectivity over retries */
|
||||||
if ( node_ptr->bmc_provisioned == false )
|
if ( node_ptr->bmc_provisioned == false )
|
||||||
@ -4022,10 +4021,10 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
{
|
{
|
||||||
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
|
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
|
||||||
node_ptr->hostname.c_str(),
|
node_ptr->hostname.c_str(),
|
||||||
MTC_POWER_ACTION_RETRY_DELAY);
|
MTC_RESET_ACTION_RETRY_DELAY);
|
||||||
|
|
||||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||||
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
@ -4033,7 +4032,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
|
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
|
||||||
|
|
||||||
if ( rc )
|
if ( rc )
|
||||||
{
|
{
|
||||||
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
||||||
@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
blog ("%s Reset requested\n", node_ptr->hostname.c_str());
|
blog ("%s Reset requested\n", node_ptr->hostname.c_str());
|
||||||
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
|
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
|
||||||
}
|
}
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||||
}
|
}
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
@ -4059,11 +4057,10 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
|
else if ( rc )
|
||||||
if ( rc )
|
|
||||||
{
|
{
|
||||||
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY );
|
||||||
resetStageChange ( node_ptr, MTC_RESET__QUEUE );
|
resetStageChange ( node_ptr, MTC_RESET__QUEUE );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||||
{
|
{
|
||||||
node_ptr->mtcTimer.ring = false ;
|
node_ptr->mtcTimer.ring = false ;
|
||||||
if ( node_ptr->power_action_retries > 0 )
|
if ( --node_ptr->power_action_retries >= 0 )
|
||||||
{
|
{
|
||||||
char buffer[64] ;
|
char buffer[64] ;
|
||||||
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||||
@ -4455,6 +4452,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
case MTC_REINSTALL__POWEROFF:
|
case MTC_REINSTALL__POWEROFF:
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
||||||
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
||||||
powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND );
|
powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND );
|
||||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
|
||||||
break ;
|
break ;
|
||||||
@ -4975,18 +4973,24 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
node_ptr->bm_ip.c_str(),
|
node_ptr->bm_ip.c_str(),
|
||||||
rc );
|
rc );
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
;
|
|
||||||
}
|
|
||||||
|
|
||||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
||||||
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
|
|
||||||
MTCE_FALLTHROUGH;
|
/* don't allow a timeout of zero to be passed in */
|
||||||
|
if ( power_off_retry_wait == 0 )
|
||||||
|
power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ;
|
||||||
|
|
||||||
|
ilog ("%s power off retry wait is %d seconds",
|
||||||
|
node_ptr->hostname.c_str(), power_off_retry_wait);
|
||||||
|
|
||||||
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
||||||
|
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||||
|
break ;
|
||||||
}
|
}
|
||||||
case MTC_POWEROFF__REQ_SEND:
|
case MTC_POWEROFF__REQ_SEND:
|
||||||
{
|
{
|
||||||
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||||
|
{
|
||||||
/* Handle loss of connectivity over retries */
|
/* Handle loss of connectivity over retries */
|
||||||
if ( node_ptr->bmc_provisioned == false )
|
if ( node_ptr->bmc_provisioned == false )
|
||||||
{
|
{
|
||||||
@ -4998,12 +5002,8 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
|
|
||||||
if ( node_ptr->bmc_accessible == false )
|
if ( node_ptr->bmc_accessible == false )
|
||||||
{
|
{
|
||||||
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n",
|
wlog ("%s Power Off request rejected ; BMC not accessible",
|
||||||
node_ptr->hostname.c_str(),
|
node_ptr->hostname.c_str());
|
||||||
MTC_POWER_ACTION_RETRY_DELAY);
|
|
||||||
|
|
||||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
@ -5013,16 +5013,16 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
||||||
if ( rc )
|
if ( rc )
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries--;
|
wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc );
|
||||||
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
ilog ("%s Power-Off requested", node_ptr->hostname.c_str());
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
||||||
}
|
}
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
rc = bmc_command_recv ( node_ptr );
|
rc = bmc_command_recv ( node_ptr );
|
||||||
if ( rc == RETRY )
|
if ( rc == RETRY )
|
||||||
{
|
{
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
else if ( rc )
|
else if ( rc )
|
||||||
{
|
{
|
||||||
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
||||||
|
|
||||||
// Need to handle retries in this case since we don't
|
|
||||||
// go through the QUEUE stage.
|
|
||||||
if ( --node_ptr->power_action_retries > 0 )
|
|
||||||
{
|
|
||||||
char buffer[255] ;
|
|
||||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
||||||
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
|
||||||
mtcInvApi_update_task ( node_ptr, buffer);
|
|
||||||
|
|
||||||
// The power off command can fail due to connectivity
|
|
||||||
// issue or if the server is now already powered off.
|
|
||||||
// The latter could occur if the previous power off
|
|
||||||
// command failed 'in response' but actually did end up
|
|
||||||
// powering off. In that case, if we continue to just
|
|
||||||
// retry the power off when the power is already off
|
|
||||||
// then that will just fail again since most redfish
|
|
||||||
// implementations fail rather than wave-on a power off
|
|
||||||
// request while the power is already off. In this case
|
|
||||||
// its better to switch to power query power status
|
|
||||||
// again and allow that result to put this power off
|
|
||||||
// FSM into the correct state to continue/retry the
|
|
||||||
// quest for power off.
|
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
||||||
}
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
|
||||||
else
|
|
||||||
{
|
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
|
|
||||||
plog ("%s is now offline\n", node_ptr->hostname.c_str());
|
plog ("%s is now offline\n", node_ptr->hostname.c_str());
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
|
||||||
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT );
|
||||||
}
|
}
|
||||||
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||||
{
|
{
|
||||||
@ -5100,13 +5074,16 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
case MTC_POWEROFF__POWERQRY:
|
case MTC_POWEROFF__POWERQRY:
|
||||||
|
{
|
||||||
|
/* give the power off action some time to complete */
|
||||||
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||||
{
|
{
|
||||||
if ( node_ptr->bmc_thread_ctrl.done )
|
if ( node_ptr->bmc_thread_ctrl.done )
|
||||||
{
|
{
|
||||||
/* Query Host Power Status */
|
/* Query Host Power Status */
|
||||||
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
|
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
|
||||||
{
|
{
|
||||||
elog ("%s '%s' send failed\n",
|
elog ("%s '%s' send failed",
|
||||||
node_ptr->hostname.c_str(),
|
node_ptr->hostname.c_str(),
|
||||||
bmcUtil_getCmd_str(
|
bmcUtil_getCmd_str(
|
||||||
node_ptr->bmc_thread_info.command).c_str());
|
node_ptr->bmc_thread_info.command).c_str());
|
||||||
@ -5117,12 +5094,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
{
|
{
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
|
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
|
||||||
}
|
}
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
|
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
case MTC_POWEROFF__POWERQRY_WAIT:
|
case MTC_POWEROFF__POWERQRY_WAIT:
|
||||||
@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
int rc = bmc_command_recv ( node_ptr ) ;
|
int rc = bmc_command_recv ( node_ptr ) ;
|
||||||
if ( rc == RETRY )
|
if ( rc == RETRY )
|
||||||
{
|
{
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT );
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
else if ( rc != PASS )
|
else if ( rc != PASS )
|
||||||
@ -5183,15 +5161,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
||||||
}
|
}
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
case MTC_POWEROFF__QUEUE:
|
case MTC_POWEROFF__QUEUE:
|
||||||
{
|
{
|
||||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
if ( --node_ptr->power_action_retries >= 0 )
|
||||||
{
|
|
||||||
if ( --node_ptr->power_action_retries > 0 )
|
|
||||||
{
|
{
|
||||||
char buffer[255] ;
|
char buffer[255] ;
|
||||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||||
@ -5204,17 +5179,19 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
if (( node_ptr->bmc_thread_info.status ) &&
|
if (( node_ptr->bmc_thread_info.status ) &&
|
||||||
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
|
( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL))
|
||||||
{
|
{
|
||||||
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
wlog ("%s ... %s (rc:%d)", node_ptr->hostname.c_str(),
|
||||||
node_ptr->bmc_thread_info.status_string.c_str(),
|
node_ptr->bmc_thread_info.status_string.c_str(),
|
||||||
node_ptr->bmc_thread_info.status );
|
node_ptr->bmc_thread_info.status );
|
||||||
}
|
}
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||||
|
ilog ("%s waiting %d seconds before next power off retry",
|
||||||
|
node_ptr->hostname.c_str(), power_off_retry_wait);
|
||||||
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
||||||
}
|
}
|
||||||
}
|
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
case MTC_POWEROFF__DONE:
|
case MTC_POWEROFF__DONE:
|
||||||
@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
node_ptr->hostname.c_str(),
|
node_ptr->hostname.c_str(),
|
||||||
MTC_POWER_ACTION_RETRY_DELAY);
|
MTC_POWER_ACTION_RETRY_DELAY);
|
||||||
|
|
||||||
node_ptr->power_action_retries-- ;
|
|
||||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||||
@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ;
|
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ;
|
||||||
if ( rc )
|
if ( rc )
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries-- ;
|
|
||||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* failure path handling */
|
|
||||||
else if ( node_ptr->power_action_retries <= 0 )
|
|
||||||
{
|
|
||||||
wlog ("%s current power state query failed ; "
|
|
||||||
"proceeding with power-on",
|
|
||||||
node_ptr->hostname.c_str());
|
|
||||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
|
||||||
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
wlog ("%s power state query failed",
|
||||||
|
node_ptr->hostname.c_str());
|
||||||
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break ;
|
break ;
|
||||||
@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
|
|
||||||
if ( node_ptr->bmc_accessible == false )
|
if ( node_ptr->bmc_accessible == false )
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries--;
|
|
||||||
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
|
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
|
||||||
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
|
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
|
||||||
|
|
||||||
@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
|
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
|
||||||
if ( rc )
|
if ( rc )
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries--;
|
|
||||||
wlog ("%s Power-On request failed (%d)\n",
|
wlog ("%s Power-On request failed (%d)\n",
|
||||||
node_ptr->hostname.c_str(), rc );
|
node_ptr->hostname.c_str(), rc );
|
||||||
|
|
||||||
@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
|
|
||||||
if ( rc )
|
if ( rc )
|
||||||
{
|
{
|
||||||
node_ptr->power_action_retries--;
|
|
||||||
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
|
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
|
||||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||||
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
||||||
@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||||
{
|
{
|
||||||
node_ptr->mtcTimer.ring = false ;
|
node_ptr->mtcTimer.ring = false ;
|
||||||
if ( node_ptr->power_action_retries > 0 )
|
if ( --node_ptr->power_action_retries >= 0 )
|
||||||
{
|
{
|
||||||
char buffer[64] ;
|
char buffer[64] ;
|
||||||
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2017 Wind River Systems, Inc.
|
* Copyright (c) 2016-2023 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg )
|
|||||||
{
|
{
|
||||||
string chopped_request = bmcUtil_chop_system_req(request);
|
string chopped_request = bmcUtil_chop_system_req(request);
|
||||||
daemon_remove_file ( datafile.data() ) ;
|
daemon_remove_file ( datafile.data() ) ;
|
||||||
blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
|
ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str());
|
||||||
|
|
||||||
/****** Make the system call ******/
|
/****** Make the system call ******/
|
||||||
rc =
|
rc =
|
||||||
|
Loading…
Reference in New Issue
Block a user