Modify Mtce Reinstall FSM to first power-off BMC provisioned hosts
This update only applies to servers that support and are provisioned for a Board Management Controller (BMC). The BMC of some servers silently rejects the 'set next boot device' command while the server is executing BIOS. The current reinstall algorithm, when the BMC is provisioned, starts by detecting the power state of the target server. If the power is off it will first power it on and then proceed to 'set next boot device' to pxe followed by a reset. For the initial power off state case, the timing of these operations is such that the server is in BIOS when the 'set next boot device' command is issued. This update modifies the host reinstall algorithm to first power off the server, then set the next boot device while the server is confirmed to be powered off, and then power it on. This ensures the server receives and handles the 'set next boot device' command properly. This update also fixes a race condition between the bmc_handler and power_handler by moving the final power state update in the power handler to the power done phase. Test Plan: Verify all new reinstall failure path handling via fault insertion testing Verify reinstall of powered off host Verify reinstall of powered on host Verify reinstall of Wildcat server with ipmi Verify reinstall of Supermicro server with ipmi and redfish Verify reinstall of Ironpass server with ipmi Verify reinstall of WolfPass server with redfish and ipmi Verify reinstall of Dell server with ipmi Over 30 reinstalls were performed across all server types, with initial power on and off using both ipmi and redfish (where supported). Change-Id: Iefb17e9aa76c45f2ceadf83f23b1231ae82f000f Closes-Bug: 1862065 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
932d7df90a
commit
da7b2e94f1
@ -461,12 +461,16 @@ void mtc_stages_init ( void )
|
||||
|
||||
reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start";
|
||||
reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERQRY ] = "Reinstall-Power-State-Query";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERQRY_WAIT ] = "Reinstall-Power-State-Query-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart";
|
||||
reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__POWEROFF ] = "Reinstall-PowerOff";
|
||||
reinstallStages_str [MTC_REINSTALL__POWEROFF_WAIT ] = "Reinstall-PowerOff-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot";
|
||||
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
|
||||
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
|
||||
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
|
||||
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";
|
||||
|
@ -245,9 +245,11 @@ typedef enum
|
||||
#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline"
|
||||
#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online"
|
||||
#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access"
|
||||
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host"
|
||||
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power off host"
|
||||
#define MTC_TASK_REINSTALL_FAIL_PU "Reinstall Failed ; could not power on host"
|
||||
#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request"
|
||||
#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request"
|
||||
#define MTC_TASK_REINSTALL_FAIL_PQ "Reinstall Failed ; could not query power state"
|
||||
|
||||
#define MTC_TASK_REINSTALL_FAIL "Reinstall Failed"
|
||||
#define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded"
|
||||
@ -1059,10 +1061,14 @@ typedef enum
|
||||
MTC_REINSTALL__START_WAIT,
|
||||
MTC_REINSTALL__RESTART,
|
||||
MTC_REINSTALL__RESTART_WAIT,
|
||||
MTC_REINSTALL__POWERON,
|
||||
MTC_REINSTALL__POWERON_WAIT,
|
||||
MTC_REINSTALL__POWERQRY,
|
||||
MTC_REINSTALL__POWERQRY_WAIT,
|
||||
MTC_REINSTALL__POWEROFF,
|
||||
MTC_REINSTALL__POWEROFF_WAIT,
|
||||
MTC_REINSTALL__NETBOOT,
|
||||
MTC_REINSTALL__NETBOOT_WAIT,
|
||||
MTC_REINSTALL__POWERON,
|
||||
MTC_REINSTALL__POWERON_WAIT,
|
||||
MTC_REINSTALL__RESET,
|
||||
MTC_REINSTALL__RESET_WAIT,
|
||||
MTC_REINSTALL__WIPEDISK,
|
||||
|
@ -1,3 +1,3 @@
|
||||
SRC_DIR="src"
|
||||
TIS_PATCH_VER=157
|
||||
TIS_PATCH_VER=158
|
||||
BUILD_IS_SLOW=5
|
||||
|
@ -193,7 +193,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/* handle the redfishtool root query as a special case because
|
||||
* it is likely to fail and we don't want un-necessary error logs */
|
||||
if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) &&
|
||||
if ((( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) ||
|
||||
( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO )) &&
|
||||
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
|
||||
{
|
||||
blog ("%s bmc redfish %s failed",
|
||||
@ -201,14 +202,6 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
}
|
||||
else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO ) &&
|
||||
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
|
||||
{
|
||||
wlog ("%s bmc redfish %s failed",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s bmc redfish %s command failed (%s) (data:%s) (rc:%d:%d:%s)\n",
|
||||
@ -220,6 +213,7 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->bmc_thread_info.status,
|
||||
node_ptr->bmc_thread_info.status_string.c_str());
|
||||
}
|
||||
goto bmc_command_recv_cleanup;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -382,6 +376,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
}
|
||||
|
||||
bmc_command_recv_cleanup:
|
||||
|
||||
if ( rc != RETRY )
|
||||
{
|
||||
node_ptr->bmc_thread_ctrl.done = true ;
|
||||
|
@ -4036,14 +4036,15 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
* Description: This FSM handles node (re)install with and without
|
||||
* a provisioned Board Management Controller (BMC).
|
||||
*
|
||||
* BMC provisioned case: using IPMI commands to BMC ...
|
||||
* BMC provisioned case: board management commands to BMC ...
|
||||
*
|
||||
* - ensure host power is on
|
||||
* - power off host
|
||||
* - force network boot on next reset
|
||||
* - issue node reset
|
||||
* - power on host
|
||||
*
|
||||
* BMC not provisioned case: using mtce messaging to node ...
|
||||
* BMC not provisioned case: mtce messaging to node ...
|
||||
*
|
||||
* - host must be online
|
||||
* - send mtcClient wipedisk command
|
||||
* fail reinstall if no ACK
|
||||
* - send mtcClient reboot command
|
||||
@ -4120,17 +4121,9 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC );
|
||||
reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT );
|
||||
}
|
||||
else if ( node_ptr->power_on == false )
|
||||
{
|
||||
/* need to power on node */
|
||||
wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str());
|
||||
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* power is on so issue net boot command */
|
||||
ilog ("%s Reinstall power is on", node_ptr->hostname.c_str());
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY );
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -4211,18 +4204,107 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__POWERON:
|
||||
case MTC_REINSTALL__POWERQRY:
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
|
||||
if ( node_ptr->bmc_thread_ctrl.done )
|
||||
{
|
||||
/* Query Host Power Status */
|
||||
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
|
||||
{
|
||||
elog ("%s '%s' send failed\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
pingUtil_restart ( node_ptr->bm_ping_info );
|
||||
}
|
||||
else
|
||||
{
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY_WAIT );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__POWERON_WAIT:
|
||||
case MTC_REINSTALL__POWERQRY_WAIT:
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
int rc = bmc_command_recv ( node_ptr ) ;
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
wlog ("%s '%s' failed receive (rc:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str(),
|
||||
rc );
|
||||
}
|
||||
else if ( node_ptr->bmc_thread_info.data.empty() )
|
||||
{
|
||||
wlog ("%s '%s' request yielded no response data",
|
||||
node_ptr->hostname.c_str(),
|
||||
bmcUtil_getCmd_str(
|
||||
node_ptr->bmc_thread_info.command).c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
int rc =
|
||||
bmcUtil_is_power_on ( node_ptr->hostname,
|
||||
node_ptr->bmc_protocol,
|
||||
node_ptr->bmc_thread_info.data,
|
||||
node_ptr->power_on);
|
||||
if ( rc == PASS )
|
||||
{
|
||||
if ( node_ptr->power_on == true )
|
||||
{
|
||||
ilog ("%s Reinstall power-off required",
|
||||
node_ptr->hostname.c_str());
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF );
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s Reinstall power-off already",
|
||||
node_ptr->hostname.c_str());
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s Reinstall power query failed (rc:%d)",
|
||||
node_ptr->hostname.c_str(), rc );
|
||||
}
|
||||
}
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ );
|
||||
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
|
||||
}
|
||||
else
|
||||
{
|
||||
; /* wait longer */
|
||||
}
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_REINSTALL__POWEROFF:
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__POWEROFF_WAIT:
|
||||
{
|
||||
/* The power handler manages timeout */
|
||||
if ( node_ptr->powerStage == MTC_POWER__DONE )
|
||||
{
|
||||
if ( node_ptr->power_on == true )
|
||||
if ( node_ptr->power_on == false )
|
||||
{
|
||||
if ( node_ptr->task != MTC_TASK_REINSTALL )
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
|
||||
@ -4276,7 +4358,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( rc == PASS )
|
||||
{
|
||||
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
|
||||
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET);
|
||||
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON);
|
||||
}
|
||||
else if ( rc == RETRY )
|
||||
{
|
||||
@ -4293,6 +4375,41 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__POWERON:
|
||||
{
|
||||
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__POWERON_WAIT:
|
||||
{
|
||||
/* The power handler manages timeout */
|
||||
if ( node_ptr->powerStage == MTC_POWER__DONE )
|
||||
{
|
||||
if ( node_ptr->power_on == true )
|
||||
{
|
||||
if ( node_ptr->task != MTC_TASK_REINSTALL )
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
|
||||
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PU);
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PU );
|
||||
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* run the power handler till the host's power is on or
|
||||
* the power-on handler times out */
|
||||
power_handler ( node_ptr );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_REINSTALL__RESET:
|
||||
{
|
||||
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
|
||||
@ -4736,7 +4853,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
else
|
||||
{
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
||||
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
|
||||
if ( rc )
|
||||
{
|
||||
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
||||
@ -4744,7 +4861,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
blog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
||||
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
@ -4756,14 +4873,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
|
||||
if ( rc )
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
||||
break ;
|
||||
}
|
||||
else if ( rc )
|
||||
{
|
||||
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
||||
@ -4772,10 +4888,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
else
|
||||
{
|
||||
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
|
||||
mtcInvApi_update_task ( node_ptr, "Powering Off" );
|
||||
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, "Powering Off" );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
||||
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
|
||||
node_ptr->power_on = false ;
|
||||
}
|
||||
}
|
||||
break ;
|
||||
@ -4822,6 +4940,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
|
||||
|
||||
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
||||
node_ptr->power_on = false ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -5021,10 +5140,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
else
|
||||
{
|
||||
ilog ("%s is Powering On\n", node_ptr->hostname.c_str() );
|
||||
mtcInvApi_update_task ( node_ptr, "Powering On" );
|
||||
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, "Powering On" );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
||||
powerStageChange ( node_ptr , MTC_POWERON__DONE );
|
||||
node_ptr->power_on = true ;
|
||||
}
|
||||
}
|
||||
break ;
|
||||
@ -5067,6 +5188,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
||||
|
||||
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
||||
node_ptr->power_on = true ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -5083,7 +5205,10 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
ar_enable ( node_ptr );
|
||||
|
||||
mtcInvApi_force_task ( node_ptr, "" );
|
||||
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
|
||||
{
|
||||
mtcInvApi_force_task ( node_ptr, "" );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user