Fix Mtce Heartbeat period recovery on MNFA Exit
When Multi-Node Failure Avoidance (MNFA) occurs, maintenance commands the Heartbeat Agent to slow down by a factor of 4. The rate recovery following an MNFA is not occurring. Update https://review.opendev.org/#/c/701057 made a condition check change that introduced this issue by requiring mnfa_timeout to be non-zero before an attempt is made to recover the heartbeat period following MNFA recovery. This update switches that condition check to use the more specific mnfa_backoff state tracker and, because MNFA is a global maintenance mode feature rather than a node specific feature, moves the recovery check code from the node level fsm into a mnfa_recovery_handler called in the main select loop. Test Plan: PASS: Verify MNFA handling/recovery with mnfa_timeout!=0 that expires. PASS: Verify MNFA handling/recovery when mnfa_timeout!=0 but before the timeout expires. PASS: Verify MNFA handling/recovery when mnfa_timeout=0. PASS: Verify MNFA backoff rate recovery over mtcAgent process restart. PASS: Verify MNFA backoff rate is sent to hbsAgent if hbsAgent restarts while MNFA is active. Change-Id: I8da5a000ab503692c7cfa620233ed8aa772c50f8 Closes-Bug: #1893212 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
c7e18ca9e9
commit
2210c71216
@ -1382,7 +1382,9 @@ public:
|
||||
/* the main fsm entrypoint to service all hosts */
|
||||
void fsm ( void ) ;
|
||||
|
||||
/** Service the MNFA timer ring: exit MNFA or recover the heartbeat period */
|
||||
void mnfa_recovery_handler ( string & hostname );
|
||||
|
||||
/** This controller's hostname set'er */
|
||||
void set_my_hostname ( string hostname );
|
||||
|
||||
/** This controller's hostname get'er */
|
||||
@ -1506,6 +1508,7 @@ public:
|
||||
* node failure avoidance threshold and until there are no more
|
||||
* in service trouble hosts */
|
||||
bool mnfa_active ;
|
||||
bool mnfa_backoff = false ;
|
||||
void mnfa_cancel( void );
|
||||
|
||||
std::list<string> mnfa_awol_list ;
|
||||
|
@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
||||
ilog ("%s %s inventory push ... done",
|
||||
controller.c_str(),
|
||||
MTC_SERVICE_HBSAGENT_NAME);
|
||||
|
||||
/* Ensure that the hbsAgent heartbeat period is correct */
|
||||
if ( obj_ptr->mnfa_backoff == true )
|
||||
send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER );
|
||||
else
|
||||
send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1569,6 +1569,9 @@ void daemon_service_run ( void )
|
||||
continue ;
|
||||
}
|
||||
|
||||
/* Handle recovery from MNFA */
|
||||
mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname );
|
||||
|
||||
mtcInv.fsm ( );
|
||||
|
||||
/* Initialize the master fd_set */
|
||||
|
@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
return FAIL ;
|
||||
}
|
||||
|
||||
/* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */
|
||||
if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true ))
|
||||
{
|
||||
mtcTimer_mnfa.ring = false ;
|
||||
mnfa_exit ( true );
|
||||
}
|
||||
|
||||
/* handle clear task request */
|
||||
if ( node_ptr->clear_task == true )
|
||||
{
|
||||
@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
/* Service the libEvent work queue */
|
||||
workQueue_process ( node_ptr ) ;
|
||||
|
||||
|
||||
/* Service the maintenance command queue if there are commands waiting */
|
||||
if ( node_ptr->mtcCmd_work_fifo.size())
|
||||
{
|
||||
|
@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n");
|
||||
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_ENTER );
|
||||
mnfa_active = true ;
|
||||
|
||||
mnfa_backoff = true ;
|
||||
send_hbs_command ( my_hostname, MTC_BACKOFF_HBS );
|
||||
|
||||
/* Handle the case where we are already trying to recover from a
|
||||
@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
|
||||
}
|
||||
else
|
||||
{
|
||||
this->mtcTimer_mnfa.ring = false ;
|
||||
}
|
||||
log_mnfa_pool ( mnfa_awol_list );
|
||||
}
|
||||
|
||||
@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
||||
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
|
||||
}
|
||||
else
|
||||
{
|
||||
send_hbs_command ( my_hostname, MTC_RECOVER_HBS );
|
||||
}
|
||||
|
||||
mnfa_host_count[MGMNT_IFACE] = 0 ;
|
||||
mnfa_host_count[CLSTR_IFACE] = 0 ;
|
||||
mnfa_awol_list.clear();
|
||||
@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void )
|
||||
}
|
||||
mnfa_awol_list.clear();
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Name : mnfa_recovery_handler
|
||||
*
|
||||
* Purpose : Handle recovery from mnfa
|
||||
*
|
||||
* Description: This handler is called from the main loop to handle
|
||||
* exiting MNFA and scheduling a timer to send the recover
|
||||
* command to hbsAgent at base level.
|
||||
*
|
||||
* Assumptions: Need to send the recover command to hbsAgent at base level.
|
||||
*
|
||||
* If mnfa is timer driven ( mnfa_timeout != 0 ) then exit
|
||||
* from mnfa happens within the mnfa timer handler which
|
||||
* should not be sending messages.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
|
||||
{
|
||||
/* if the multi-Node-Failure Avoidance timer rang
|
||||
* then run the recovery handler */
|
||||
if ( this->mtcTimer_mnfa.ring == true )
|
||||
{
|
||||
/* rang due to mnfa_timeout */
|
||||
if ( this->mnfa_active == true )
|
||||
{
|
||||
mtcTimer_mnfa.ring = false ;
|
||||
mnfa_exit ( true );
|
||||
}
|
||||
/* rang due to 3 second recovery timer set in mnfa_exit */
|
||||
else if ( this->mnfa_backoff == true )
|
||||
{
|
||||
ilog("%s heartbeat backoff recovery", hostname.c_str())
|
||||
if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
|
||||
{
|
||||
this->mnfa_backoff = false ;
|
||||
}
|
||||
else
|
||||
{
|
||||
int retry_timeout = MTC_SECS_30 ;
|
||||
|
||||
/* in the case of a send failure, to avoid log flooding,
|
||||
* start the timer again in 30 seconds */
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
|
||||
ilog("%s heartbeat backoff recovery command send failed, retrying in %d secs",
|
||||
hostname.c_str(), retry_timeout);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user