Fix Mtce Heartbeat period recovery on MNFA Exit

When Multi-Node Failure Avoidance (MNFA) occurs, maintenance commands the Heartbeat Agent to slow down by a factor of 4. The rate recovery following an MNFA is not occurring. Update https://review.opendev.org/#/c/701057 made a condition check change that introduced this issue by requiring mnfa_timeout to be non-zero before an attempt is made to recover the heartbeat period following MNFA recovery. This update switches that condition check to use the more specific mnfa_backoff state tracker and, because MNFA is a global maintenance mode feature rather than a node specific feature, moves the recovery check code from the node level fsm into a mnfa_recovery_handler called in the main select loop. Test Plan: PASS: Verify MNFA handling/recovery with mnfa_timeout!=0 that expires. PASS: Verify MNFA handling/recovery when mnfa_timeout!=0 but before the timeout expires. PASS: Verify MNFA handling/recovery when mnfa_timeout=0. PASS: Verify MNFA backoff rate recovery over mtcAgent process restart. PASS: Verify MNFA backoff rate is sent to hbsAgent if hbsAgent restarts while MNFA is active. Change-Id: I8da5a000ab503692c7cfa620233ed8aa772c50f8 Closes-Bug: #1893212 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2020-09-06 07:58:29 -04:00 · 2020-09-06 07:58:29 -04:00 · 2210c71216
commit 2210c71216
parent c7e18ca9e9
5 changed files with 71 additions and 15 deletions
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@ -1382,7 +1382,9 @@ public:
    /* the main fsm entrypoint to service all hosts */
    void fsm ( void ) ;

-   /** This controller's hostname set'er */
+    void mnfa_recovery_handler ( string & hostname );
+
+    /** This controller's hostname set'er */
    void   set_my_hostname ( string hostname );

    /** This controller's hostname get'er */
@ -1506,6 +1508,7 @@ public:
     *  node failure avoidance threshold and until there are no more
     *  in service trouble hosts */
    bool mnfa_active ;
+    bool mnfa_backoff = false ;
    void mnfa_cancel( void );

    std::list<string>           mnfa_awol_list ;
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
        ilog ("%s %s inventory push ... done",
                  controller.c_str(),
                  MTC_SERVICE_HBSAGENT_NAME);
+
+        /* Ensure that the hbsAgent heartbeat period is correct */
+        if ( obj_ptr->mnfa_backoff == true )
+            send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER );
+        else
+            send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER );
    }
    else
    {
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@ -1569,6 +1569,9 @@ void daemon_service_run ( void )
            continue ;
        }

+        /* Handle recovery from MNFA */
+        mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname );
+
        mtcInv.fsm ( );

        /* Initialize the master fd_set */
--- a/mtce/src/maintenance/mtcNodeFsm.cpp
+++ b/mtce/src/maintenance/mtcNodeFsm.cpp
@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
        return FAIL ;
    }

-    /* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */
-    if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true ))
-    {
-        mtcTimer_mnfa.ring = false ;
-        mnfa_exit ( true );
-    }
-
    /* handle clear task request */
    if ( node_ptr->clear_task == true )
    {
@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )

    /* Service the libEvent work queue */
    workQueue_process ( node_ptr ) ;
-    
+
    /* Service the maintenance command queue if there are commands waiting */
    if ( node_ptr->mtcCmd_work_fifo.size())
    {
--- a/mtce/src/maintenance/mtcNodeMnfa.cpp
+++ b/mtce/src/maintenance/mtcNodeMnfa.cpp
@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void )
     wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n");
     mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_ENTER );
     mnfa_active = true ;
-
+     mnfa_backoff = true ;
     send_hbs_command ( my_hostname, MTC_BACKOFF_HBS );

     /* Handle the case where we are already trying to recover from a
@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void )
         wlog ("MNFA Auto-Recovery in %d seconds\n",       this->mnfa_timeout);
         mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
     }
+     else
+     {
+         this->mtcTimer_mnfa.ring = false ;
+     }
     log_mnfa_pool ( mnfa_awol_list );
 }

@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force )
        /* Start the timer that will eventually send the MTC_RECOVER_HBS command */
        mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
    }
-    else
-    {
-        send_hbs_command ( my_hostname, MTC_RECOVER_HBS );
-    }
-
    mnfa_host_count[MGMNT_IFACE] = 0 ;
    mnfa_host_count[CLSTR_IFACE] = 0 ;
    mnfa_awol_list.clear();
@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void )
    }
    mnfa_awol_list.clear();
 }
+
+/**************************************************************************
+ *
+ * Name       : mnfa_recovery_handler
+ *
+ * Purpose    : Handle recovery from mnfa
+ *
+ * Description: Called from the main loop. Once the mnfa timer has rung
+ *              it forces the exit from MNFA if mnfa is still active,
+ *              otherwise sends the recover command to hbsAgent at base
+ *              level, retrying in 30 seconds if that send fails.
+ *
+ * Assumptions: The recover command must be sent at base level because
+ *              the mnfa timer handler should not be sending messages.
+ *
+ * Parameters : hostname - used only to prefix this handler's logs
+ *
+ **************************************************************************/
+
+void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
+{
+    /* nothing to do until the multi-Node-Failure Avoidance timer rings */
+    if ( this->mtcTimer_mnfa.ring != true )
+        return ;
+    /* rang due to mnfa_timeout */
+    if ( this->mnfa_active == true )
+    {
+        mtcTimer_mnfa.ring = false ;
+        mnfa_exit ( true );
+    }
+    /* rang due to 3 second recovery timer set in mnfa_exit */
+    else if ( this->mnfa_backoff == true )
+    {
+        /* acknowledge the ring here ; don't rely on a later timer start */
+        mtcTimer_mnfa.ring = false ;
+        ilog("%s heartbeat backoff recovery", hostname.c_str());
+        if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
+        {
+            this->mnfa_backoff = false ;
+        }
+        else
+        {
+            int retry_timeout = MTC_SECS_30 ;
+
+            /* in the case of a send failure, to avoid log flooding,
+             * start the timer again in 30 seconds */
+            mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
+            ilog("%s heartbeat backoff recovery command send failed, retrying in %d secs",
+                     hostname.c_str(), retry_timeout);
+        }
+    }
+}