Merge "Add SM process heartbeat and status to the hbs cluster"

2020-12-16 16:27:18 +00:00 · 2020-12-16 16:27:18 +00:00 · c88c4cb7ee
commit c88c4cb7ee
parent 932c78c455 8c81914ea5
8 changed files with 234 additions and 43 deletions
--- a/mtce-common/src/daemon/daemon_common.h
+++ b/mtce-common/src/daemon/daemon_common.h
@ -245,6 +245,7 @@ typedef struct
    long msecs ;
 } time_delta_type ;

+int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
 int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
 int gettime   ( time_debug_type & nowtime ) ;
 unsigned long long  gettime_monotonic_nsec ( void );
--- a/mtce-common/src/daemon/daemon_debug.cpp
+++ b/mtce-common/src/daemon/daemon_debug.cpp
@ -26,20 +26,20 @@ unsigned long long  gettime_monotonic_nsec ( void )
 {
    struct timespec ts;
    clock_gettime (CLOCK_MONOTONIC, &ts);
-    return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; 
+    return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
 }
-    
+
 int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta )
-{ 
+{
    /* Subtract before from after */

    if ((after.ts.tv_sec < before.ts.tv_sec) ||
        ((after.ts.tv_sec == before.ts.tv_sec) &&
-         (after.ts.tv_nsec <= before.ts.tv_nsec))) 
+         (after.ts.tv_nsec <= before.ts.tv_nsec)))
    {
        delta.secs = delta.msecs = 1 ;
-    } 
-    else 
+    }
+    else
    {
        delta.secs = after.ts.tv_sec - before.ts.tv_sec ;
        if (after.ts.tv_nsec < before.ts.tv_nsec)
@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
        {
            delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
        }
-        delta.msecs = (delta.msecs/1000);
+        /* convert nsec time to msec time */
+        delta.msecs = (delta.msecs/1000000);
+    }
+    return (PASS) ;
+}
+
+int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
+{
+    /* Subtract before from after */
+
+    if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
+    {
+        delta.secs = delta.msecs = 0 ;
+    }
+    else
+    {
+        delta.secs = after.tv_sec - before.tv_sec ;
+        if (after.tv_nsec < before.tv_nsec)
+        {
+            delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
+            delta.secs-- ;
+        }
+        else
+        {
+            delta.msecs = after.tv_nsec - before.tv_nsec ;
+        }
+        /* convert nsec time to msec time */
+        delta.msecs = (delta.msecs/1000000);
    }
    return (PASS) ;
 }
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@ -69,6 +69,19 @@ using namespace std;

 #define MAX_LEN 1000

+/* Stores the MONOTONIC time the last SM heartbeat was received
+ * Heartbeat metrics
+ * SM heartbeat period definition */
+static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
+static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
+static int sm_heartbeat_count_b2b_misses = 0 ;
+static int sm_heartbeat_count            = 0 ;
+const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
+const int SM_HEARTBEAT_PULSE_PERIOD_MSECS  = 800;
+const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
+const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
+    SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
+
 /* Historical String data for mem_logs */
 static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
 static string arrival_histogram[MAX_IFACES]     = { "" , "" } ;
@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
 *
 ****************************************************************************/
 static int _hbs_sm_handler_log_throttle = 0 ;
+static int _hbs_sm_heartbeat_log_throttle = 0 ;
 void hbs_sm_handler ( void )
 {
    #define _MAX_MSG_LEN           (80)
@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void )
                if (( service == SUPPERTED_SERVICE ) &&
                    ( request == SUPPORTED_REQUEST ))
                {
-                    /* success path ... */
-                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
-
+                    /* SM heartbeat pulses have a reqid = 0 and do not require a response */
+                    if ( reqid == 0 )
+                    {
+                        time_delta_type delta ;
+                        struct timespec ts = sm_heartbeat_timestamp_last ;
+                        clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
+                        if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
+                        {
+                            sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
+                        }
+                        timedelta (ts, sm_heartbeat_timestamp_last, delta );
+                        sm_heartbeat_count++ ;
+                        ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
+                              sm_heartbeat_count, delta.secs, delta.msecs);
+                    }
+                    else
+                    {
+                        /* success path ... */
+                        hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
+                    }
                    /* reset log throttle */
                   _hbs_sm_handler_log_throttle = 0 ;
                }
@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
    dlog ("... %s", sm_mesg );
 }

+/****************************************************************************
+ *
+ * Name       ; manage_sm_heartbeat
+ *
+ * Purpose    : Determine if we received an SM heartbeat message within
+ *              the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
+ *
+ * Description: Compare the monotonic now time to the monotonic time
+ *              of the last received  SM heartbeat pulse.
+ *
+ * Returns    : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
+ *              False if time delta is greater
+ *
+ ***************************************************************************/
+bool manage_sm_heartbeat ( void )
+{
+    struct timespec ts ;
+    time_delta_type delta ;
+    clock_gettime (CLOCK_MONOTONIC, &ts );
+    timedelta (sm_heartbeat_timestamp_last, ts, delta );
+    int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
+    bool heartbeat_ok;
+    if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
+    {
+        sm_heartbeat_count = 0;
+        if (( ++sm_heartbeat_count_b2b_misses < 20 )||
+            (!( sm_heartbeat_count_b2b_misses % 100 )))
+        {
+            wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
+                  delta.secs, delta.msecs,
+                  sm_heartbeat_count_b2b_misses,
+                  sm_heartbeat_count);
+        }
+        heartbeat_ok = false;
+    }
+    else
+    {
+        if(sm_heartbeat_count_b2b_misses)
+        {
+            int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
+
+            if(sm_heartbeat_count >= expected_beeps)
+            {
+                if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
+                {
+                    ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
+                          sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
+                          delta_in_ms,
+                          sm_heartbeat_count_b2b_misses);
+
+                    sm_heartbeat_count_b2b_misses = 0;
+                    sm_heartbeat_count = 0;
+                    sm_heartbeat_timestamp_restart.tv_sec = 0;
+                    sm_heartbeat_timestamp_restart.tv_nsec = 0;
+                    heartbeat_ok = true;
+                }else
+                {
+                    ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
+                          sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
+                          delta_in_ms,
+                          sm_heartbeat_count_b2b_misses);
+                    heartbeat_ok = false; // not good enough to declare recovered yet
+                }
+            }else
+            {
+                ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
+                     "Counting will restart.",
+                    delta_in_ms, expected_beeps - sm_heartbeat_count);
+                sm_heartbeat_timestamp_restart = ts;
+                sm_heartbeat_count = 1;
+                heartbeat_ok = false; // recover is interrupted by further missing beep
+            }
+        }else
+        {
+            if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
+            {
+                ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
+                    delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
+                );
+            }
+            heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
+        }
+    }
+    return heartbeat_ok;
+}
+
 /****************************************************************************
 *
 * Name       : daemon_service_run
@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
                hbs_state_audit ();
            }

-            /* run the first audit in 30 seconds */
+            /* The first audit was run after 30 seconds but then the
+             * continuous rate is every hour */
            mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
        }

@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
                    sockets_init = true ;
                    monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );

+                    /* Update Sm heartbeat time to now time */
+                    clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
+
                    /* no need for the heartbeat audit in a simplex system */
                    if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
                    {
@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
         */
        else
        {
+            bool heartbeat_ok = manage_sm_heartbeat();
+
            /* manage vault wrt peer controller */
            hbs_cluster_peer();

@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
                int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
                if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
                {
-                    hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
+                    hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
                }
            }
+            /* log cluster throttled */
+            if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
+            {
+                hbs_state_audit ( );
+            }
            hbsTimer.ring = false ;
            heartbeat_request = true ;
            seq_num++ ;
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
 */
 void hbs_cluster_update ( iface_enum iface,
                          unsigned short not_responding_hosts,
-                          bool storage_0_responding );
+                          bool storage_0_responding,
+                          bool sm_heartbeat_ok);

 /* Called by the hbsAgent pulse transmitter to append this controllers
 * running cluster view in the next multicast pulse request.
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@ -217,7 +217,7 @@ void cluster_list ( void )

 void cluster_storage0_state ( bool enabled )
 {
-    if ( ctrl.cluster.storage0_enabled != enabled )
+    if ( (bool)ctrl.cluster.storage0_enabled != enabled )
    {
        ctrl.cluster.storage0_enabled = enabled ;
        ilog ("storage-0 heartbeat state changed to %s",
@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname )
 void hbs_cluster_del ( string & hostname )
 {
    std::list<string>::iterator hostname_ptr ;
+    bool found = false ;
    for ( hostname_ptr  = ctrl.monitored_hostname_list.begin();
          hostname_ptr != ctrl.monitored_hostname_list.end() ;
          hostname_ptr++ )
    {
        if ( hostname_ptr->compare(hostname) == 0 )
        {
-            ctrl.monitored_hostname_list.remove(hostname) ;
-            ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
-
-            /* Manage storage-0 state. */
-            if ( hostname.compare(STORAGE_0) == 0 )
-            {
-                cluster_storage0_state ( false );
-            }
-
-            /* If we get down to 0 monitored hosts then just start fresh */
-            if (( ctrl.monitored_hosts ) == 0 )
-            {
-                hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
-            }
-
-            ilog ("%s deleted from cluster", hostname.c_str());
-
-            cluster_list ();
-
-            hbs_cluster_change ( hostname + " deleted" );
-
+            found = true ;
            break ;
        }
    }
+    if ( found == true )
+    {
+        ctrl.monitored_hostname_list.remove(hostname) ;
+        ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
+
+        /* Manage storage-0 state. */
+        if ( hostname.compare(STORAGE_0) == 0 )
+                cluster_storage0_state ( false );
+
+        /* If we get down to 0 monitored hosts then just start fresh */
+        if (( ctrl.monitored_hosts ) == 0 )
+                hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
+
+        ilog ("%s deleted from cluster", hostname.c_str());
+
+        cluster_list ();
+
+        hbs_cluster_change ( hostname + " deleted" );
+    }
+    else
+    {
+        slog("%s not found in cluster list", hostname.c_str());
+    }
 }

 /****************************************************************************
@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
 *               4. Maintain a back to back non-responding count for storage-0.
 *                  Once the count reaches the minimum threshold of
 *                  STORAGE_0_NR_THRESHOLD then the specific network history
- *                  is updated to indicate storgae-0 is not responding. Once
+ *                  is updated to indicate storage-0 is not responding. Once
 *                  storage-0 starts responding again with a single response
 *                  then that network history is updated to indicate storage-0
 *                  is responding.
@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )

 void hbs_cluster_update ( iface_enum iface,
                      unsigned short not_responding_hosts,
-                                bool storage_0_responding )
+                                bool storage_0_responding,
+                                bool sm_heartbeat_ok)
 {
    if ( ctrl.monitored_hosts == 0 )
        return ;
@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
    else
        return ;

+    if ( ctrl.monitored_hosts < not_responding_hosts )
+    {
+        slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
+    }
+
    if ( not_responding_hosts )
    {
        clog ("controller-%d %s enabled:%d not responding:%d",
@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
        }
    }

+    /* update sm heartbeat status */
+    if ( sm_heartbeat_ok == true )
+    {
+        history_ptr->sm_heartbeat_fail = false ;
+    }
+    else
+    {
+        history_ptr->sm_heartbeat_fail = true ;
+    }
+
    /* Manage storage-0 status. */
    if ( ctrl.cluster.storage0_enabled )
    {
        /* Handle storage-0 status change from not responding to responding. */
        if ( storage_0_responding == true )
        {
-            if (history_ptr->storage0_responding == false)
+            if ((bool)history_ptr->storage0_responding == false)
            {
                history_ptr->storage0_responding = true ;
                ilog ("controller-%d %s heartbeat ; storage-0 is ok",
@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
        }

        /* Handle storage-0 status change from responding to not responding. */
-        if (( history_ptr->storage0_responding == true ) &&
+        if (( (bool)history_ptr->storage0_responding == true ) &&
            ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
        {
            history_ptr->storage0_responding = false ;
@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
    else
    {
        /* Typical path for storage-0 disabled or normal non-storage system case */
-        if ( history_ptr->storage0_responding == true )
+        if ( (bool)history_ptr->storage0_responding == true )
            history_ptr->storage0_responding = false ;

        /* Handle clearing threshold count when storage-0 is not enabled. */
@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
    if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
        history_ptr->entries++ ;

+    if ( ctrl.monitored_hosts < not_responding_hosts )
+    {
+        slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
+    }
    /* Update the history with this data. */
    history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
    history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
--- a/mtce/src/heartbeat/hbsUtil.cpp
+++ b/mtce/src/heartbeat/hbsUtil.cpp
@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
    }
    if ( storage0_enabled )
    {
-        syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
                 history.controller,
                 hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
                 history.storage0_responding ? "y" : "n",
+                 history.sm_heartbeat_fail ? "miss":" ok",
                 str);
    }
    else
    {
-        syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
                 history.controller,
                 hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
+                 history.sm_heartbeat_fail ? "miss":" ok ",
                 str);
    }
 }
--- a/mtce/src/heartbeat/mtceHbsCluster.h
+++ b/mtce/src/heartbeat/mtceHbsCluster.h
@ -71,7 +71,10 @@ typedef struct
 {
    unsigned short controller         :4 ; /* value 0 or 1 (and 2 in future) */
    unsigned short network            :4 ; /* see mtce_hbs_network_enum      */
-    unsigned short reserved_bits      :7 ; /* future - initted to 0          */
+    unsigned short reserved_bits      :6 ; /* future - initted to 0          */
+    unsigned short sm_heartbeat_fail  :1 ; /* SM Health
+                                              0 = SM Heartbeat OK,
+                                              1 = SM Heartbeat Failure       */
    unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy    */
    unsigned short entries               ; /* # of valid values in .entry    */
    unsigned short entries_max           ; /* max size of the enry array     */
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@ -30,6 +30,11 @@ barbican_port = 9311          ; The Barbican Port Number
 token_refresh_rate = 1200     ; Authentication token refresh rate in seconds.
                              ; A value of zero means no refresh.
                              ; range is 0 or 600-22800
+
+sm_heartbeat_loss_thld = 800  ; number of msecs SM heartbeat fails before
+                              ; declaring an SM process heartbeat loss in
+                              ; the heartbeat cluster.
+
 autorecovery_threshold = 3    ; The number of times maintenance will try to
                              ;  auto recover a critically failed controller
                              ;  while there is no backup controllers to fail