From 8c81914ea5ca33a70cbaf4b0de81384cb33cf13f Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 11 Sep 2020 20:43:11 -0400 Subject: [PATCH] Add SM process heartbeat and status to the hbs cluster This update is the mtc hbsAgent side of a new SM -> hbsAgent heartbeat algorithm for the purpose of detecting peer SM process stalls. This update adds an 'SM Heartbeat status' bit to the cluster view it injects into its multicast heartbeat requests. Its peer is able to read this on-going hbsAgent/SM heartbeat status through the cluster. The status bit reads 'ok' while the hbsAgent sees the SM heartbeat as steady. The status bit reads 'not ok' while the SM heartbeat is lost for longer than 800 msecs. Change-Id: I0f2079b0fafd7bce0b97ee26d29899659d66f81d Partial-Fix: #1895350 Co-Authored-By: Bin.Qian@windriver.com Signed-off-by: Eric MacDonald --- mtce-common/src/daemon/daemon_common.h | 1 + mtce-common/src/daemon/daemon_debug.cpp | 41 +++++-- mtce/src/heartbeat/hbsAgent.cpp | 138 +++++++++++++++++++++++- mtce/src/heartbeat/hbsBase.h | 3 +- mtce/src/heartbeat/hbsCluster.cpp | 78 +++++++++----- mtce/src/heartbeat/hbsUtil.cpp | 6 +- mtce/src/heartbeat/mtceHbsCluster.h | 5 +- mtce/src/scripts/mtc.conf | 5 + 8 files changed, 234 insertions(+), 43 deletions(-) diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h index 9ea16339..3f9ac031 100755 --- a/mtce-common/src/daemon/daemon_common.h +++ b/mtce-common/src/daemon/daemon_common.h @@ -245,6 +245,7 @@ typedef struct long msecs ; } time_delta_type ; +int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta ); int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta ); int gettime ( time_debug_type & nowtime ) ; unsigned long long gettime_monotonic_nsec ( void ); diff --git a/mtce-common/src/daemon/daemon_debug.cpp b/mtce-common/src/daemon/daemon_debug.cpp index 4a39ccfa..68e97652 100755 --- a/mtce-common/src/daemon/daemon_debug.cpp +++ b/mtce-common/src/daemon/daemon_debug.cpp @@ -26,20 +26,20 @@ unsigned long long gettime_monotonic_nsec ( void ) { struct timespec ts; clock_gettime (CLOCK_MONOTONIC, &ts); - return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; + return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; } - + int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta ) -{ +{ /* Subtract before from after */ if ((after.ts.tv_sec < before.ts.tv_sec) || ((after.ts.tv_sec == before.ts.tv_sec) && - (after.ts.tv_nsec <= before.ts.tv_nsec))) + (after.ts.tv_nsec <= before.ts.tv_nsec))) { delta.secs = delta.msecs = 1 ; - } - else + } + else { delta.secs = after.ts.tv_sec - before.ts.tv_sec ; if (after.ts.tv_nsec < before.ts.tv_nsec) @@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t { delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ; } - delta.msecs = (delta.msecs/1000); + /* convert nsec time to msec time */ + delta.msecs = (delta.msecs/1000000); + } + return (PASS) ; +} + +int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta ) +{ + /* Subtract before from after */ + + if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec))) + { + delta.secs = delta.msecs = 0 ; + } + else + { + delta.secs = after.tv_sec - before.tv_sec ; + if (after.tv_nsec < before.tv_nsec) + { + delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ; + 
delta.secs-- ; + } + else + { + delta.msecs = after.tv_nsec - before.tv_nsec ; + } + /* convert nsec time to msec time */ + delta.msecs = (delta.msecs/1000000); } return (PASS) ; } diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index b596baba..4eec5a29 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -69,6 +69,19 @@ using namespace std; #define MAX_LEN 1000 +/* Stores the MONOTONIC time the last SM heartbeat was received + * Heartbeat metrics + * SM heartbeat period definition */ +static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ; +static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ; +static int sm_heartbeat_count_b2b_misses = 0 ; +static int sm_heartbeat_count = 0 ; +const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100; +const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800; +const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2; +const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER = + SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC; + /* Historical String data for mem_logs */ static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; @@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype ) * ****************************************************************************/ static int _hbs_sm_handler_log_throttle = 0 ; +static int _hbs_sm_heartbeat_log_throttle = 0 ; void hbs_sm_handler ( void ) { #define _MAX_MSG_LEN (80) @@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void ) if (( service == SUPPERTED_SERVICE ) && ( request == SUPPORTED_REQUEST )) { - /* success path ... */ - hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" ); - + /* SM heartbeat pulses have a reqid = 0 and do not require a response */ + if ( reqid == 0 ) + { + time_delta_type delta ; + struct timespec ts = sm_heartbeat_timestamp_last ; + clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last ); + if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0) + { + sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last; + } + timedelta (ts, sm_heartbeat_timestamp_last, delta ); + sm_heartbeat_count++ ; + ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)", + sm_heartbeat_count, delta.secs, delta.msecs); + } + else + { + /* success path ... */ + hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" ); + } /* reset log throttle */ _hbs_sm_handler_log_throttle = 0 ; } @@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void ) dlog ("... %s", sm_mesg ); } +/**************************************************************************** + * + * Name ; manage_sm_heartbeat + * + * Purpose : Determine if we received an SM heartbeat message within + * the last SM_HEARTBEAT_PULSE_PERIOD_MSECS + * + * Description: Compare the monotonic now time to the monotonic time + * of the last received SM heartbeat pulse. 
+ * + * Returns : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS + * False if time delta is greater + * + ***************************************************************************/ +bool manage_sm_heartbeat ( void ) +{ + struct timespec ts ; + time_delta_type delta ; + clock_gettime (CLOCK_MONOTONIC, &ts ); + timedelta (sm_heartbeat_timestamp_last, ts, delta ); + int64_t delta_in_ms = delta.secs * 1000 + delta.msecs; + bool heartbeat_ok; + if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS ) + { + sm_heartbeat_count = 0; + if (( ++sm_heartbeat_count_b2b_misses < 20 )|| + (!( sm_heartbeat_count_b2b_misses % 100 ))) + { + wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d", + delta.secs, delta.msecs, + sm_heartbeat_count_b2b_misses, + sm_heartbeat_count); + } + heartbeat_ok = false; + } + else + { + if(sm_heartbeat_count_b2b_misses) + { + int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1; + + if(sm_heartbeat_count >= expected_beeps) + { + if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER) + { + ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing", + sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER, + delta_in_ms, + sm_heartbeat_count_b2b_misses); + + sm_heartbeat_count_b2b_misses = 0; + sm_heartbeat_count = 0; + sm_heartbeat_timestamp_restart.tv_sec = 0; + sm_heartbeat_timestamp_restart.tv_nsec = 0; + heartbeat_ok = true; + }else + { + ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing", + sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER, + delta_in_ms, + sm_heartbeat_count_b2b_misses); + heartbeat_ok = false; // not good enough to declare recovered yet + } + }else + { + ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. " + "Counting will restart.", + delta_in_ms, expected_beeps - sm_heartbeat_count); + sm_heartbeat_timestamp_restart = ts; + sm_heartbeat_count = 1; + heartbeat_ok = false; // recover is interrupted by further missing beep + } + }else + { + if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2) + { + ilog("SM Heartbeat missing for %ldms:%dms. 
Not yet declare stall.", + delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS + ); + } + heartbeat_ok = true; // not bad enough to declare heartbeat failed yet + } + } + return heartbeat_ok; +} + /**************************************************************************** * * Name : daemon_service_run @@ -1552,7 +1669,8 @@ void daemon_service_run ( void ) hbs_state_audit (); } - /* run the first audit in 30 seconds */ + /* The first audit was run after 30 seconds but then the + * continuous rate is every hour */ mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 ); } @@ -1719,6 +1837,9 @@ void daemon_service_run ( void ) sockets_init = true ; monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START ); + /* Update Sm heartbeat time to now time */ + clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last ); + /* no need for the heartbeat audit in a simplex system */ if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) { @@ -2363,6 +2484,8 @@ void daemon_service_run ( void ) */ else { + bool heartbeat_ok = manage_sm_heartbeat(); + /* manage vault wrt peer controller */ hbs_cluster_peer(); @@ -2396,9 +2519,14 @@ void daemon_service_run ( void ) int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding); if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled ) { - hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); + hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok ); } } + /* log cluster throttled */ + if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 ))) + { + hbs_state_audit ( ); + } hbsTimer.ring = false ; heartbeat_request = true ; seq_num++ ; diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index 932ed543..bfa8f1d1 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -290,7 +290,8 @@ void hbs_cluster_peer ( void ); */ void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, - bool storage_0_responding ); + bool storage_0_responding, + bool sm_heartbeat_ok); /* Called by the hbsAgent pulse transmitter to append this controllers * running cluster view in the next multicast pulse request. diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 10ac8991..780fa8e3 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -217,7 +217,7 @@ void cluster_list ( void ) void cluster_storage0_state ( bool enabled ) { - if ( ctrl.cluster.storage0_enabled != enabled ) + if ( (bool)ctrl.cluster.storage0_enabled != enabled ) { ctrl.cluster.storage0_enabled = enabled ; ilog ("storage-0 heartbeat state changed to %s", @@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname ) void hbs_cluster_del ( string & hostname ) { std::list::iterator hostname_ptr ; + bool found = false ; for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); hostname_ptr != ctrl.monitored_hostname_list.end() ; hostname_ptr++ ) { if ( hostname_ptr->compare(hostname) == 0 ) { - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); - - /* Manage storage-0 state. 
*/ - if ( hostname.compare(STORAGE_0) == 0 ) - { - cluster_storage0_state ( false ); - } - - /* If we get down to 0 monitored hosts then just start fresh */ - if (( ctrl.monitored_hosts ) == 0 ) - { - hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); - } - - ilog ("%s deleted from cluster", hostname.c_str()); - - cluster_list (); - - hbs_cluster_change ( hostname + " deleted" ); - + found = true ; break ; } } + if ( found == true ) + { + ctrl.monitored_hostname_list.remove(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + + /* Manage storage-0 state. */ + if ( hostname.compare(STORAGE_0) == 0 ) + cluster_storage0_state ( false ); + + /* If we get down to 0 monitored hosts then just start fresh */ + if (( ctrl.monitored_hosts ) == 0 ) + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); + + ilog ("%s deleted from cluster", hostname.c_str()); + + cluster_list (); + + hbs_cluster_change ( hostname + " deleted" ); + } + else + { + slog("%s not found in cluster list", hostname.c_str()); + } } /**************************************************************************** @@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void ) * 4. Maintain a back to back non-responding count for storage-0. * Once the count reaches the minimum threshold of * STORAGE_0_NR_THRESHOLD then the specific network history - * is updated to indicate storgae-0 is not responding. Once + * is updated to indicate storage-0 is not responding. Once * storage-0 starts responding again with a single response * then that network history is updated to indicate storage-0 * is responding. @@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void ) void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, - bool storage_0_responding ) + bool storage_0_responding, + bool sm_heartbeat_ok) { if ( ctrl.monitored_hosts == 0 ) return ; @@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface, else return ; + if ( ctrl.monitored_hosts < not_responding_hosts ) + { + slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts ); + } + if ( not_responding_hosts ) { clog ("controller-%d %s enabled:%d not responding:%d", @@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface, } } + /* update sm heartbeat status */ + if ( sm_heartbeat_ok == true ) + { + history_ptr->sm_heartbeat_fail = false ; + } + else + { + history_ptr->sm_heartbeat_fail = true ; + } + /* Manage storage-0 status. */ if ( ctrl.cluster.storage0_enabled ) { /* Handle storage-0 status change from not responding to responding. */ if ( storage_0_responding == true ) { - if (history_ptr->storage0_responding == false) + if ((bool)history_ptr->storage0_responding == false) { history_ptr->storage0_responding = true ; ilog ("controller-%d %s heartbeat ; storage-0 is ok", @@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface, } /* Handle storage-0 status change from responding to not responding. 
*/ - if (( history_ptr->storage0_responding == true ) && + if (( (bool)history_ptr->storage0_responding == true ) && ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD )) { history_ptr->storage0_responding = false ; @@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface, else { /* Typical path for storage-0 disabled or normal non-storage system case */ - if ( history_ptr->storage0_responding == true ) + if ( (bool)history_ptr->storage0_responding == true ) history_ptr->storage0_responding = false ; /* Handle clearing threshold count when storage-0 is not enabled. */ @@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface, if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES ) history_ptr->entries++ ; + if ( ctrl.monitored_hosts < not_responding_hosts ) + { + slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts ); + } /* Update the history with this data. */ history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp index ae2a81d3..a9875ed5 100644 --- a/mtce/src/heartbeat/hbsUtil.cpp +++ b/mtce/src/heartbeat/hbsUtil.cpp @@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e } if ( storage0_enabled ) { - syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", + syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s", history.controller, hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), history.storage0_responding ? "y" : "n", + history.sm_heartbeat_fail ? "miss":" ok", str); } else { - syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", + syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s", history.controller, hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), + history.sm_heartbeat_fail ? "miss":" ok ", str); } } diff --git a/mtce/src/heartbeat/mtceHbsCluster.h b/mtce/src/heartbeat/mtceHbsCluster.h index 02255226..7690285c 100644 --- a/mtce/src/heartbeat/mtceHbsCluster.h +++ b/mtce/src/heartbeat/mtceHbsCluster.h @@ -71,7 +71,10 @@ typedef struct { unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */ unsigned short network :4 ; /* see mtce_hbs_network_enum */ - unsigned short reserved_bits :7 ; /* future - initted to 0 */ + unsigned short reserved_bits :6 ; /* future - initted to 0 */ + unsigned short sm_heartbeat_fail :1 ; /* SM Health + 0 = SM Heartbeat OK, + 1 = SM Heartbeat Failure */ unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */ unsigned short entries ; /* # of valid values in .entry */ unsigned short entries_max ; /* max size of the enry array */ diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index 1b431004..0a2b005c 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number token_refresh_rate = 1200 ; Authentication token refresh rate in seconds. ; A value of zero means no refresh. ; range is 0 or 600-22800 + +sm_heartbeat_loss_thld = 800 ; number of msecs SM heartbeat fails before + ; declaring an SM process heartbeat loss in + ; the heartbeat cluster. + autorecovery_threshold = 3 ; The number of times maintenance will try to ; auto recover a critically failed controller ; while there is no backup controllers to fail
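
The detection side of this change reduces to a monotonic-clock age check against the 800 msec loss threshold described in the commit message. The fragment below is a minimal standalone sketch of that check, not the patch's code: the helper name sm_pulse_is_fresh, the last_pulse argument and the SM_LOSS_THLD_MSEC constant stand in for manage_sm_heartbeat(), sm_heartbeat_timestamp_last and SM_HEARTBEAT_PULSE_PERIOD_MSECS respectively.

    #include <time.h>

    /* illustrative threshold ; mirrors sm_heartbeat_loss_thld in mtc.conf */
    #define SM_LOSS_THLD_MSEC 800

    /* Return true while the newest SM pulse is younger than the loss
     * threshold ; false once it has gone stale. */
    static bool sm_pulse_is_fresh ( const struct timespec & last_pulse )
    {
        struct timespec now ;
        clock_gettime ( CLOCK_MONOTONIC, &now );

        long long age_msec =
            ((long long)now.tv_sec  - last_pulse.tv_sec ) * 1000LL +
            ((long long)now.tv_nsec - last_pulse.tv_nsec) / 1000000LL ;

        return ( age_msec <= SM_LOSS_THLD_MSEC ) ;
    }

In the actual handler the timestamp is refreshed whenever an SM message arrives with reqid == 0, so the measured age only grows while SM stops pulsing.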
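
The hbsAgent publishes the result through the cluster history record rather than through a dedicated message: reserved_bits shrinks from 7 to 6 and the freed bit becomes sm_heartbeat_fail. Assuming the mtce_hbs_cluster_history_type layout above, a peer that has already extracted a history record from a received pulse request could test the bit with a trivial helper like the one below (the helper itself is illustrative and is not added by the patch):

    /* 'history' is one mtce_hbs_cluster_history_type pulled from the
     * peer controller's multicast pulse request */
    static bool peer_sm_heartbeat_ok ( const mtce_hbs_cluster_history_type & history )
    {
        /* sm_heartbeat_fail : 0 = SM heartbeat OK, 1 = SM heartbeat failure */
        return ( history.sm_heartbeat_fail == 0 ) ;
    }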
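
Recovery is intentionally slower than detection: a loss is declared after one missed 800 msec period, but the 'ok' state is only re-asserted after SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC (2 x 800 msec) of uninterrupted pulses, i.e. 1600 / 100 = 16 back-to-back beeps (SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER). The derivation, with values copied from hbsAgent.cpp:

    const int interval_msec = 100 ;               /* pulse interval            */
    const int period_msec   = 800 ;               /* loss threshold            */
    const int recover_msec  = period_msec * 2 ;   /* recovery window = 1600    */
    const int beeps_needed  = recover_msec / interval_msec ;   /* = 16 beeps   */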