Merge "Add SM process heartbeat and status to the hbs cluster"
This commit is contained in:
commit
c88c4cb7ee
@ -245,6 +245,7 @@ typedef struct
|
||||
long msecs ;
|
||||
} time_delta_type ;
|
||||
|
||||
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
|
||||
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
|
||||
int gettime ( time_debug_type & nowtime ) ;
|
||||
unsigned long long gettime_monotonic_nsec ( void );
|
||||
|
@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
|
||||
{
|
||||
delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
|
||||
}
|
||||
delta.msecs = (delta.msecs/1000);
|
||||
/* convert nsec time to msec time */
|
||||
delta.msecs = (delta.msecs/1000000);
|
||||
}
|
||||
return (PASS) ;
|
||||
}
|
||||
|
||||
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
|
||||
{
|
||||
/* Subtract before from after */
|
||||
|
||||
if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
|
||||
{
|
||||
delta.secs = delta.msecs = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
delta.secs = after.tv_sec - before.tv_sec ;
|
||||
if (after.tv_nsec < before.tv_nsec)
|
||||
{
|
||||
delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
|
||||
delta.secs-- ;
|
||||
}
|
||||
else
|
||||
{
|
||||
delta.msecs = after.tv_nsec - before.tv_nsec ;
|
||||
}
|
||||
/* convert nsec time to msec time */
|
||||
delta.msecs = (delta.msecs/1000000);
|
||||
}
|
||||
return (PASS) ;
|
||||
}
|
||||
|
@ -69,6 +69,19 @@ using namespace std;
|
||||
|
||||
#define MAX_LEN 1000
|
||||
|
||||
/* Stores the MONOTONIC time the last SM heartbeat was received
|
||||
* Heartbeat metrics
|
||||
* SM heartbeat period definition */
|
||||
static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
|
||||
static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
|
||||
static int sm_heartbeat_count_b2b_misses = 0 ;
|
||||
static int sm_heartbeat_count = 0 ;
|
||||
const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
|
||||
const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800;
|
||||
const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
|
||||
const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
|
||||
SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
|
||||
|
||||
/* Historical String data for mem_logs */
|
||||
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
|
||||
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
|
||||
@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
|
||||
*
|
||||
****************************************************************************/
|
||||
static int _hbs_sm_handler_log_throttle = 0 ;
|
||||
static int _hbs_sm_heartbeat_log_throttle = 0 ;
|
||||
void hbs_sm_handler ( void )
|
||||
{
|
||||
#define _MAX_MSG_LEN (80)
|
||||
@ -1437,10 +1451,27 @@ void hbs_sm_handler ( void )
|
||||
{
|
||||
if (( service == SUPPERTED_SERVICE ) &&
|
||||
( request == SUPPORTED_REQUEST ))
|
||||
{
|
||||
/* SM heartbeat pulses have a reqid = 0 and do not require a response */
|
||||
if ( reqid == 0 )
|
||||
{
|
||||
time_delta_type delta ;
|
||||
struct timespec ts = sm_heartbeat_timestamp_last ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
|
||||
if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
|
||||
{
|
||||
sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
|
||||
}
|
||||
timedelta (ts, sm_heartbeat_timestamp_last, delta );
|
||||
sm_heartbeat_count++ ;
|
||||
ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
|
||||
sm_heartbeat_count, delta.secs, delta.msecs);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* success path ... */
|
||||
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
|
||||
|
||||
}
|
||||
/* reset log throttle */
|
||||
_hbs_sm_handler_log_throttle = 0 ;
|
||||
}
|
||||
@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
|
||||
dlog ("... %s", sm_mesg );
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name ; manage_sm_heartbeat
|
||||
*
|
||||
* Purpose : Determine if we received an SM heartbeat message within
|
||||
* the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||
*
|
||||
* Description: Compare the monotonic now time to the monotonic time
|
||||
* of the last received SM heartbeat pulse.
|
||||
*
|
||||
* Returns : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||
* False if time delta is greater
|
||||
*
|
||||
***************************************************************************/
|
||||
bool manage_sm_heartbeat ( void )
|
||||
{
|
||||
struct timespec ts ;
|
||||
time_delta_type delta ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
timedelta (sm_heartbeat_timestamp_last, ts, delta );
|
||||
int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
|
||||
bool heartbeat_ok;
|
||||
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
|
||||
{
|
||||
sm_heartbeat_count = 0;
|
||||
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
|
||||
(!( sm_heartbeat_count_b2b_misses % 100 )))
|
||||
{
|
||||
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
|
||||
delta.secs, delta.msecs,
|
||||
sm_heartbeat_count_b2b_misses,
|
||||
sm_heartbeat_count);
|
||||
}
|
||||
heartbeat_ok = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(sm_heartbeat_count_b2b_misses)
|
||||
{
|
||||
int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
|
||||
|
||||
if(sm_heartbeat_count >= expected_beeps)
|
||||
{
|
||||
if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
|
||||
{
|
||||
ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
|
||||
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
|
||||
delta_in_ms,
|
||||
sm_heartbeat_count_b2b_misses);
|
||||
|
||||
sm_heartbeat_count_b2b_misses = 0;
|
||||
sm_heartbeat_count = 0;
|
||||
sm_heartbeat_timestamp_restart.tv_sec = 0;
|
||||
sm_heartbeat_timestamp_restart.tv_nsec = 0;
|
||||
heartbeat_ok = true;
|
||||
}else
|
||||
{
|
||||
ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
|
||||
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
|
||||
delta_in_ms,
|
||||
sm_heartbeat_count_b2b_misses);
|
||||
heartbeat_ok = false; // not good enough to declare recovered yet
|
||||
}
|
||||
}else
|
||||
{
|
||||
ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
|
||||
"Counting will restart.",
|
||||
delta_in_ms, expected_beeps - sm_heartbeat_count);
|
||||
sm_heartbeat_timestamp_restart = ts;
|
||||
sm_heartbeat_count = 1;
|
||||
heartbeat_ok = false; // recover is interrupted by further missing beep
|
||||
}
|
||||
}else
|
||||
{
|
||||
if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
|
||||
{
|
||||
ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
|
||||
delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||
);
|
||||
}
|
||||
heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
|
||||
}
|
||||
}
|
||||
return heartbeat_ok;
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : daemon_service_run
|
||||
@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
|
||||
hbs_state_audit ();
|
||||
}
|
||||
|
||||
/* run the first audit in 30 seconds */
|
||||
/* The first audit was run after 30 seconds but then the
|
||||
* continuous rate is every hour */
|
||||
mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
|
||||
}
|
||||
|
||||
@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
|
||||
sockets_init = true ;
|
||||
monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
|
||||
|
||||
/* Update Sm heartbeat time to now time */
|
||||
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
|
||||
|
||||
/* no need for the heartbeat audit in a simplex system */
|
||||
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
{
|
||||
@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
|
||||
*/
|
||||
else
|
||||
{
|
||||
bool heartbeat_ok = manage_sm_heartbeat();
|
||||
|
||||
/* manage vault wrt peer controller */
|
||||
hbs_cluster_peer();
|
||||
|
||||
@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
|
||||
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
||||
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
|
||||
{
|
||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
|
||||
}
|
||||
}
|
||||
/* log cluster throttled */
|
||||
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
|
||||
{
|
||||
hbs_state_audit ( );
|
||||
}
|
||||
hbsTimer.ring = false ;
|
||||
heartbeat_request = true ;
|
||||
seq_num++ ;
|
||||
|
@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
|
||||
*/
|
||||
void hbs_cluster_update ( iface_enum iface,
|
||||
unsigned short not_responding_hosts,
|
||||
bool storage_0_responding );
|
||||
bool storage_0_responding,
|
||||
bool sm_heartbeat_ok);
|
||||
|
||||
/* Called by the hbsAgent pulse transmitter to append this controllers
|
||||
* running cluster view in the next multicast pulse request.
|
||||
|
@ -217,7 +217,7 @@ void cluster_list ( void )
|
||||
|
||||
void cluster_storage0_state ( bool enabled )
|
||||
{
|
||||
if ( ctrl.cluster.storage0_enabled != enabled )
|
||||
if ( (bool)ctrl.cluster.storage0_enabled != enabled )
|
||||
{
|
||||
ctrl.cluster.storage0_enabled = enabled ;
|
||||
ilog ("storage-0 heartbeat state changed to %s",
|
||||
@ -305,35 +305,39 @@ void hbs_cluster_add ( string & hostname )
|
||||
void hbs_cluster_del ( string & hostname )
|
||||
{
|
||||
std::list<string>::iterator hostname_ptr ;
|
||||
bool found = false ;
|
||||
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
|
||||
hostname_ptr != ctrl.monitored_hostname_list.end() ;
|
||||
hostname_ptr++ )
|
||||
{
|
||||
if ( hostname_ptr->compare(hostname) == 0 )
|
||||
{
|
||||
found = true ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
if ( found == true )
|
||||
{
|
||||
ctrl.monitored_hostname_list.remove(hostname) ;
|
||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||
|
||||
/* Manage storage-0 state. */
|
||||
if ( hostname.compare(STORAGE_0) == 0 )
|
||||
{
|
||||
cluster_storage0_state ( false );
|
||||
}
|
||||
|
||||
/* If we get down to 0 monitored hosts then just start fresh */
|
||||
if (( ctrl.monitored_hosts ) == 0 )
|
||||
{
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||
}
|
||||
|
||||
ilog ("%s deleted from cluster", hostname.c_str());
|
||||
|
||||
cluster_list ();
|
||||
|
||||
hbs_cluster_change ( hostname + " deleted" );
|
||||
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
slog("%s not found in cluster list", hostname.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
|
||||
* 4. Maintain a back to back non-responding count for storage-0.
|
||||
* Once the count reaches the minimum threshold of
|
||||
* STORAGE_0_NR_THRESHOLD then the specific network history
|
||||
* is updated to indicate storgae-0 is not responding. Once
|
||||
* is updated to indicate storage-0 is not responding. Once
|
||||
* storage-0 starts responding again with a single response
|
||||
* then that network history is updated to indicate storage-0
|
||||
* is responding.
|
||||
@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )
|
||||
|
||||
void hbs_cluster_update ( iface_enum iface,
|
||||
unsigned short not_responding_hosts,
|
||||
bool storage_0_responding )
|
||||
bool storage_0_responding,
|
||||
bool sm_heartbeat_ok)
|
||||
{
|
||||
if ( ctrl.monitored_hosts == 0 )
|
||||
return ;
|
||||
@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
else
|
||||
return ;
|
||||
|
||||
if ( ctrl.monitored_hosts < not_responding_hosts )
|
||||
{
|
||||
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
|
||||
}
|
||||
|
||||
if ( not_responding_hosts )
|
||||
{
|
||||
clog ("controller-%d %s enabled:%d not responding:%d",
|
||||
@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
}
|
||||
}
|
||||
|
||||
/* update sm heartbeat status */
|
||||
if ( sm_heartbeat_ok == true )
|
||||
{
|
||||
history_ptr->sm_heartbeat_fail = false ;
|
||||
}
|
||||
else
|
||||
{
|
||||
history_ptr->sm_heartbeat_fail = true ;
|
||||
}
|
||||
|
||||
/* Manage storage-0 status. */
|
||||
if ( ctrl.cluster.storage0_enabled )
|
||||
{
|
||||
/* Handle storage-0 status change from not responding to responding. */
|
||||
if ( storage_0_responding == true )
|
||||
{
|
||||
if (history_ptr->storage0_responding == false)
|
||||
if ((bool)history_ptr->storage0_responding == false)
|
||||
{
|
||||
history_ptr->storage0_responding = true ;
|
||||
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
|
||||
@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
}
|
||||
|
||||
/* Handle storage-0 status change from responding to not responding. */
|
||||
if (( history_ptr->storage0_responding == true ) &&
|
||||
if (( (bool)history_ptr->storage0_responding == true ) &&
|
||||
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
|
||||
{
|
||||
history_ptr->storage0_responding = false ;
|
||||
@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
else
|
||||
{
|
||||
/* Typical path for storage-0 disabled or normal non-storage system case */
|
||||
if ( history_ptr->storage0_responding == true )
|
||||
if ( (bool)history_ptr->storage0_responding == true )
|
||||
history_ptr->storage0_responding = false ;
|
||||
|
||||
/* Handle clearing threshold count when storage-0 is not enabled. */
|
||||
@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
|
||||
history_ptr->entries++ ;
|
||||
|
||||
if ( ctrl.monitored_hosts < not_responding_hosts )
|
||||
{
|
||||
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
|
||||
}
|
||||
/* Update the history with this data. */
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
|
||||
|
@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
|
||||
}
|
||||
if ( storage0_enabled )
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
|
||||
history.controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
||||
history.storage0_responding ? "y" : "n",
|
||||
history.sm_heartbeat_fail ? "miss":" ok",
|
||||
str);
|
||||
}
|
||||
else
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
|
||||
history.controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
||||
history.sm_heartbeat_fail ? "miss":" ok ",
|
||||
str);
|
||||
}
|
||||
}
|
||||
|
@ -71,7 +71,10 @@ typedef struct
|
||||
{
|
||||
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
|
||||
unsigned short network :4 ; /* see mtce_hbs_network_enum */
|
||||
unsigned short reserved_bits :7 ; /* future - initted to 0 */
|
||||
unsigned short reserved_bits :6 ; /* future - initted to 0 */
|
||||
unsigned short sm_heartbeat_fail :1 ; /* SM Health
|
||||
0 = SM Heartbeat OK,
|
||||
1 = SM Heartbeat Failure */
|
||||
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
|
||||
unsigned short entries ; /* # of valid values in .entry */
|
||||
unsigned short entries_max ; /* max size of the enry array */
|
||||
|
@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number
|
||||
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
|
||||
; A value of zero means no refresh.
|
||||
; range is 0 or 600-22800
|
||||
|
||||
sm_heartbeat_loss_thld = 800 ; number of msecs SM heartbeat fails before
|
||||
; declaring an SM process heartbeat loss in
|
||||
; the heartbeat cluster.
|
||||
|
||||
autorecovery_threshold = 3 ; The number of times maintenance will try to
|
||||
; auto recover a critically failed controller
|
||||
; while there is no backup controllers to fail
|
||||
|
Loading…
Reference in New Issue
Block a user