Merge "Add SM process heartbeat and status to the hbs cluster"
This commit is contained in:
commit
c88c4cb7ee
@ -245,6 +245,7 @@ typedef struct
|
|||||||
long msecs ;
|
long msecs ;
|
||||||
} time_delta_type ;
|
} time_delta_type ;
|
||||||
|
|
||||||
|
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
|
||||||
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
|
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
|
||||||
int gettime ( time_debug_type & nowtime ) ;
|
int gettime ( time_debug_type & nowtime ) ;
|
||||||
unsigned long long gettime_monotonic_nsec ( void );
|
unsigned long long gettime_monotonic_nsec ( void );
|
||||||
|
@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
|
|||||||
{
|
{
|
||||||
delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
|
delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
|
||||||
}
|
}
|
||||||
delta.msecs = (delta.msecs/1000);
|
/* convert nsec time to msec time */
|
||||||
|
delta.msecs = (delta.msecs/1000000);
|
||||||
|
}
|
||||||
|
return (PASS) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
|
||||||
|
{
|
||||||
|
/* Subtract before from after */
|
||||||
|
|
||||||
|
if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
|
||||||
|
{
|
||||||
|
delta.secs = delta.msecs = 0 ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
delta.secs = after.tv_sec - before.tv_sec ;
|
||||||
|
if (after.tv_nsec < before.tv_nsec)
|
||||||
|
{
|
||||||
|
delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
|
||||||
|
delta.secs-- ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
delta.msecs = after.tv_nsec - before.tv_nsec ;
|
||||||
|
}
|
||||||
|
/* convert nsec time to msec time */
|
||||||
|
delta.msecs = (delta.msecs/1000000);
|
||||||
}
|
}
|
||||||
return (PASS) ;
|
return (PASS) ;
|
||||||
}
|
}
|
||||||
|
@ -69,6 +69,19 @@ using namespace std;
|
|||||||
|
|
||||||
#define MAX_LEN 1000
|
#define MAX_LEN 1000
|
||||||
|
|
||||||
|
/* Stores the MONOTONIC time the last SM heartbeat was received
|
||||||
|
* Heartbeat metrics
|
||||||
|
* SM heartbeat period definition */
|
||||||
|
static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
|
||||||
|
static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
|
||||||
|
static int sm_heartbeat_count_b2b_misses = 0 ;
|
||||||
|
static int sm_heartbeat_count = 0 ;
|
||||||
|
const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
|
||||||
|
const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800;
|
||||||
|
const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
|
||||||
|
const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
|
||||||
|
SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
|
||||||
|
|
||||||
/* Historical String data for mem_logs */
|
/* Historical String data for mem_logs */
|
||||||
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
|
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
|
||||||
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
|
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
|
||||||
@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
|
|||||||
*
|
*
|
||||||
****************************************************************************/
|
****************************************************************************/
|
||||||
static int _hbs_sm_handler_log_throttle = 0 ;
|
static int _hbs_sm_handler_log_throttle = 0 ;
|
||||||
|
static int _hbs_sm_heartbeat_log_throttle = 0 ;
|
||||||
void hbs_sm_handler ( void )
|
void hbs_sm_handler ( void )
|
||||||
{
|
{
|
||||||
#define _MAX_MSG_LEN (80)
|
#define _MAX_MSG_LEN (80)
|
||||||
@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void )
|
|||||||
if (( service == SUPPERTED_SERVICE ) &&
|
if (( service == SUPPERTED_SERVICE ) &&
|
||||||
( request == SUPPORTED_REQUEST ))
|
( request == SUPPORTED_REQUEST ))
|
||||||
{
|
{
|
||||||
/* success path ... */
|
/* SM heartbeat pulses have a reqid = 0 and do not require a response */
|
||||||
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
|
if ( reqid == 0 )
|
||||||
|
{
|
||||||
|
time_delta_type delta ;
|
||||||
|
struct timespec ts = sm_heartbeat_timestamp_last ;
|
||||||
|
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
|
||||||
|
if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
|
||||||
|
{
|
||||||
|
sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
|
||||||
|
}
|
||||||
|
timedelta (ts, sm_heartbeat_timestamp_last, delta );
|
||||||
|
sm_heartbeat_count++ ;
|
||||||
|
ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
|
||||||
|
sm_heartbeat_count, delta.secs, delta.msecs);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* success path ... */
|
||||||
|
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
|
||||||
|
}
|
||||||
/* reset log throttle */
|
/* reset log throttle */
|
||||||
_hbs_sm_handler_log_throttle = 0 ;
|
_hbs_sm_handler_log_throttle = 0 ;
|
||||||
}
|
}
|
||||||
@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
|
|||||||
dlog ("... %s", sm_mesg );
|
dlog ("... %s", sm_mesg );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
*
|
||||||
|
* Name ; manage_sm_heartbeat
|
||||||
|
*
|
||||||
|
* Purpose : Determine if we received an SM heartbeat message within
|
||||||
|
* the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||||
|
*
|
||||||
|
* Description: Compare the monotonic now time to the monotonic time
|
||||||
|
* of the last received SM heartbeat pulse.
|
||||||
|
*
|
||||||
|
* Returns : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||||
|
* False if time delta is greater
|
||||||
|
*
|
||||||
|
***************************************************************************/
|
||||||
|
bool manage_sm_heartbeat ( void )
|
||||||
|
{
|
||||||
|
struct timespec ts ;
|
||||||
|
time_delta_type delta ;
|
||||||
|
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||||
|
timedelta (sm_heartbeat_timestamp_last, ts, delta );
|
||||||
|
int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
|
||||||
|
bool heartbeat_ok;
|
||||||
|
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
|
||||||
|
{
|
||||||
|
sm_heartbeat_count = 0;
|
||||||
|
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
|
||||||
|
(!( sm_heartbeat_count_b2b_misses % 100 )))
|
||||||
|
{
|
||||||
|
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
|
||||||
|
delta.secs, delta.msecs,
|
||||||
|
sm_heartbeat_count_b2b_misses,
|
||||||
|
sm_heartbeat_count);
|
||||||
|
}
|
||||||
|
heartbeat_ok = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if(sm_heartbeat_count_b2b_misses)
|
||||||
|
{
|
||||||
|
int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
|
||||||
|
|
||||||
|
if(sm_heartbeat_count >= expected_beeps)
|
||||||
|
{
|
||||||
|
if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
|
||||||
|
{
|
||||||
|
ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
|
||||||
|
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
|
||||||
|
delta_in_ms,
|
||||||
|
sm_heartbeat_count_b2b_misses);
|
||||||
|
|
||||||
|
sm_heartbeat_count_b2b_misses = 0;
|
||||||
|
sm_heartbeat_count = 0;
|
||||||
|
sm_heartbeat_timestamp_restart.tv_sec = 0;
|
||||||
|
sm_heartbeat_timestamp_restart.tv_nsec = 0;
|
||||||
|
heartbeat_ok = true;
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
|
||||||
|
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
|
||||||
|
delta_in_ms,
|
||||||
|
sm_heartbeat_count_b2b_misses);
|
||||||
|
heartbeat_ok = false; // not good enough to declare recovered yet
|
||||||
|
}
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
|
||||||
|
"Counting will restart.",
|
||||||
|
delta_in_ms, expected_beeps - sm_heartbeat_count);
|
||||||
|
sm_heartbeat_timestamp_restart = ts;
|
||||||
|
sm_heartbeat_count = 1;
|
||||||
|
heartbeat_ok = false; // recover is interrupted by further missing beep
|
||||||
|
}
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
|
||||||
|
{
|
||||||
|
ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
|
||||||
|
delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
|
||||||
|
);
|
||||||
|
}
|
||||||
|
heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return heartbeat_ok;
|
||||||
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
*
|
*
|
||||||
* Name : daemon_service_run
|
* Name : daemon_service_run
|
||||||
@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
|
|||||||
hbs_state_audit ();
|
hbs_state_audit ();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* run the first audit in 30 seconds */
|
/* The first audit was run after 30 seconds but then the
|
||||||
|
* continuous rate is every hour */
|
||||||
mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
|
mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
|
|||||||
sockets_init = true ;
|
sockets_init = true ;
|
||||||
monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
|
monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
|
||||||
|
|
||||||
|
/* Update Sm heartbeat time to now time */
|
||||||
|
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
|
||||||
|
|
||||||
/* no need for the heartbeat audit in a simplex system */
|
/* no need for the heartbeat audit in a simplex system */
|
||||||
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||||
{
|
{
|
||||||
@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
|
|||||||
*/
|
*/
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
bool heartbeat_ok = manage_sm_heartbeat();
|
||||||
|
|
||||||
/* manage vault wrt peer controller */
|
/* manage vault wrt peer controller */
|
||||||
hbs_cluster_peer();
|
hbs_cluster_peer();
|
||||||
|
|
||||||
@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
|
|||||||
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
||||||
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
|
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
|
||||||
{
|
{
|
||||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* log cluster throttled */
|
||||||
|
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
|
||||||
|
{
|
||||||
|
hbs_state_audit ( );
|
||||||
|
}
|
||||||
hbsTimer.ring = false ;
|
hbsTimer.ring = false ;
|
||||||
heartbeat_request = true ;
|
heartbeat_request = true ;
|
||||||
seq_num++ ;
|
seq_num++ ;
|
||||||
|
@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
|
|||||||
*/
|
*/
|
||||||
void hbs_cluster_update ( iface_enum iface,
|
void hbs_cluster_update ( iface_enum iface,
|
||||||
unsigned short not_responding_hosts,
|
unsigned short not_responding_hosts,
|
||||||
bool storage_0_responding );
|
bool storage_0_responding,
|
||||||
|
bool sm_heartbeat_ok);
|
||||||
|
|
||||||
/* Called by the hbsAgent pulse transmitter to append this controllers
|
/* Called by the hbsAgent pulse transmitter to append this controllers
|
||||||
* running cluster view in the next multicast pulse request.
|
* running cluster view in the next multicast pulse request.
|
||||||
|
@ -217,7 +217,7 @@ void cluster_list ( void )
|
|||||||
|
|
||||||
void cluster_storage0_state ( bool enabled )
|
void cluster_storage0_state ( bool enabled )
|
||||||
{
|
{
|
||||||
if ( ctrl.cluster.storage0_enabled != enabled )
|
if ( (bool)ctrl.cluster.storage0_enabled != enabled )
|
||||||
{
|
{
|
||||||
ctrl.cluster.storage0_enabled = enabled ;
|
ctrl.cluster.storage0_enabled = enabled ;
|
||||||
ilog ("storage-0 heartbeat state changed to %s",
|
ilog ("storage-0 heartbeat state changed to %s",
|
||||||
@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname )
|
|||||||
void hbs_cluster_del ( string & hostname )
|
void hbs_cluster_del ( string & hostname )
|
||||||
{
|
{
|
||||||
std::list<string>::iterator hostname_ptr ;
|
std::list<string>::iterator hostname_ptr ;
|
||||||
|
bool found = false ;
|
||||||
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
|
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
|
||||||
hostname_ptr != ctrl.monitored_hostname_list.end() ;
|
hostname_ptr != ctrl.monitored_hostname_list.end() ;
|
||||||
hostname_ptr++ )
|
hostname_ptr++ )
|
||||||
{
|
{
|
||||||
if ( hostname_ptr->compare(hostname) == 0 )
|
if ( hostname_ptr->compare(hostname) == 0 )
|
||||||
{
|
{
|
||||||
ctrl.monitored_hostname_list.remove(hostname) ;
|
found = true ;
|
||||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
|
||||||
|
|
||||||
/* Manage storage-0 state. */
|
|
||||||
if ( hostname.compare(STORAGE_0) == 0 )
|
|
||||||
{
|
|
||||||
cluster_storage0_state ( false );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If we get down to 0 monitored hosts then just start fresh */
|
|
||||||
if (( ctrl.monitored_hosts ) == 0 )
|
|
||||||
{
|
|
||||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
|
||||||
}
|
|
||||||
|
|
||||||
ilog ("%s deleted from cluster", hostname.c_str());
|
|
||||||
|
|
||||||
cluster_list ();
|
|
||||||
|
|
||||||
hbs_cluster_change ( hostname + " deleted" );
|
|
||||||
|
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if ( found == true )
|
||||||
|
{
|
||||||
|
ctrl.monitored_hostname_list.remove(hostname) ;
|
||||||
|
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||||
|
|
||||||
|
/* Manage storage-0 state. */
|
||||||
|
if ( hostname.compare(STORAGE_0) == 0 )
|
||||||
|
cluster_storage0_state ( false );
|
||||||
|
|
||||||
|
/* If we get down to 0 monitored hosts then just start fresh */
|
||||||
|
if (( ctrl.monitored_hosts ) == 0 )
|
||||||
|
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||||
|
|
||||||
|
ilog ("%s deleted from cluster", hostname.c_str());
|
||||||
|
|
||||||
|
cluster_list ();
|
||||||
|
|
||||||
|
hbs_cluster_change ( hostname + " deleted" );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
slog("%s not found in cluster list", hostname.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
|
|||||||
* 4. Maintain a back to back non-responding count for storage-0.
|
* 4. Maintain a back to back non-responding count for storage-0.
|
||||||
* Once the count reaches the minimum threshold of
|
* Once the count reaches the minimum threshold of
|
||||||
* STORAGE_0_NR_THRESHOLD then the specific network history
|
* STORAGE_0_NR_THRESHOLD then the specific network history
|
||||||
* is updated to indicate storgae-0 is not responding. Once
|
* is updated to indicate storage-0 is not responding. Once
|
||||||
* storage-0 starts responding again with a single response
|
* storage-0 starts responding again with a single response
|
||||||
* then that network history is updated to indicate storage-0
|
* then that network history is updated to indicate storage-0
|
||||||
* is responding.
|
* is responding.
|
||||||
@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )
|
|||||||
|
|
||||||
void hbs_cluster_update ( iface_enum iface,
|
void hbs_cluster_update ( iface_enum iface,
|
||||||
unsigned short not_responding_hosts,
|
unsigned short not_responding_hosts,
|
||||||
bool storage_0_responding )
|
bool storage_0_responding,
|
||||||
|
bool sm_heartbeat_ok)
|
||||||
{
|
{
|
||||||
if ( ctrl.monitored_hosts == 0 )
|
if ( ctrl.monitored_hosts == 0 )
|
||||||
return ;
|
return ;
|
||||||
@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
|
|||||||
else
|
else
|
||||||
return ;
|
return ;
|
||||||
|
|
||||||
|
if ( ctrl.monitored_hosts < not_responding_hosts )
|
||||||
|
{
|
||||||
|
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
|
||||||
|
}
|
||||||
|
|
||||||
if ( not_responding_hosts )
|
if ( not_responding_hosts )
|
||||||
{
|
{
|
||||||
clog ("controller-%d %s enabled:%d not responding:%d",
|
clog ("controller-%d %s enabled:%d not responding:%d",
|
||||||
@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* update sm heartbeat status */
|
||||||
|
if ( sm_heartbeat_ok == true )
|
||||||
|
{
|
||||||
|
history_ptr->sm_heartbeat_fail = false ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
history_ptr->sm_heartbeat_fail = true ;
|
||||||
|
}
|
||||||
|
|
||||||
/* Manage storage-0 status. */
|
/* Manage storage-0 status. */
|
||||||
if ( ctrl.cluster.storage0_enabled )
|
if ( ctrl.cluster.storage0_enabled )
|
||||||
{
|
{
|
||||||
/* Handle storage-0 status change from not responding to responding. */
|
/* Handle storage-0 status change from not responding to responding. */
|
||||||
if ( storage_0_responding == true )
|
if ( storage_0_responding == true )
|
||||||
{
|
{
|
||||||
if (history_ptr->storage0_responding == false)
|
if ((bool)history_ptr->storage0_responding == false)
|
||||||
{
|
{
|
||||||
history_ptr->storage0_responding = true ;
|
history_ptr->storage0_responding = true ;
|
||||||
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
|
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
|
||||||
@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Handle storage-0 status change from responding to not responding. */
|
/* Handle storage-0 status change from responding to not responding. */
|
||||||
if (( history_ptr->storage0_responding == true ) &&
|
if (( (bool)history_ptr->storage0_responding == true ) &&
|
||||||
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
|
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
|
||||||
{
|
{
|
||||||
history_ptr->storage0_responding = false ;
|
history_ptr->storage0_responding = false ;
|
||||||
@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* Typical path for storage-0 disabled or normal non-storage system case */
|
/* Typical path for storage-0 disabled or normal non-storage system case */
|
||||||
if ( history_ptr->storage0_responding == true )
|
if ( (bool)history_ptr->storage0_responding == true )
|
||||||
history_ptr->storage0_responding = false ;
|
history_ptr->storage0_responding = false ;
|
||||||
|
|
||||||
/* Handle clearing threshold count when storage-0 is not enabled. */
|
/* Handle clearing threshold count when storage-0 is not enabled. */
|
||||||
@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
|
|||||||
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
|
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
|
||||||
history_ptr->entries++ ;
|
history_ptr->entries++ ;
|
||||||
|
|
||||||
|
if ( ctrl.monitored_hosts < not_responding_hosts )
|
||||||
|
{
|
||||||
|
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
|
||||||
|
}
|
||||||
/* Update the history with this data. */
|
/* Update the history with this data. */
|
||||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
|
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
|
||||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
|
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
|
||||||
|
@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
|
|||||||
}
|
}
|
||||||
if ( storage0_enabled )
|
if ( storage0_enabled )
|
||||||
{
|
{
|
||||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
|
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
|
||||||
history.controller,
|
history.controller,
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
||||||
history.storage0_responding ? "y" : "n",
|
history.storage0_responding ? "y" : "n",
|
||||||
|
history.sm_heartbeat_fail ? "miss":" ok",
|
||||||
str);
|
str);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
|
syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
|
||||||
history.controller,
|
history.controller,
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
|
||||||
|
history.sm_heartbeat_fail ? "miss":" ok ",
|
||||||
str);
|
str);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -71,7 +71,10 @@ typedef struct
|
|||||||
{
|
{
|
||||||
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
|
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
|
||||||
unsigned short network :4 ; /* see mtce_hbs_network_enum */
|
unsigned short network :4 ; /* see mtce_hbs_network_enum */
|
||||||
unsigned short reserved_bits :7 ; /* future - initted to 0 */
|
unsigned short reserved_bits :6 ; /* future - initted to 0 */
|
||||||
|
unsigned short sm_heartbeat_fail :1 ; /* SM Health
|
||||||
|
0 = SM Heartbeat OK,
|
||||||
|
1 = SM Heartbeat Failure */
|
||||||
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
|
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
|
||||||
unsigned short entries ; /* # of valid values in .entry */
|
unsigned short entries ; /* # of valid values in .entry */
|
||||||
unsigned short entries_max ; /* max size of the enry array */
|
unsigned short entries_max ; /* max size of the enry array */
|
||||||
|
@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number
|
|||||||
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
|
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
|
||||||
; A value of zero means no refresh.
|
; A value of zero means no refresh.
|
||||||
; range is 0 or 600-22800
|
; range is 0 or 600-22800
|
||||||
|
|
||||||
|
sm_heartbeat_loss_thld = 800 ; number of msecs SM heartbeat fails before
|
||||||
|
; declaring an SM process heartbeat loss in
|
||||||
|
; the heartbeat cluster.
|
||||||
|
|
||||||
autorecovery_threshold = 3 ; The number of times maintenance will try to
|
autorecovery_threshold = 3 ; The number of times maintenance will try to
|
||||||
; auto recover a critically failed controller
|
; auto recover a critically failed controller
|
||||||
; while there is no backup controllers to fail
|
; while there is no backup controllers to fail
|
||||||
|
Loading…
Reference in New Issue
Block a user