Merge "Mtce: Improve robustness of heartbeat Loss reporting"
This commit is contained in:
commit
7512c6b105
@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
||||
|
||||
|
||||
|
||||
|
||||
#define HBS_LOSS_REPORT_THROTTLE (100)
|
||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
{
|
||||
int lost = 0 ;
|
||||
@ -8408,16 +8408,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
( infra_degrade_only == true ))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
|
||||
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
|
||||
( pulse_ptr->hbs_failure[iface] == false ))
|
||||
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
|
@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
|
||||
memset (&event, 0 , sizeof(mtc_message_type));
|
||||
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
|
||||
{
|
||||
// daemon_dump_membuf_banner ();
|
||||
hbsInv.print_node_info ();
|
||||
if ( hbs_config.debug_state == 2 )
|
||||
{
|
||||
hbsInv.print_node_info ();
|
||||
}
|
||||
hbs_cluster_log ( hbsInv.my_hostname, "event", true );
|
||||
|
||||
// daemon_dump_membuf ();
|
||||
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
|
||||
}
|
||||
else if ( event_cmd == MTC_EVENT_LOOPBACK )
|
||||
@ -1504,9 +1504,6 @@ void daemon_service_run ( void )
|
||||
daemon_exit ();
|
||||
}
|
||||
|
||||
/* set this controller as provisioned */
|
||||
hbs_manage_controller_state ( hbsInv.my_hostname , true );
|
||||
|
||||
/* Run heartbeat service forever or until stop condition */
|
||||
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
|
||||
{
|
||||
@ -1518,7 +1515,6 @@ void daemon_service_run ( void )
|
||||
if ( sockets_init == true )
|
||||
{
|
||||
hbsInv.print_node_info();
|
||||
|
||||
hbs_state_audit ();
|
||||
}
|
||||
|
||||
@ -1725,7 +1721,7 @@ void daemon_service_run ( void )
|
||||
{
|
||||
hbsInv.hbs_disabled = true ;
|
||||
hbsInv.hbs_state_change = true ;
|
||||
hbs_cluster_lock();
|
||||
hbs_controller_lock ();
|
||||
ilog ("heartbeat service going disabled (locked)");
|
||||
|
||||
/* force the throttle 'still disabled' log to wait for
|
||||
@ -1904,17 +1900,12 @@ void daemon_service_run ( void )
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_STOP_HOST )
|
||||
{
|
||||
if ( hostname == hbsInv.my_hostname )
|
||||
{
|
||||
ilog ("%s heartbeat service disabled by stop command",
|
||||
hostname.c_str());
|
||||
|
||||
hbs_manage_controller_state( hostname, false );
|
||||
}
|
||||
else
|
||||
if ( hostname != hbsInv.my_hostname )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, false, true );
|
||||
hbs_cluster_del ( hostname );
|
||||
ilog ("%s heartbeat service disabled by stop command",
|
||||
hostname.c_str());
|
||||
}
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_START_HOST )
|
||||
@ -2309,7 +2300,10 @@ void daemon_service_run ( void )
|
||||
*/
|
||||
bool storage_0_responding = true ;
|
||||
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
||||
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
|
||||
{
|
||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
||||
}
|
||||
}
|
||||
hbsTimer.ring = false ;
|
||||
heartbeat_request = true ;
|
||||
|
@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason );
|
||||
* Automatically adjusts the numbers in the cluster vault. */
|
||||
void hbs_cluster_add ( string & hostname );
|
||||
void hbs_cluster_del ( string & hostname );
|
||||
void hbs_cluster_rem ( unsigned short controller );
|
||||
void hbs_cluster_lock ( void );
|
||||
|
||||
/* do actions when this controller is detected as locked */
|
||||
void hbs_controller_lock ( void );
|
||||
|
||||
/* Do stuff in preparation for another pulse period start */
|
||||
void hbs_cluster_period_start ( void );
|
||||
@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
|
||||
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
|
||||
mtce_hbs_cluster_history_type h2 );
|
||||
|
||||
/* Manage the enabled state of the controllers */
|
||||
void hbs_manage_controller_state ( string & hostname, bool enabled );
|
||||
|
||||
/* Set the number of monitored hosts and this controller's
|
||||
* number in the cluster vault. */
|
||||
void hbs_cluster_nums ( unsigned short this_controller,
|
||||
|
@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
|
||||
|
||||
static int rx_error_count[MAX_IFACES] = {0,0} ;
|
||||
static int tx_error_count[MAX_IFACES] = {0,0} ;
|
||||
static int missing_history_count[MAX_IFACES] = {0,0} ;
|
||||
|
||||
#define ERROR_LOG_THRESHOLD (200)
|
||||
|
||||
@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
}
|
||||
else
|
||||
{
|
||||
int debug_state = daemon_get_cfg_ptr()->debug_state ;
|
||||
|
||||
clog ("controller-%d %s cluster info added to response (%d)",
|
||||
controller?0:1,
|
||||
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
|
||||
get_iface_name_str(iface),
|
||||
missed_controller_summary_tracker[controller?0:1] );
|
||||
|
||||
/* Now copy the other controller's cached cluster info into
|
||||
* this controller's response */
|
||||
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
|
||||
hbs_sock.rx_mesg[iface].cluster );
|
||||
|
||||
if ( daemon_get_cfg_ptr()->debug_state & 4 )
|
||||
string dump_banner = "" ;
|
||||
if ( debug_state )
|
||||
{
|
||||
string dump_banner = "" ;
|
||||
dump_banner.append("controller-") ;
|
||||
dump_banner.append(itos(controller?0:1));
|
||||
dump_banner.append(" cluster info from cache injected into controller-");
|
||||
@ -980,10 +984,30 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
dump_banner.append(":");
|
||||
dump_banner.append(get_iface_name_str(iface));
|
||||
dump_banner.append(" pulse response");
|
||||
}
|
||||
|
||||
if ( debug_state & 4 )
|
||||
{
|
||||
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
|
||||
}
|
||||
else
|
||||
{
|
||||
clog ("%s", dump_banner.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (missing_history_count[iface])
|
||||
{
|
||||
ilog ("controller-%d %s providing cluster history",
|
||||
controller, get_iface_name_str(iface));
|
||||
missing_history_count[iface] = 0 ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( missing_history_count[iface], 5000,
|
||||
"controller-%d %s proividing no cluster history",
|
||||
controller, get_iface_name_str(iface));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -30,17 +30,17 @@ typedef struct
|
||||
/* Contains the controller number (0 or 1) for this controller. */
|
||||
unsigned short this_controller ;
|
||||
|
||||
/* Preserves which controllers are enabled. */
|
||||
bool controller_0_enabled ;
|
||||
bool controller_1_enabled ;
|
||||
#ifdef THREE_CONTROLLER_SYSTEM
|
||||
bool controller_2_enabled ;
|
||||
#endif
|
||||
|
||||
/* Used to manage the cluster based on this and peer controller state */
|
||||
bool peer_controller_enabled ;
|
||||
|
||||
/* Used to prevent log flooding in presence of back to back errors. */
|
||||
unsigned int log_throttle ;
|
||||
|
||||
/* Used to log when
|
||||
* - peer history goes missing (false -> true change)
|
||||
* - peer history starts being received ( true -> false change ) */
|
||||
bool peer_history_missing ;
|
||||
|
||||
/* Used to threshold storage-0 not responding state */
|
||||
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
|
||||
|
||||
@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
||||
void hbs_cluster_ctrl_init ( void )
|
||||
{
|
||||
ctrl.this_controller = 0xffff ;
|
||||
ctrl.controller_0_enabled = false ;
|
||||
ctrl.controller_1_enabled = false ;
|
||||
#ifdef THREE_CONTROLLER_SYSTEM
|
||||
ctrl.controller_2_enabled = false ;
|
||||
#endif
|
||||
ctrl.peer_controller_enabled = false ;
|
||||
ctrl.peer_history_missing = true ;
|
||||
ctrl.log_throttle = 0 ;
|
||||
ctrl.monitored_networks = 0 ;
|
||||
ctrl.monitored_hosts = 0 ;
|
||||
@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled )
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_manage_controller_state
|
||||
*
|
||||
* Description : Track the monitored enabled state of the controllers.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_manage_controller_state ( string & hostname, bool enabled )
|
||||
{
|
||||
int controller = -1 ;
|
||||
|
||||
/* track controller state */
|
||||
if ( hostname == CONTROLLER_0 )
|
||||
{
|
||||
controller = 0 ;
|
||||
ctrl.controller_0_enabled = enabled ;
|
||||
}
|
||||
else if ( hostname == CONTROLLER_1 )
|
||||
{
|
||||
controller = 1 ;
|
||||
ctrl.controller_1_enabled = enabled ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ignore all other host names */
|
||||
return ;
|
||||
}
|
||||
|
||||
/* manage the state of the peer controller */
|
||||
if ( ctrl.this_controller != controller )
|
||||
{
|
||||
/* Clear peer controller cluster history when the peer
|
||||
* controller goes disabled */
|
||||
if (( ctrl.peer_controller_enabled == true ) &&
|
||||
( enabled == false ))
|
||||
{
|
||||
hbs_cluster_rem ( controller );
|
||||
}
|
||||
if ( enabled == false )
|
||||
{
|
||||
hbs_cluster_change ( "peer controller disabled" ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
hbs_cluster_change ( "peer controller enabled" ) ;
|
||||
}
|
||||
ctrl.peer_controller_enabled = enabled ;
|
||||
}
|
||||
else if ( enabled == false )
|
||||
{
|
||||
hbs_cluster_change ( "this controller locked" ) ;
|
||||
hbs_cluster_lock();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_add
|
||||
@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname )
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||
}
|
||||
|
||||
/* Manage controller state ; true means enabled in this case. */
|
||||
hbs_manage_controller_state ( hostname, true );
|
||||
/* Catch enable/provisioning of the peer controller */
|
||||
if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 ))
|
||||
ctrl.peer_controller_enabled = true ;
|
||||
if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 ))
|
||||
ctrl.peer_controller_enabled = true ;
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname )
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||
}
|
||||
|
||||
/* Manage controller state ; false means not enabled in this case. */
|
||||
hbs_manage_controller_state ( hostname , false );
|
||||
|
||||
ilog ("%s deleted from cluster", hostname.c_str());
|
||||
|
||||
cluster_list ();
|
||||
@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void )
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
void hbs_cluster_update ( iface_enum iface,
|
||||
unsigned short not_responding_hosts,
|
||||
bool storage_0_responding )
|
||||
@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg )
|
||||
}
|
||||
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
|
||||
|
||||
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
||||
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
||||
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
|
||||
}
|
||||
|
||||
@ -649,12 +586,22 @@ void hbs_cluster_peer ( void )
|
||||
if (( ctrl.got_peer_controller_history == false ) &&
|
||||
( ctrl.peer_controller_enabled == true ))
|
||||
{
|
||||
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */
|
||||
|
||||
if ( ctrl.peer_history_missing == false )
|
||||
{
|
||||
wlog ( "missing peer controller cluster view" );
|
||||
ctrl.peer_history_missing = true ;
|
||||
}
|
||||
/* if no nodes have reported peer controller history then inject
|
||||
* a 0:0 value in for this pulse period for that controller. */
|
||||
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
|
||||
}
|
||||
else if (( ctrl.got_peer_controller_history == true ) &&
|
||||
( ctrl.peer_controller_enabled == true ) &&
|
||||
( ctrl.peer_history_missing == true ))
|
||||
{
|
||||
wlog ( "receiving peer controller cluster view" );
|
||||
ctrl.peer_history_missing = false ;
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
|
||||
void hbs_history_save ( string hostname,
|
||||
mtce_hbs_network_enum network,
|
||||
mtce_hbs_cluster_history_type & sample )
|
||||
{
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
{
|
||||
@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
|
||||
memcpy( &ctrl.cluster.history[h], &sample,
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
|
||||
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
|
||||
clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)",
|
||||
ctrl.this_controller,
|
||||
sample.controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
||||
hostname.c_str(),
|
||||
ctrl.cluster.histories);
|
||||
hbs_cluster_network_name(network).c_str(),
|
||||
ctrl.cluster.histories,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str());
|
||||
return ;
|
||||
}
|
||||
}
|
||||
@ -756,16 +705,17 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
|
||||
ctrl.cluster.histories++ ;
|
||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
||||
|
||||
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
|
||||
ilog ("controller-%d added new %s:%s history to vault ; now have %d network views",
|
||||
ctrl.this_controller,
|
||||
sample.controller,
|
||||
hostname.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
||||
ctrl.cluster.histories);
|
||||
}
|
||||
|
||||
void hbs_state_audit ( void )
|
||||
{
|
||||
hbs_cluster_dump ( ctrl.cluster, "Audit" );
|
||||
if ( ctrl.monitored_hosts )
|
||||
hbs_cluster_dump ( ctrl.cluster, "Audit" );
|
||||
}
|
||||
|
||||
|
||||
@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname,
|
||||
{
|
||||
/* set that we got some history and save it */
|
||||
ctrl.got_peer_controller_history = true ;
|
||||
hbs_history_save ( hostname, msg.cluster.history[h] );
|
||||
hbs_history_save ( hostname, network, msg.cluster.history[h] );
|
||||
}
|
||||
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
|
||||
}
|
||||
@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_controller_lock
|
||||
*
|
||||
* Description : Clear all history for this controller.
|
||||
* Called when this controller is detected as locked.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_rem ( unsigned short controller )
|
||||
void hbs_controller_lock ( void )
|
||||
{
|
||||
int removed = 0 ;
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
if ( ctrl.cluster.histories )
|
||||
{
|
||||
if ( ctrl.cluster.history[h].controller == controller )
|
||||
ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller );
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
{
|
||||
removed++ ;
|
||||
wlog ("controller-%d %s network history removed from cluster (slot %d)",
|
||||
controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
|
||||
h );
|
||||
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
|
||||
}
|
||||
ctrl.cluster.histories = 0 ;
|
||||
hbs_cluster_change ( "this controller locked" ) ;
|
||||
}
|
||||
|
||||
if ( removed )
|
||||
{
|
||||
hbs_cluster_change ( "removed controller history" ) ;
|
||||
}
|
||||
|
||||
ctrl.cluster.histories -= removed ;
|
||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
||||
}
|
||||
|
||||
/* remove all cluster history on a lock operation */
|
||||
void hbs_cluster_lock( void )
|
||||
{
|
||||
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
{
|
||||
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
|
||||
}
|
||||
ctrl.cluster.histories = 0 ;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user