Merge "Mtce: Improve robustness of heartbeat Loss reporting"
This commit is contained in:
commit
7512c6b105
@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HBS_LOSS_REPORT_THROTTLE (100)
|
||||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||||
{
|
{
|
||||||
int lost = 0 ;
|
int lost = 0 ;
|
||||||
@ -8408,16 +8408,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||||||
( infra_degrade_only == true ))
|
( infra_degrade_only == true ))
|
||||||
{
|
{
|
||||||
/* Only print the log at the threshold boundary */
|
/* Only print the log at the threshold boundary */
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||||
{
|
{
|
||||||
if ( this->active_controller )
|
if ( this->active_controller )
|
||||||
{
|
{
|
||||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||||
}
|
}
|
||||||
|
|
||||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
pulse_ptr->hostname.c_str(),
|
||||||
get_iface_name_str(iface) );
|
get_iface_name_str(iface) );
|
||||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
||||||
{
|
{
|
||||||
/* Only print the log at the threshold boundary */
|
/* Only print the log at the threshold boundary */
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||||
{
|
{
|
||||||
if ( this->active_controller )
|
if ( this->active_controller )
|
||||||
{
|
{
|
||||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||||
}
|
}
|
||||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
pulse_ptr->hostname.c_str(),
|
||||||
get_iface_name_str(iface) );
|
get_iface_name_str(iface));
|
||||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
|
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||||
( pulse_ptr->hbs_failure[iface] == false ))
|
|
||||||
{
|
{
|
||||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||||
get_iface_name_str(iface) );
|
get_iface_name_str(iface) );
|
||||||
|
@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
|
|||||||
memset (&event, 0 , sizeof(mtc_message_type));
|
memset (&event, 0 , sizeof(mtc_message_type));
|
||||||
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
|
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
|
||||||
{
|
{
|
||||||
// daemon_dump_membuf_banner ();
|
if ( hbs_config.debug_state == 2 )
|
||||||
hbsInv.print_node_info ();
|
{
|
||||||
|
hbsInv.print_node_info ();
|
||||||
|
}
|
||||||
hbs_cluster_log ( hbsInv.my_hostname, "event", true );
|
hbs_cluster_log ( hbsInv.my_hostname, "event", true );
|
||||||
|
|
||||||
// daemon_dump_membuf ();
|
|
||||||
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
|
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
|
||||||
}
|
}
|
||||||
else if ( event_cmd == MTC_EVENT_LOOPBACK )
|
else if ( event_cmd == MTC_EVENT_LOOPBACK )
|
||||||
@ -1504,9 +1504,6 @@ void daemon_service_run ( void )
|
|||||||
daemon_exit ();
|
daemon_exit ();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set this controller as provisioned */
|
|
||||||
hbs_manage_controller_state ( hbsInv.my_hostname , true );
|
|
||||||
|
|
||||||
/* Run heartbeat service forever or until stop condition */
|
/* Run heartbeat service forever or until stop condition */
|
||||||
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
|
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
|
||||||
{
|
{
|
||||||
@ -1518,7 +1515,6 @@ void daemon_service_run ( void )
|
|||||||
if ( sockets_init == true )
|
if ( sockets_init == true )
|
||||||
{
|
{
|
||||||
hbsInv.print_node_info();
|
hbsInv.print_node_info();
|
||||||
|
|
||||||
hbs_state_audit ();
|
hbs_state_audit ();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1725,7 +1721,7 @@ void daemon_service_run ( void )
|
|||||||
{
|
{
|
||||||
hbsInv.hbs_disabled = true ;
|
hbsInv.hbs_disabled = true ;
|
||||||
hbsInv.hbs_state_change = true ;
|
hbsInv.hbs_state_change = true ;
|
||||||
hbs_cluster_lock();
|
hbs_controller_lock ();
|
||||||
ilog ("heartbeat service going disabled (locked)");
|
ilog ("heartbeat service going disabled (locked)");
|
||||||
|
|
||||||
/* force the throttle 'still disabled' log to wait for
|
/* force the throttle 'still disabled' log to wait for
|
||||||
@ -1904,17 +1900,12 @@ void daemon_service_run ( void )
|
|||||||
}
|
}
|
||||||
else if ( msg.cmd == MTC_CMD_STOP_HOST )
|
else if ( msg.cmd == MTC_CMD_STOP_HOST )
|
||||||
{
|
{
|
||||||
if ( hostname == hbsInv.my_hostname )
|
if ( hostname != hbsInv.my_hostname )
|
||||||
{
|
|
||||||
ilog ("%s heartbeat service disabled by stop command",
|
|
||||||
hostname.c_str());
|
|
||||||
|
|
||||||
hbs_manage_controller_state( hostname, false );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
hbsInv.mon_host ( hostname, false, true );
|
hbsInv.mon_host ( hostname, false, true );
|
||||||
hbs_cluster_del ( hostname );
|
hbs_cluster_del ( hostname );
|
||||||
|
ilog ("%s heartbeat service disabled by stop command",
|
||||||
|
hostname.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ( msg.cmd == MTC_CMD_START_HOST )
|
else if ( msg.cmd == MTC_CMD_START_HOST )
|
||||||
@ -2309,7 +2300,10 @@ void daemon_service_run ( void )
|
|||||||
*/
|
*/
|
||||||
bool storage_0_responding = true ;
|
bool storage_0_responding = true ;
|
||||||
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
||||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
|
||||||
|
{
|
||||||
|
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
hbsTimer.ring = false ;
|
hbsTimer.ring = false ;
|
||||||
heartbeat_request = true ;
|
heartbeat_request = true ;
|
||||||
|
@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason );
|
|||||||
* Automatically adjusts the numbers in the cluster vault. */
|
* Automatically adjusts the numbers in the cluster vault. */
|
||||||
void hbs_cluster_add ( string & hostname );
|
void hbs_cluster_add ( string & hostname );
|
||||||
void hbs_cluster_del ( string & hostname );
|
void hbs_cluster_del ( string & hostname );
|
||||||
void hbs_cluster_rem ( unsigned short controller );
|
|
||||||
void hbs_cluster_lock ( void );
|
/* do actions when this controller is detected as locked */
|
||||||
|
void hbs_controller_lock ( void );
|
||||||
|
|
||||||
/* Do stuff in preparation for another pulse period start */
|
/* Do stuff in preparation for another pulse period start */
|
||||||
void hbs_cluster_period_start ( void );
|
void hbs_cluster_period_start ( void );
|
||||||
@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
|
|||||||
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
|
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
|
||||||
mtce_hbs_cluster_history_type h2 );
|
mtce_hbs_cluster_history_type h2 );
|
||||||
|
|
||||||
/* Manage the enabled state of the controllers */
|
|
||||||
void hbs_manage_controller_state ( string & hostname, bool enabled );
|
|
||||||
|
|
||||||
/* Set the number of monitored hosts and this controller's
|
/* Set the number of monitored hosts and this controller's
|
||||||
* number in the cluster vault. */
|
* number in the cluster vault. */
|
||||||
void hbs_cluster_nums ( unsigned short this_controller,
|
void hbs_cluster_nums ( unsigned short this_controller,
|
||||||
|
@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
|
|||||||
|
|
||||||
static int rx_error_count[MAX_IFACES] = {0,0} ;
|
static int rx_error_count[MAX_IFACES] = {0,0} ;
|
||||||
static int tx_error_count[MAX_IFACES] = {0,0} ;
|
static int tx_error_count[MAX_IFACES] = {0,0} ;
|
||||||
|
static int missing_history_count[MAX_IFACES] = {0,0} ;
|
||||||
|
|
||||||
#define ERROR_LOG_THRESHOLD (200)
|
#define ERROR_LOG_THRESHOLD (200)
|
||||||
|
|
||||||
@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
int debug_state = daemon_get_cfg_ptr()->debug_state ;
|
||||||
|
|
||||||
clog ("controller-%d %s cluster info added to response (%d)",
|
clog ("controller-%d %s cluster info added to response (%d)",
|
||||||
controller?0:1,
|
controller?0:1,
|
||||||
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
|
get_iface_name_str(iface),
|
||||||
|
missed_controller_summary_tracker[controller?0:1] );
|
||||||
|
|
||||||
/* Now copy the other controller's cached cluster info into
|
/* Now copy the other controller's cached cluster info into
|
||||||
* this controller's response */
|
* this controller's response */
|
||||||
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
|
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
|
||||||
hbs_sock.rx_mesg[iface].cluster );
|
hbs_sock.rx_mesg[iface].cluster );
|
||||||
|
|
||||||
if ( daemon_get_cfg_ptr()->debug_state & 4 )
|
string dump_banner = "" ;
|
||||||
|
if ( debug_state )
|
||||||
{
|
{
|
||||||
string dump_banner = "" ;
|
|
||||||
dump_banner.append("controller-") ;
|
dump_banner.append("controller-") ;
|
||||||
dump_banner.append(itos(controller?0:1));
|
dump_banner.append(itos(controller?0:1));
|
||||||
dump_banner.append(" cluster info from cache injected into controller-");
|
dump_banner.append(" cluster info from cache injected into controller-");
|
||||||
@ -980,10 +984,30 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||||||
dump_banner.append(":");
|
dump_banner.append(":");
|
||||||
dump_banner.append(get_iface_name_str(iface));
|
dump_banner.append(get_iface_name_str(iface));
|
||||||
dump_banner.append(" pulse response");
|
dump_banner.append(" pulse response");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( debug_state & 4 )
|
||||||
|
{
|
||||||
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
|
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
clog ("%s", dump_banner.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (missing_history_count[iface])
|
||||||
|
{
|
||||||
|
ilog ("controller-%d %s providing cluster history",
|
||||||
|
controller, get_iface_name_str(iface));
|
||||||
|
missing_history_count[iface] = 0 ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
wlog_throttled ( missing_history_count[iface], 5000,
|
||||||
|
"controller-%d %s proividing no cluster history",
|
||||||
|
controller, get_iface_name_str(iface));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,17 +30,17 @@ typedef struct
|
|||||||
/* Contains the controller number (0 or 1) for this controller. */
|
/* Contains the controller number (0 or 1) for this controller. */
|
||||||
unsigned short this_controller ;
|
unsigned short this_controller ;
|
||||||
|
|
||||||
/* Preserves which controllers are enabled. */
|
/* Used to manage the cluster based on this and peer controller state */
|
||||||
bool controller_0_enabled ;
|
|
||||||
bool controller_1_enabled ;
|
|
||||||
#ifdef THREE_CONTROLLER_SYSTEM
|
|
||||||
bool controller_2_enabled ;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool peer_controller_enabled ;
|
bool peer_controller_enabled ;
|
||||||
|
|
||||||
/* Used to prevent log flooding in presence of back to back errors. */
|
/* Used to prevent log flooding in presence of back to back errors. */
|
||||||
unsigned int log_throttle ;
|
unsigned int log_throttle ;
|
||||||
|
|
||||||
|
/* Used to log when
|
||||||
|
* - peer history goes missing (false -> true change)
|
||||||
|
* - peer history starts being received ( true -> false change ) */
|
||||||
|
bool peer_history_missing ;
|
||||||
|
|
||||||
/* Used to threshold storage-0 not responding state */
|
/* Used to threshold storage-0 not responding state */
|
||||||
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
|
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
|
||||||
|
|
||||||
@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
|||||||
void hbs_cluster_ctrl_init ( void )
|
void hbs_cluster_ctrl_init ( void )
|
||||||
{
|
{
|
||||||
ctrl.this_controller = 0xffff ;
|
ctrl.this_controller = 0xffff ;
|
||||||
ctrl.controller_0_enabled = false ;
|
|
||||||
ctrl.controller_1_enabled = false ;
|
|
||||||
#ifdef THREE_CONTROLLER_SYSTEM
|
|
||||||
ctrl.controller_2_enabled = false ;
|
|
||||||
#endif
|
|
||||||
ctrl.peer_controller_enabled = false ;
|
ctrl.peer_controller_enabled = false ;
|
||||||
|
ctrl.peer_history_missing = true ;
|
||||||
ctrl.log_throttle = 0 ;
|
ctrl.log_throttle = 0 ;
|
||||||
ctrl.monitored_networks = 0 ;
|
ctrl.monitored_networks = 0 ;
|
||||||
ctrl.monitored_hosts = 0 ;
|
ctrl.monitored_hosts = 0 ;
|
||||||
@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/****************************************************************************
|
|
||||||
*
|
|
||||||
* Name : hbs_manage_controller_state
|
|
||||||
*
|
|
||||||
* Description : Track the monitored enabled state of the controllers.
|
|
||||||
*
|
|
||||||
***************************************************************************/
|
|
||||||
|
|
||||||
void hbs_manage_controller_state ( string & hostname, bool enabled )
|
|
||||||
{
|
|
||||||
int controller = -1 ;
|
|
||||||
|
|
||||||
/* track controller state */
|
|
||||||
if ( hostname == CONTROLLER_0 )
|
|
||||||
{
|
|
||||||
controller = 0 ;
|
|
||||||
ctrl.controller_0_enabled = enabled ;
|
|
||||||
}
|
|
||||||
else if ( hostname == CONTROLLER_1 )
|
|
||||||
{
|
|
||||||
controller = 1 ;
|
|
||||||
ctrl.controller_1_enabled = enabled ;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* ignore all other host names */
|
|
||||||
return ;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* manage the state of the peer controller */
|
|
||||||
if ( ctrl.this_controller != controller )
|
|
||||||
{
|
|
||||||
/* Clear peer controller cluster history when the peer
|
|
||||||
* controller goes disabled */
|
|
||||||
if (( ctrl.peer_controller_enabled == true ) &&
|
|
||||||
( enabled == false ))
|
|
||||||
{
|
|
||||||
hbs_cluster_rem ( controller );
|
|
||||||
}
|
|
||||||
if ( enabled == false )
|
|
||||||
{
|
|
||||||
hbs_cluster_change ( "peer controller disabled" ) ;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
hbs_cluster_change ( "peer controller enabled" ) ;
|
|
||||||
}
|
|
||||||
ctrl.peer_controller_enabled = enabled ;
|
|
||||||
}
|
|
||||||
else if ( enabled == false )
|
|
||||||
{
|
|
||||||
hbs_cluster_change ( "this controller locked" ) ;
|
|
||||||
hbs_cluster_lock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
*
|
*
|
||||||
* Name : hbs_cluster_add
|
* Name : hbs_cluster_add
|
||||||
@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname )
|
|||||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Manage controller state ; true means enabled in this case. */
|
/* Catch enable/provisioning of the peer controller */
|
||||||
hbs_manage_controller_state ( hostname, true );
|
if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 ))
|
||||||
|
ctrl.peer_controller_enabled = true ;
|
||||||
|
if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 ))
|
||||||
|
ctrl.peer_controller_enabled = true ;
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname )
|
|||||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Manage controller state ; false means not enabled in this case. */
|
|
||||||
hbs_manage_controller_state ( hostname , false );
|
|
||||||
|
|
||||||
ilog ("%s deleted from cluster", hostname.c_str());
|
ilog ("%s deleted from cluster", hostname.c_str());
|
||||||
|
|
||||||
cluster_list ();
|
cluster_list ();
|
||||||
@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void )
|
|||||||
*
|
*
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
void hbs_cluster_update ( iface_enum iface,
|
void hbs_cluster_update ( iface_enum iface,
|
||||||
unsigned short not_responding_hosts,
|
unsigned short not_responding_hosts,
|
||||||
bool storage_0_responding )
|
bool storage_0_responding )
|
||||||
@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg )
|
|||||||
}
|
}
|
||||||
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
|
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
|
||||||
|
|
||||||
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
||||||
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
|
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -649,12 +586,22 @@ void hbs_cluster_peer ( void )
|
|||||||
if (( ctrl.got_peer_controller_history == false ) &&
|
if (( ctrl.got_peer_controller_history == false ) &&
|
||||||
( ctrl.peer_controller_enabled == true ))
|
( ctrl.peer_controller_enabled == true ))
|
||||||
{
|
{
|
||||||
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */
|
if ( ctrl.peer_history_missing == false )
|
||||||
|
{
|
||||||
|
wlog ( "missing peer controller cluster view" );
|
||||||
|
ctrl.peer_history_missing = true ;
|
||||||
|
}
|
||||||
/* if no nodes have reported peer controller history then inject
|
/* if no nodes have reported peer controller history then inject
|
||||||
* a 0:0 value in for this pulse period for that controller. */
|
* a 0:0 value in for this pulse period for that controller. */
|
||||||
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
|
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
|
||||||
}
|
}
|
||||||
|
else if (( ctrl.got_peer_controller_history == true ) &&
|
||||||
|
( ctrl.peer_controller_enabled == true ) &&
|
||||||
|
( ctrl.peer_history_missing == true ))
|
||||||
|
{
|
||||||
|
wlog ( "receiving peer controller cluster view" );
|
||||||
|
ctrl.peer_history_missing = false ;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
|
|||||||
*
|
*
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
|
void hbs_history_save ( string hostname,
|
||||||
|
mtce_hbs_network_enum network,
|
||||||
|
mtce_hbs_cluster_history_type & sample )
|
||||||
{
|
{
|
||||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||||
{
|
{
|
||||||
@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
|
|||||||
memcpy( &ctrl.cluster.history[h], &sample,
|
memcpy( &ctrl.cluster.history[h], &sample,
|
||||||
sizeof(mtce_hbs_cluster_history_type));
|
sizeof(mtce_hbs_cluster_history_type));
|
||||||
|
|
||||||
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
|
clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)",
|
||||||
ctrl.this_controller,
|
ctrl.this_controller,
|
||||||
sample.controller,
|
sample.controller,
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
hbs_cluster_network_name(network).c_str(),
|
||||||
hostname.c_str(),
|
ctrl.cluster.histories,
|
||||||
ctrl.cluster.histories);
|
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str());
|
||||||
return ;
|
return ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -756,16 +705,17 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
|
|||||||
ctrl.cluster.histories++ ;
|
ctrl.cluster.histories++ ;
|
||||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
||||||
|
|
||||||
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
|
ilog ("controller-%d added new %s:%s history to vault ; now have %d network views",
|
||||||
ctrl.this_controller,
|
ctrl.this_controller,
|
||||||
sample.controller,
|
hostname.c_str(),
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
||||||
ctrl.cluster.histories);
|
ctrl.cluster.histories);
|
||||||
}
|
}
|
||||||
|
|
||||||
void hbs_state_audit ( void )
|
void hbs_state_audit ( void )
|
||||||
{
|
{
|
||||||
hbs_cluster_dump ( ctrl.cluster, "Audit" );
|
if ( ctrl.monitored_hosts )
|
||||||
|
hbs_cluster_dump ( ctrl.cluster, "Audit" );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname,
|
|||||||
{
|
{
|
||||||
/* set that we got some history and save it */
|
/* set that we got some history and save it */
|
||||||
ctrl.got_peer_controller_history = true ;
|
ctrl.got_peer_controller_history = true ;
|
||||||
hbs_history_save ( hostname, msg.cluster.history[h] );
|
hbs_history_save ( hostname, network, msg.cluster.history[h] );
|
||||||
}
|
}
|
||||||
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
|
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
|
||||||
}
|
}
|
||||||
@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
*
|
||||||
|
* Name : hbs_controller_lock
|
||||||
|
*
|
||||||
|
* Description : Clear all history for this controller.
|
||||||
|
* Called when this controller is detected as locked.
|
||||||
|
*
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
void hbs_cluster_rem ( unsigned short controller )
|
void hbs_controller_lock ( void )
|
||||||
{
|
{
|
||||||
int removed = 0 ;
|
if ( ctrl.cluster.histories )
|
||||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
|
||||||
{
|
{
|
||||||
if ( ctrl.cluster.history[h].controller == controller )
|
ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller );
|
||||||
|
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||||
{
|
{
|
||||||
removed++ ;
|
|
||||||
wlog ("controller-%d %s network history removed from cluster (slot %d)",
|
|
||||||
controller,
|
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
|
|
||||||
h );
|
|
||||||
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
|
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
|
||||||
}
|
}
|
||||||
|
ctrl.cluster.histories = 0 ;
|
||||||
|
hbs_cluster_change ( "this controller locked" ) ;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( removed )
|
|
||||||
{
|
|
||||||
hbs_cluster_change ( "removed controller history" ) ;
|
|
||||||
}
|
|
||||||
|
|
||||||
ctrl.cluster.histories -= removed ;
|
|
||||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* remove all cluster history on a lock operation */
|
|
||||||
void hbs_cluster_lock( void )
|
|
||||||
{
|
|
||||||
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
|
|
||||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
|
||||||
{
|
|
||||||
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
|
|
||||||
}
|
|
||||||
ctrl.cluster.histories = 0 ;
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user