Merge "Mtce heartbeat cluster state change notification improvement"
This commit is contained in:
commit
7a3adb2cdc
@ -1,22 +1,13 @@
|
||||
[Unit]
|
||||
Description=StarlingX Maintenance Heartbeat Agent
|
||||
After=network.target syslog.service config.service
|
||||
After=hbsClient.service
|
||||
Before=pmon.service
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
||||
ExecStop=/etc/rc.d/init.d/hbsAgent start
|
||||
ExecStop=/etc/rc.d/init.d/hbsAgent stop
|
||||
PIDFile=/var/run/hbsAgent.pid
|
||||
KillMode=process
|
||||
SendSIGKILL=no
|
||||
|
||||
# Process recovery is handled by pmond if it's running.
|
||||
# Delay 10 seconds to give pmond a chance to recover
|
||||
# before systemd kicks in to do it as a backup plan.
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
|
@ -8628,7 +8628,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
||||
|
||||
|
||||
|
||||
#define HBS_LOSS_REPORT_THROTTLE (100)
|
||||
#define HBS_LOSS_REPORT_THROTTLE (100000)
|
||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
{
|
||||
int lost = 0 ;
|
||||
@ -8668,6 +8668,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
|
||||
{
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " +
|
||||
get_iface_name_str(iface) +
|
||||
" heartbeat miss " +
|
||||
itos(pulse_ptr->b2b_misses_count[iface]));
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
@ -8774,57 +8781,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
}
|
||||
}
|
||||
|
||||
/* Turn the cluster-host heartbeat loss into a degrade only
|
||||
* condition if the clstr_degrade_only flag is set */
|
||||
if (( iface == CLSTR_IFACE ) &&
|
||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
||||
( clstr_degrade_only == true ))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
|
||||
/* Turn the clstr heartbeat loss into a degrade only
|
||||
* condition for inactive controller on normal system. */
|
||||
else if (( iface == CLSTR_IFACE ) &&
|
||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
||||
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
||||
if (( iface == CLSTR_IFACE ) &&
|
||||
((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
|
||||
( clstr_degrade_only == true )))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
get_iface_name_str(iface),
|
||||
clstr_degrade_only ? "config option" : "system type");
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
|
||||
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
// else if ( pulse_ptr->hbs_failure[iface] == false )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
pulse_ptr->b2b_misses_count[iface]);
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
if ( pulse_ptr->hbs_failure[iface] == false )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
{
|
||||
@ -8832,10 +8825,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
}
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
|
||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
|
@ -2374,6 +2374,7 @@ void daemon_service_run ( void )
|
||||
arrival_histogram[iface] = "" ;
|
||||
unexpected_pulse_list[iface] = "" ;
|
||||
|
||||
|
||||
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
|
||||
if ( rc != 0 )
|
||||
{
|
||||
|
@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
|
||||
void hbs_sm_handler ( void );
|
||||
|
||||
/* send the cluster vault to SM */
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
||||
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
||||
|
||||
/* copy cluster data from src to dst */
|
||||
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
||||
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
||||
/* Heartbeat service state audit */
|
||||
void hbs_state_audit ( void );
|
||||
|
||||
/* Send state change message to SM if there has been a
|
||||
* state change in the last period */
|
||||
void hbs_cluster_change_notifier ( void );
|
||||
|
||||
/**
|
||||
* @} hbs_base
|
||||
*/
|
||||
|
@ -69,6 +69,8 @@ typedef struct
|
||||
|
||||
msgClassSock * sm_socket_ptr ;
|
||||
|
||||
string cluster_change_reason ;
|
||||
|
||||
} hbs_cluster_ctrl_type ;
|
||||
|
||||
/* Cluster control structure construct allocation. */
|
||||
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
||||
{
|
||||
ctrl.sm_socket_ptr = sm_socket_ptr ;
|
||||
}
|
||||
ctrl.cluster_change_reason = "";
|
||||
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
|
||||
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
|
||||
|
||||
void hbs_cluster_change ( string cluster_change_reason )
|
||||
{
|
||||
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
|
||||
ilog ("reason: %s", cluster_change_reason.c_str());
|
||||
if ( ctrl.cluster_change_reason.empty() )
|
||||
ctrl.cluster_change_reason = cluster_change_reason ;
|
||||
else
|
||||
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_change_notifier
|
||||
*
|
||||
* Description : Send SM the cluster info if there has been a state change.
|
||||
*
|
||||
***************************************************************************/
|
||||
void hbs_cluster_change_notifier ( void )
|
||||
{
|
||||
/* Nothing to do when no change reason text is pending. */
if ( ! ctrl.cluster_change_reason.empty () )
|
||||
{
|
||||
/* Send the pending reason with reqid 0. The reason is cleared only
 * when the send returns PASS ; on any other result it is left in
 * place, so it is still pending the next time this is called. */
if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
|
||||
ctrl.cluster_change_reason ) == PASS )
|
||||
{
|
||||
ctrl.cluster_change_reason.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
|
||||
"Unable to store history beyond %d ",
|
||||
ctrl.cluster.histories );
|
||||
hbs_cluster_change_notifier ();
|
||||
return ;
|
||||
}
|
||||
else
|
||||
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
|
||||
else
|
||||
history_ptr->oldest_entry_index++ ;
|
||||
|
||||
hbs_cluster_change_notifier ();
|
||||
|
||||
/* clear the log throttle if we are updating history ok. */
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
||||
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
||||
{
|
||||
int rc = FAIL_SOCKET_SENDTO ;
|
||||
ctrl.cluster.reqid = (unsigned short)reqid ;
|
||||
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
|
||||
{
|
||||
ilog ("cluster state notification Reason: %s", reason.c_str());
|
||||
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
|
||||
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
|
||||
if ( bytes <= 0 )
|
||||
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
|
||||
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
|
||||
bytes , errno, strerror(errno));
|
||||
}
|
||||
hbs_cluster_dump ( ctrl.cluster );
|
||||
else
|
||||
{
|
||||
/* limit the string length */
|
||||
ilog ("reason: %s", reason.substr(0,80).c_str());
|
||||
hbs_cluster_dump ( ctrl.cluster );
|
||||
rc = PASS ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("cannot send cluster info due to socket error");
|
||||
}
|
||||
return(rc);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
|
||||
{
|
||||
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
|
||||
{
|
||||
hbs_cluster_change ("peer controller cluster event " +
|
||||
hbs_cluster_change ("peer cluster delta " +
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user