diff --git a/mtce-control/src/scripts/hbsAgent.service b/mtce-control/src/scripts/hbsAgent.service index 7e111707..bd4bcd63 100644 --- a/mtce-control/src/scripts/hbsAgent.service +++ b/mtce-control/src/scripts/hbsAgent.service @@ -1,22 +1,13 @@ [Unit] Description=StarlingX Maintenance Heartbeat Agent -After=network.target syslog.service config.service +After=hbsClient.service Before=pmon.service [Service] Type=forking ExecStart=/etc/rc.d/init.d/hbsAgent start -ExecStop=/etc/rc.d/init.d/hbsAgent start +ExecStop=/etc/rc.d/init.d/hbsAgent stop PIDFile=/var/run/hbsAgent.pid -KillMode=process -SendSIGKILL=no - -# Process recovery is handled by pmond if its running. -# Delay 10 seconds to give pmond a chance to recover -# before systemd kicks in to do it as a backup plan. -Restart=always -RestartSec=10 [Install] WantedBy=multi-user.target - diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index ae43fe64..3a6cef81 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -8511,7 +8511,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p -#define HBS_LOSS_REPORT_THROTTLE (100) +#define HBS_LOSS_REPORT_THROTTLE (100000) int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { int lost = 0 ; @@ -8551,6 +8551,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { + if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold ) + { + hbs_cluster_change ( pulse_ptr->hostname + " " + + get_iface_name_str(iface) + + " heartbeat miss " + + itos(pulse_ptr->b2b_misses_count[iface])); + } if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) @@ -8657,57 +8664,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } } - /* Turn the cluster-host heartbeat loss into a degrade only - * condition if the clstr_degrade_only flag is set */ - if (( iface == CLSTR_IFACE ) && - ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) && - ( clstr_degrade_only == true )) - { - /* Only print the log at the threshold boundary */ - if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) - { - if ( this->active_controller ) - { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - } - - wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", - pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); - } - } - /* Turn the clstr heartbeat loss into a degrade only * condition for inactive controller on normal system. */ - else if (( iface == CLSTR_IFACE ) && - ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) && - ( this->system_type == SYSTEM_TYPE__NORMAL ) && - (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) + if (( iface == CLSTR_IFACE ) && + ((( this->system_type == SYSTEM_TYPE__NORMAL ) && + (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) || + ( clstr_degrade_only == true ))) { /* Only print the log at the threshold boundary */ - if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) + if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold ) { if ( this->active_controller ) { manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); } - wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", + wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface)); - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); + get_iface_name_str(iface), + clstr_degrade_only ? "config option" : "system type"); + hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" ); } } else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) + // else if ( pulse_ptr->hbs_failure[iface] == false ) { - elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); + elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n", + pulse_ptr->hostname.c_str(), + get_iface_name_str(iface), + pulse_ptr->b2b_misses_count[iface]); + hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" ); if ( this->active_controller ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - + if ( pulse_ptr->hbs_failure[iface] == false ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } /* report this host as failed */ if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) { @@ -8715,10 +8708,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } } else - { pulse_ptr->hbs_failure[iface] = true ; - } - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); + pulse_ptr->hbs_failure_count[iface]++ ; } if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index c8cb0305..bfd13f20 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -2368,6 +2368,7 @@ void daemon_service_run ( void ) arrival_histogram[iface] = "" ; unexpected_pulse_list[iface] = "" ; + rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri ); if ( rc != 0 ) { diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index bfa8f1d1..b9f067f7 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri void hbs_sm_handler ( void ); /* send the cluster vault to SM */ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); +int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); /* copy cluster data from src to dst */ void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); @@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ); /* Heartbeat service state audit */ void hbs_state_audit ( void ); +/* Send state change message to SM if there has been a + * state change in the last period */ +void hbs_cluster_change_notifier ( void ); + /** * @} hbs_base */ diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 780fa8e3..1f82a4e3 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -69,6 +69,8 @@ typedef struct msgClassSock * sm_socket_ptr ; + string cluster_change_reason ; + } hbs_cluster_ctrl_type ; /* Cluster control structire construct allocation. */ @@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) { ctrl.sm_socket_ptr = sm_socket_ptr ; } + ctrl.cluster_change_reason = ""; + ctrl.log_throttle = 0 ; } @@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller, void hbs_cluster_change ( string cluster_change_reason ) { - hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason ); + ilog ("reason: %s", cluster_change_reason.c_str()); + if ( ctrl.cluster_change_reason.empty() ) + ctrl.cluster_change_reason = cluster_change_reason ; + else + ctrl.cluster_change_reason.append("," + cluster_change_reason) ; +} + +/**************************************************************************** + * + * Name : hbs_cluster_change_notifier + * + * Description : Send SM the cluster info if there has been a state change. + * + ***************************************************************************/ +void hbs_cluster_change_notifier ( void ) +{ + if ( ! ctrl.cluster_change_reason.empty () ) + { + if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0, + ctrl.cluster_change_reason ) == PASS ) + { + ctrl.cluster_change_reason.clear(); + } + } } /**************************************************************************** @@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface, wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT, "Unable to store history beyond %d ", ctrl.cluster.histories ); + hbs_cluster_change_notifier (); return ; } else @@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface, else history_ptr->oldest_entry_index++ ; + hbs_cluster_change_notifier (); + /* clear the log throttle if we are updating history ok. */ ctrl.log_throttle = 0 ; } @@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void ) * ***************************************************************************/ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) +int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) { + int rc = FAIL_SOCKET_SENDTO ; ctrl.cluster.reqid = (unsigned short)reqid ; if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) { - ilog ("cluster state notification Reason: %s", reason.c_str()); int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes(); int bytes = sm_client_sock->write((char*)&ctrl.cluster, len); if ( bytes <= 0 ) @@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n", bytes , errno, strerror(errno)); } - hbs_cluster_dump ( ctrl.cluster ); + else + { + /* limit the string length */ + ilog ("reason: %s", reason.substr(0,80).c_str()); + hbs_cluster_dump ( ctrl.cluster ); + rc = PASS ; + } } else { wlog ("cannot send cluster info due to socket error"); } + return(rc); } /**************************************************************************** @@ -689,7 +726,7 @@ void hbs_history_save ( string hostname, { if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) ) { - hbs_cluster_change ("peer controller cluster event " + + hbs_cluster_change ("peer cluster delta " + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network)); }