Merge "Mtce: Add heartbeat cluster information for SM query"
This commit is contained in:
commit
0362090b73
@ -249,6 +249,44 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
int jsonUtil_get_key_val_int ( char * json_str_ptr,
|
||||
string key,
|
||||
int & value )
|
||||
{
|
||||
/* init to null to avoid trap on early cleanup call with
|
||||
* bad non-null default pointer value */
|
||||
struct json_object *raw_obj = (struct json_object *)(NULL);
|
||||
|
||||
if ((json_str_ptr == NULL) || ( *json_str_ptr == '\0' ) || ( ! strncmp ( json_str_ptr, "(null)" , 6 )))
|
||||
{
|
||||
elog ("Cannot tokenize a null json string\n");
|
||||
elog ("... json string: %s\n", json_str_ptr );
|
||||
return (FAIL);
|
||||
}
|
||||
|
||||
size_t len_before = strlen (json_str_ptr);
|
||||
|
||||
jlog2 ("String: %s\n", json_str_ptr );
|
||||
|
||||
raw_obj = json_tokener_parse( json_str_ptr );
|
||||
if ( raw_obj )
|
||||
{
|
||||
value = jsonUtil_get_key_value_int ( raw_obj, key.data() ) ;
|
||||
jlog1 ("%s:%d\n", key.c_str(), value);
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t len_after = strlen (json_str_ptr);
|
||||
|
||||
elog ("Unable to tokenize string (before:%ld after:%ld);\n", len_before, len_after);
|
||||
elog ("... json string: %s\n", json_str_ptr );
|
||||
}
|
||||
|
||||
if (raw_obj)
|
||||
json_object_put(raw_obj);
|
||||
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/** This utility freads the passed in inventory GET request
|
||||
* response json character string and performs the following
|
||||
|
@ -69,6 +69,10 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
|
||||
string key,
|
||||
string & value );
|
||||
|
||||
int jsonUtil_get_key_val_int ( char * json_str_ptr,
|
||||
string key,
|
||||
int & value );
|
||||
|
||||
/** Submit a request to get an authorization token and nova URL */
|
||||
int jsonApi_auth_request ( string & hostname, string & payload );
|
||||
|
||||
|
@ -114,6 +114,8 @@ typedef struct
|
||||
int event_port ; /**< daemon specific event tx port */
|
||||
int cmd_port ; /**< daemon specific command rx port */
|
||||
int sensor_port ; /**< sensor read value port */
|
||||
int sm_server_port ; /**< port mtce uses to receive data from SM */
|
||||
int sm_client_port ; /**< port mtce uses to send SM data */
|
||||
int start_delay ; /**< startup delay, added for pmon */
|
||||
int api_retries ; /**< api retries before failure */
|
||||
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
|
||||
@ -243,6 +245,19 @@ extern char *program_invocation_short_name;
|
||||
else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
}
|
||||
|
||||
/** Error logger macro with throttling
 *
 * Pre-increments 'cnt' and emits the error log only when the count
 * becomes 1. When the count has reached 'max' it is reset to 0 so that
 * the next invocation logs again ; every other invocation is silent.
 *
 *   cnt    - modifiable counter lvalue supplied by the caller
 *   max    - count at which the throttle restarts
 *   format - printf style format string literal followed by its args
 *
 * Note: 'cnt' is evaluated more than once ; pass a plain lvalue with
 *       no side effects. Both 'cnt' and 'max' are parenthesized in the
 *       expansion so that compound expressions keep their meaning.
 */
#define elog_throttled(cnt,max,format,args...) { \
    if ( ++(cnt) == 1 ) \
    { \
        if (ltc()) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
        else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
    } \
    if ( (cnt) >= (max) ) \
    { \
        (cnt) = 0 ; \
    } \
}
|
||||
|
||||
/** Warning logger macro */
|
||||
#define wlog(format, args...) { \
|
||||
if ( ltc() ) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Warn : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
@ -387,7 +402,9 @@ extern char *program_invocation_short_name;
|
||||
|
||||
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
@ -23,7 +23,9 @@ using namespace std;
|
||||
#include "returnCodes.h"
|
||||
#include "nodeTimers.h"
|
||||
|
||||
#ifndef ALIGN_PACK
|
||||
#define ALIGN_PACK(x) __attribute__((packed)) x
|
||||
#endif
|
||||
|
||||
/* Out-Of-Service Stress tests */
|
||||
#define WANT_SYSINV_API_STRESS 0x00000001
|
||||
@ -359,8 +361,12 @@ void daemon_exit ( void );
|
||||
|
||||
#define CONTROLLER_0 ((const char *)"controller-0")
|
||||
#define CONTROLLER_1 ((const char *)"controller-1")
|
||||
#define CONTROLLER_2 ((const char *)"controller-2")
|
||||
#define CONTROLLER ((const char *)"controller")
|
||||
|
||||
#define STORAGE_0 ((const char *)"storage-0")
|
||||
#define STORAGE_1 ((const char *)"storage-1")
|
||||
|
||||
/* The infrastructure networking floating IP
|
||||
*
|
||||
* Note: If there is no infra then this label will resolve
|
||||
|
@ -267,7 +267,7 @@ bool is_goenabled ( int nodeType, bool pass )
|
||||
return daemon_is_file_present ( file );
|
||||
}
|
||||
|
||||
#define LOG_MEMORY(buf) ilog ("%s", buf ); \
|
||||
#define LOG_MEMORY(buf) syslog ( LOG_INFO, "%s", buf ); \
|
||||
buf_ptr = &buf[0]; \
|
||||
MEMSET_ZERO ( buf );
|
||||
|
||||
@ -279,7 +279,7 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
|
||||
char buf[0x1024] ;
|
||||
char * buf_ptr = &buf[0];
|
||||
MEMSET_ZERO ( buf );
|
||||
ilog ("Dumping Memory:\n");
|
||||
syslog ( LOG_INFO, "Dumping Memory: %ld bytes", bytes );
|
||||
if ( format == 4 )
|
||||
{
|
||||
int loops = bytes/format ;
|
||||
@ -294,7 +294,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
|
||||
buf_ptr += sprintf ( buf_ptr, "%c", *byte_ptr) ;
|
||||
else
|
||||
buf_ptr += sprintf ( buf_ptr, "%c", '.');
|
||||
|
||||
byte_ptr++ ;
|
||||
}
|
||||
LOG_MEMORY(buf);
|
||||
@ -315,7 +314,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
|
||||
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
|
||||
else
|
||||
buf_ptr += sprintf ( buf_ptr , "%c", '.');
|
||||
|
||||
byte_ptr++ ;
|
||||
}
|
||||
LOG_MEMORY(buf);
|
||||
@ -336,21 +334,12 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
|
||||
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
|
||||
else
|
||||
buf_ptr += sprintf ( buf_ptr , "%c", '.');
|
||||
|
||||
byte_ptr++ ;
|
||||
}
|
||||
LOG_MEMORY(buf);
|
||||
word_ptr += 4 ;
|
||||
}
|
||||
}
|
||||
byte_ptr = (uint8_t*)raw_ptr ;
|
||||
ilog ("Raw Hex Dump : %ld\n", bytes );
|
||||
for ( unsigned int x = 0 ; x < bytes ; x++ )
|
||||
{
|
||||
buf_ptr += sprintf ( buf_ptr, " %02x", *byte_ptr );
|
||||
byte_ptr++ ;
|
||||
}
|
||||
// printf ("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
@ -93,7 +93,7 @@
|
||||
#define FAIL_INVALID_DATA (71)
|
||||
#define FAIL_BAD_STATE (72)
|
||||
#define FAIL_KEY_VALUE_PARSE (73)
|
||||
#define FAIL____UNUSED____74 (74)
|
||||
#define FAIL_DATA_SIZE (74)
|
||||
#define FAIL_NOT_FOUND (75)
|
||||
#define FAIL_WORKQ_TIMEOUT (76)
|
||||
#define FAIL_HTTP_DELETE (77)
|
||||
|
@ -207,7 +207,7 @@ int daemon_run_testhead ( void );
|
||||
#define CONFIG_AGENT_INV_PORT 0x00000100 /**< Inventory Port Number */
|
||||
#define CONFIG_AGENT_HA_PORT 0x00000200 /**< HA Framework Port Number */
|
||||
#define CONFIG_CLIENT_MTCALARM_PORT 0x00000400 /**< Send alarm requests to */
|
||||
#define CONFIG_RESERVED_800 0x00000800 /**< */
|
||||
#define CONFIG_AGENT_SM_CLIENT_PORT 0x00000800 /**< Port to Send SM data on */
|
||||
#define CONFIG_MTC_TO_HWMON_CMD_PORT 0x00001000 /**< HWmon Port Number */
|
||||
#define CONFIG_AGENT_KEY_PORT 0x00002000 /**< Keystone HTTP port */
|
||||
#define CONFIG_AGENT_HBS_MTC_PORT 0x00004000 /**< Heartbeat Service Port */
|
||||
@ -217,8 +217,8 @@ int daemon_run_testhead ( void );
|
||||
#define CONFIG_AGENT_MTC_MGMNT_PORT 0x00040000 /**< Agent Infr network port */
|
||||
#define CONFIG_AGENT_TOKEN_REFRESH 0x00080000 /**< Token refresh rate mask */
|
||||
#define CONFIG_CLIENT_MTC_INFRA_PORT 0x00100000 /**< Client Infra nwk mtc port */
|
||||
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
|
||||
#define CONFIG_AGENT_VIM_CMD_PORT 0x00400000 /**< VIM Command Port Mask */
|
||||
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
|
||||
#define CONFIG_AGENT_SM_SERVER_PORT 0x00400000 /**< Port to RX data from SM */
|
||||
#define CONFIG_CLIENT_HBS_INFRA_PORT 0x00800000 /**< Infrastructure ntwk Port */
|
||||
#define CONFIG_CLIENT_HBS_MGMNT_PORT 0x01000000 /**< Management network Port */
|
||||
#define CONFIG_CLIENT_HBS_EVENT_PORT 0x02000000 /**< Heartbeat Event Messaging */
|
||||
|
@ -90,6 +90,15 @@ of spec operating conditions that can reduce outage time through automated
|
||||
notification and recovery thereby improving overall platform availability
|
||||
for the customer.
|
||||
|
||||
%package -n mtce-dev
|
||||
Summary: Titanuim Server Maintenance Software Development Package
|
||||
Group: base
|
||||
Provides: mtce-dev = %{version}-%{release}
|
||||
|
||||
%description -n mtce-dev
|
||||
Titanuim Cloud Maintenance. This package contains header files,
|
||||
and related items necessary for software development.
|
||||
|
||||
%package -n mtce-pmon
|
||||
Summary: Titanuim Server Maintenance Process Monitor Package
|
||||
Group: base
|
||||
@ -424,6 +433,9 @@ install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.logrotate %{buildroot}%
|
||||
install -m 644 -p -D %{_buildsubdir}/hwmon/scripts/hwmon.logrotate %{buildroot}%{local_etc_logrotated}/hwmon.logrotate
|
||||
install -m 644 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.logrotate %{buildroot}%{local_etc_logrotated}/mtcalarm.logrotate
|
||||
|
||||
# software development files
|
||||
install -m 644 -p -D %{_buildsubdir}/heartbeat/mtceHbsCluster.h %{buildroot}/%{_includedir}/mtceHbsCluster.h
|
||||
|
||||
install -m 755 -p -D %{_buildsubdir}/public/libamon.so.$MAJOR %{buildroot}%{_libdir}/libamon.so.$MAJOR
|
||||
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so.$MAJOR.$MINOR
|
||||
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so
|
||||
@ -621,3 +633,10 @@ install -m 755 -d %{buildroot}/var/run
|
||||
%{_sysconfdir}/init.d/hostw
|
||||
%{local_bindir}/hostwd
|
||||
|
||||
###############################
|
||||
# Maintenance Software Development RPM
|
||||
###############################
|
||||
%files -n mtce-dev
|
||||
%defattr(-,root,root,-)
|
||||
|
||||
%{_includedir}/mtceHbsCluster.h
|
||||
|
@ -269,7 +269,7 @@ nodeLinkClass::nodeLinkClass()
|
||||
hbs_ready = false ;
|
||||
hbs_state_change = false ;
|
||||
hbs_disabled = true ;
|
||||
hbs_pulse_period = hbs_pulse_period_save = 200 ;
|
||||
hbs_pulse_period = hbs_pulse_period_save = 0 ;
|
||||
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
|
||||
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
|
||||
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
|
||||
@ -7325,18 +7325,40 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
|
||||
|
||||
int send_event ( string & hostname, unsigned int cmd, iface_enum iface );
|
||||
|
||||
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear )
|
||||
int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
|
||||
{
|
||||
int rc = FAIL ;
|
||||
if ( ! hostname.empty() )
|
||||
nodeLinkClass::node* node_ptr ;
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
nodeLinkClass::node* node_ptr ;
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
bool want_log = true ;
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
node_ptr->monitor[iface] = true_false ;
|
||||
if ( node_ptr->monitor[iface] == true_false )
|
||||
continue ;
|
||||
|
||||
if ( iface == INFRA_IFACE )
|
||||
{
|
||||
if ( this->infra_network_provisioned == false )
|
||||
continue ;
|
||||
|
||||
if ( node_ptr->monitor[MGMNT_IFACE] == true_false )
|
||||
want_log = false ;
|
||||
}
|
||||
|
||||
if ( send_clear == true )
|
||||
{
|
||||
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, (iface_enum)iface ) ;
|
||||
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, (iface_enum)iface ) ;
|
||||
}
|
||||
|
||||
if ( true_false == true )
|
||||
{
|
||||
if ( want_log )
|
||||
{
|
||||
ilog ("%s starting heartbeat service \n",
|
||||
hostname.c_str());
|
||||
}
|
||||
node_ptr->no_work_log_throttle = 0 ;
|
||||
node_ptr->b2b_misses_count[iface] = 0 ;
|
||||
node_ptr->hbs_misses_count[iface] = 0 ;
|
||||
@ -7345,16 +7367,20 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr
|
||||
node_ptr->hbs_failure[iface] = false ;
|
||||
node_ptr->hbs_minor[iface] = false ;
|
||||
node_ptr->hbs_degrade[iface] = false ;
|
||||
if ( send_clear == true )
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( want_log )
|
||||
{
|
||||
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
|
||||
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
|
||||
ilog ("%s stopping heartbeat service\n",
|
||||
hostname.c_str());
|
||||
}
|
||||
}
|
||||
return PASS ;
|
||||
node_ptr->monitor[iface] = true_false ;
|
||||
}
|
||||
return PASS ;
|
||||
}
|
||||
return ( rc );
|
||||
return ( FAIL );
|
||||
}
|
||||
|
||||
/* store the current hardware monitor monitoring state */
|
||||
@ -7887,11 +7913,11 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
|
||||
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
|
||||
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
|
||||
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if ( pulse_list[iface].tail_ptr == pulse_ptr )
|
||||
{
|
||||
qlog2 ("%s Pulse: Multiple Node -> Tail Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
|
||||
@ -7906,19 +7932,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
|
||||
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
|
||||
}
|
||||
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
|
||||
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* July 1 emacdona: Make failure path case more robust */
|
||||
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 4\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 5\n"); rc = FAIL; }
|
||||
|
||||
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
|
||||
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
|
||||
if ( rc == FAIL )
|
||||
{
|
||||
slog ("%s Null pointer error splicing %s out of pulse list with %d pulses remaining (Monitoring:%s)\n",
|
||||
@ -7935,7 +7958,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
}
|
||||
if ( rc == PASS )
|
||||
{
|
||||
pulse_ptr->linknum[iface]-- ; // = 0 ;
|
||||
pulse_ptr->linknum[iface]-- ;
|
||||
}
|
||||
pulses[iface]-- ;
|
||||
}
|
||||
@ -8082,14 +8105,26 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
||||
|
||||
|
||||
|
||||
int nodeLinkClass::lost_pulses ( iface_enum iface )
|
||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
{
|
||||
int rc = PASS ;
|
||||
int lost = 0 ;
|
||||
|
||||
/*
|
||||
* Assume storage-0 is responding until otherwise proven its not.
|
||||
* keep in mind that this interface counts nodes that have not responded ;
|
||||
* not those that have.
|
||||
*/
|
||||
storage_0_responding = true ;
|
||||
|
||||
/*
|
||||
* Loop over the pulse_list which now only contains a list of hosts
|
||||
* that have not responded in this heartbeat period.
|
||||
*/
|
||||
for ( ; pulse_list[iface].head_ptr != NULL ; )
|
||||
{
|
||||
daemon_signal_hdlr ();
|
||||
pulse_ptr = pulse_list[iface].head_ptr ;
|
||||
lost++ ;
|
||||
if ( active )
|
||||
{
|
||||
string flat = "Flat Line:" ;
|
||||
@ -8098,6 +8133,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
|
||||
pulse_ptr->b2b_pulses_count[iface] = 0 ;
|
||||
// pulse_ptr->max_count[iface]++ ;
|
||||
|
||||
/*
|
||||
* Update storage_0_responding reference to false if storage-0
|
||||
* is found in the lost pulse list.
|
||||
*/
|
||||
if ( pulse_ptr->hostname == STORAGE_0 )
|
||||
{
|
||||
storage_0_responding = false ;
|
||||
}
|
||||
|
||||
/* Don't log single misses unless in debug mode */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||
{
|
||||
@ -8156,8 +8200,9 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
|
||||
get_iface_name_str(iface),
|
||||
pulse_ptr->b2b_misses_count[iface] );
|
||||
}
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
mem_log ( flat, pulse_ptr->b2b_misses_count[iface], pulse_ptr->hostname.c_str());
|
||||
|
||||
#endif
|
||||
if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
|
||||
@ -8252,8 +8297,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
|
||||
}
|
||||
rc = remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS );
|
||||
if ( rc != PASS )
|
||||
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
|
||||
{
|
||||
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
@ -8266,7 +8310,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
|
||||
break ;
|
||||
}
|
||||
}
|
||||
return (rc);
|
||||
return (lost);
|
||||
}
|
||||
|
||||
/* Return true if the specified interface is being monitored for this host */
|
||||
@ -8301,7 +8345,7 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
|
||||
|
||||
if ( pulse_list[iface].head_ptr != NULL )
|
||||
{
|
||||
for ( pulse_ptr = pulse_list[iface].head_ptr ;
|
||||
for ( pulse_ptr = pulse_list[iface].head_ptr ;
|
||||
pulse_ptr != NULL ;
|
||||
pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr )
|
||||
{
|
||||
@ -8310,12 +8354,15 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
|
||||
}
|
||||
dlog ("Patients: %s\n", pulse_host_list.c_str());
|
||||
}
|
||||
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
if ( pulses[iface] && !pulse_host_list.empty() )
|
||||
{
|
||||
string temp = get_iface_name_str(iface) ;
|
||||
temp.append(" Patients :") ;
|
||||
mem_log ( temp, pulses[iface], pulse_host_list );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -1940,7 +1940,7 @@ public:
|
||||
void manage_pulse_flags ( string & hostname, unsigned int flags );
|
||||
|
||||
/** Control the heartbeat monitoring state of a host */
|
||||
int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear );
|
||||
int mon_host ( const string & hostname, bool true_false, bool send_clear );
|
||||
|
||||
/** Return true if the pulse list is empty */
|
||||
bool pulse_list_empty ( iface_enum iface );
|
||||
@ -1956,7 +1956,7 @@ public:
|
||||
* that exceed preset thresholds.
|
||||
*
|
||||
*/
|
||||
int lost_pulses ( iface_enum iface );
|
||||
int lost_pulses ( iface_enum iface, bool & storage_0_responding );
|
||||
|
||||
bool monitored_pulse ( string hostname , iface_enum iface );
|
||||
|
||||
|
@ -4,10 +4,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsStubs.cpp
|
||||
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsUtil.cpp hbsCluster.cpp hbsStubs.cpp
|
||||
OBJS = $(SRCS:.cpp=.o)
|
||||
|
||||
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid
|
||||
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid -ljson-c
|
||||
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
|
||||
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
|
||||
|
||||
@ -31,8 +31,8 @@ endif
|
||||
all: static_analysis common agent client
|
||||
|
||||
build: static_analysis $(OBJS)
|
||||
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
|
||||
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o -L../public -L../alarm $(LDLIBS) -o hbsClient
|
||||
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsUtil.o hbsCluster.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
|
||||
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o hbsUtil.o -L../public -L../alarm $(LDLIBS) -o hbsClient
|
||||
|
||||
common:
|
||||
( cd ../common ; make clean ; make lib VER=$(VER) VER_MJR=$(VER_MJR))
|
||||
|
@ -41,6 +41,7 @@ using namespace std;
|
||||
#include "hbsBase.h" /* Heartbeat Base Header File */
|
||||
#include "hbsAlarm.h" /* for ... hbsAlarm_clear_all */
|
||||
#include "alarm.h" /* for ... alarm send message to mtcalarmd */
|
||||
#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */
|
||||
|
||||
/**************************************************************
|
||||
* Implementation Structure
|
||||
@ -68,6 +69,8 @@ using namespace std;
|
||||
/* Number of back to back interface errors before the interface is re-initialized. */
|
||||
#define INTERFACE_ERRORS_FOR_REINIT (8)
|
||||
|
||||
#define MAX_LEN 1000
|
||||
|
||||
/* Historical String data for mem_logs */
|
||||
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
|
||||
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
|
||||
@ -90,6 +93,8 @@ int module_init ( void )
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
static unsigned int controller_number = 0 ;
|
||||
|
||||
void daemon_sigchld_hdlr ( void )
|
||||
{
|
||||
; /* dlog("Received SIGCHLD ... no action\n"); */
|
||||
@ -184,14 +189,16 @@ void daemon_exit ( void )
|
||||
CONFIG_AGENT_HBS_DEGRADE |\
|
||||
CONFIG_AGENT_HBS_FAILURE |\
|
||||
CONFIG_AGENT_MULTICAST |\
|
||||
CONFIG_SCHED_PRIORITY |\
|
||||
CONFIG_SCHED_PRIORITY |\
|
||||
CONFIG_MTC_TO_HBS_CMD_PORT |\
|
||||
CONFIG_HBS_TO_MTC_EVENT_PORT |\
|
||||
CONFIG_AGENT_HBS_MGMNT_PORT |\
|
||||
CONFIG_AGENT_HBS_INFRA_PORT |\
|
||||
CONFIG_CLIENT_HBS_MGMNT_PORT |\
|
||||
CONFIG_CLIENT_MTCALARM_PORT |\
|
||||
CONFIG_CLIENT_HBS_INFRA_PORT )
|
||||
CONFIG_CLIENT_HBS_INFRA_PORT |\
|
||||
CONFIG_AGENT_SM_SERVER_PORT |\
|
||||
CONFIG_AGENT_SM_CLIENT_PORT)
|
||||
|
||||
/* Startup config read */
|
||||
static int hbs_config_handler ( void * user,
|
||||
@ -203,6 +210,8 @@ static int hbs_config_handler ( void * user,
|
||||
|
||||
if (MATCH("agent", "heartbeat_period"))
|
||||
{
|
||||
int curr_period = hbsInv.hbs_pulse_period ;
|
||||
|
||||
config_ptr->hbs_pulse_period = atoi(value);
|
||||
hbsInv.hbs_pulse_period = atoi(value);
|
||||
hbsInv.hbs_state_change = true ;
|
||||
@ -227,10 +236,14 @@ static int hbs_config_handler ( void * user,
|
||||
}
|
||||
}
|
||||
}
|
||||
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
|
||||
if ( curr_period != hbsInv.hbs_pulse_period )
|
||||
{
|
||||
/* initialize cluster info */
|
||||
hbs_cluster_init ( hbsInv.hbs_pulse_period );
|
||||
}
|
||||
}
|
||||
|
||||
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
|
||||
|
||||
if (MATCH("agent", "hbs_minor_threshold"))
|
||||
{
|
||||
config_ptr->hbs_minor_threshold =
|
||||
@ -312,6 +325,16 @@ static int hbs_config_handler ( void * user,
|
||||
config_ptr->hbs_agent_mgmnt_port = atoi(value);
|
||||
config_ptr->mask |= CONFIG_AGENT_HBS_MGMNT_PORT ;
|
||||
}
|
||||
else if (MATCH("agent", "sm_server_port"))
|
||||
{
|
||||
config_ptr->sm_server_port = atoi(value);
|
||||
config_ptr->mask |= CONFIG_AGENT_SM_SERVER_PORT ;
|
||||
}
|
||||
else if (MATCH("agent", "sm_client_port"))
|
||||
{
|
||||
config_ptr->sm_client_port = atoi(value);
|
||||
config_ptr->mask |= CONFIG_AGENT_SM_CLIENT_PORT ;
|
||||
}
|
||||
else if (MATCH("client", "hbs_client_mgmnt_port"))
|
||||
{
|
||||
config_ptr->hbs_client_mgmnt_port = atoi(value);
|
||||
@ -617,6 +640,34 @@ int alarm_port_init ( void )
|
||||
return ( hbs_sock.alarm_sock->return_status ) ;
|
||||
}
|
||||
|
||||
int hbs_sm_sockets_init ( void )
|
||||
{
|
||||
int rc = PASS ;
|
||||
|
||||
/* Create an UDP RX Message Socket for SM Requests; LO interface only */
|
||||
hbs_sock.sm_server_sock = new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP);
|
||||
if ( ! hbs_sock.sm_server_sock )
|
||||
{
|
||||
elog ("Failed to setup SM receive socket");
|
||||
rc = FAIL_SOCKET_CREATE ;
|
||||
}
|
||||
|
||||
/* Create an UDP TX Message Socket for SM Requests; LO interface only */
|
||||
hbs_sock.sm_client_sock = new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP);
|
||||
if ( ! hbs_sock.sm_client_sock )
|
||||
{
|
||||
elog ("Failed to setup SM transmit socket");
|
||||
rc = FAIL_SOCKET_CREATE ;
|
||||
}
|
||||
|
||||
if ( rc == PASS )
|
||||
{
|
||||
hbs_sock.sm_server_sock->sock_ok(true);
|
||||
hbs_sock.sm_client_sock->sock_ok(true);
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* Init the internal/local sockets ; the ones that will not change.
|
||||
* This way we don't miss add and start commands from maintenance. */
|
||||
|
||||
@ -654,6 +705,9 @@ int hbs_int_socket_init ( void )
|
||||
{
|
||||
elog ("Alarm port setup or registration failed (rc:%d)\n", rc );
|
||||
}
|
||||
|
||||
rc = hbs_sm_sockets_init () ;
|
||||
|
||||
return (rc);
|
||||
}
|
||||
|
||||
@ -697,26 +751,36 @@ int hbs_pulse_request ( iface_enum iface,
|
||||
string hostname_clue,
|
||||
unsigned int lookup_clue)
|
||||
{
|
||||
int rc = PASS ;
|
||||
#define MAX_LEN 1000
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
char str[MAX_LEN] ;
|
||||
|
||||
/* Add the sequence number */
|
||||
hbs_sock.tx_mesg[iface].s = seq_num ;
|
||||
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
|
||||
if (( lookup_clue ) &&
|
||||
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
hbs_sock.tx_mesg[iface].c = lookup_clue ;
|
||||
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
|
||||
hostname_clue.data(),
|
||||
hostname_clue.length());
|
||||
}
|
||||
/* Message length is the size of the sequence number, the clue and the buffer */
|
||||
|
||||
int msg_len = (HBS_MAX_MSG+(sizeof(unsigned int)*2)) ;
|
||||
#endif
|
||||
int bytes = 0 ;
|
||||
if ( hbs_sock.tx_sock[iface] )
|
||||
{
|
||||
// int unused_networks = 0 ;
|
||||
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
|
||||
|
||||
/* Add message version - 0 -> 1 with the addition of cluster information */
|
||||
hbs_sock.tx_mesg[iface].v = HBS_MESSAGE_VERSION ;
|
||||
|
||||
/* Add the sequence number */
|
||||
hbs_sock.tx_mesg[iface].s = seq_num ;
|
||||
|
||||
if (( lookup_clue ) &&
|
||||
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
hbs_sock.tx_mesg[iface].c = lookup_clue ;
|
||||
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
|
||||
hostname_clue.data(),
|
||||
hostname_clue.length());
|
||||
}
|
||||
|
||||
/* Append the cluster info to the pulse request */
|
||||
hbs_cluster_append(hbs_sock.tx_mesg[iface]) ;
|
||||
|
||||
/* Calculate the total message size */
|
||||
bytes = sizeof(hbs_message_type)-hbs_cluster_unused_bytes();
|
||||
|
||||
#ifdef WANT_FIT_TESTING
|
||||
if ( daemon_want_fit ( FIT_CODE__NO_PULSE_REQUEST, "any" , get_iface_name_str(iface) ) )
|
||||
{
|
||||
@ -727,14 +791,15 @@ int hbs_pulse_request ( iface_enum iface,
|
||||
goto hbs_pulse_request_out ;
|
||||
}
|
||||
#endif
|
||||
if ( (rc = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], msg_len)) < 0 )
|
||||
|
||||
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
|
||||
{
|
||||
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
|
||||
hbs_sock.tx_mesg[iface].s,
|
||||
&hbs_sock.tx_mesg[iface].m[0],
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
rc, errno, strerror(errno) );
|
||||
bytes, errno, strerror(errno) );
|
||||
return (FAIL_SOCKET_SENDTO);
|
||||
}
|
||||
}
|
||||
@ -748,16 +813,17 @@ int hbs_pulse_request ( iface_enum iface,
|
||||
hbs_pulse_request_out:
|
||||
#endif
|
||||
|
||||
mlog1("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%x:%s\n",
|
||||
get_iface_name_str(iface), rc,
|
||||
mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n",
|
||||
get_iface_name_str(iface), bytes,
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.tx_mesg[iface].v,
|
||||
hbs_sock.tx_mesg[iface].s,
|
||||
hbs_sock.tx_mesg[iface].c,
|
||||
hbs_sock.tx_mesg[iface].f,
|
||||
hbs_sock.tx_mesg[iface].m);
|
||||
|
||||
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
@ -766,6 +832,7 @@ hbs_pulse_request_out:
|
||||
hbs_sock.tx_mesg[iface].c,
|
||||
hbs_sock.tx_mesg[iface].m);
|
||||
mem_log (&str[0]);
|
||||
#endif
|
||||
|
||||
return (PASS);
|
||||
}
|
||||
@ -785,7 +852,7 @@ string get_hostname_from_pulse ( char * msg_ptr )
|
||||
|
||||
int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
{
|
||||
int n = 0 ;
|
||||
int bytes = 0 ;
|
||||
|
||||
int detected_pulses = 0 ;
|
||||
|
||||
@ -796,7 +863,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
do
|
||||
{
|
||||
/* Clean the receive buffer */
|
||||
memset ( hbs_sock.rx_mesg[iface].m, 0, HBS_MAX_MSG );
|
||||
memset ( hbs_sock.rx_mesg[iface].m, 0, sizeof(hbs_message_type) );
|
||||
hbs_sock.rx_mesg[iface].s = 0 ;
|
||||
hbs_sock.rx_mesg[iface].c = 0 ;
|
||||
if ( hbs_sock.rx_sock[iface] == NULL )
|
||||
@ -804,10 +871,10 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
elog ("%s cannot receive pulses - null object\n", get_iface_name_str(iface) );
|
||||
return (0);
|
||||
}
|
||||
if ( (n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
|
||||
if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
|
||||
{
|
||||
mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n",
|
||||
get_iface_name_str(iface), n,
|
||||
get_iface_name_str(iface), bytes,
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
@ -839,7 +906,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
}
|
||||
#endif
|
||||
|
||||
mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
|
||||
// mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
|
||||
if ( !hostname.compare("localhost") )
|
||||
{
|
||||
mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n",
|
||||
@ -868,7 +935,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
{
|
||||
if ( hbsInv.monitored_pulse ( hostname , iface ) == true )
|
||||
{
|
||||
#define MAX_LEN 1000
|
||||
char str[MAX_LEN] ;
|
||||
string extra = "Rsp" ;
|
||||
|
||||
@ -880,25 +946,42 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
{
|
||||
rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ;
|
||||
}
|
||||
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %17s:%5d: %u:%u:%x:%s\n",
|
||||
get_iface_name_str(iface), extra.c_str(), n,
|
||||
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n",
|
||||
get_iface_name_str(iface), extra.c_str(), bytes,
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m);
|
||||
mlog1 ("%s", &str[0]);
|
||||
mlog ("%s", &str[0]);
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
mem_log (str);
|
||||
#endif
|
||||
if ( extra.empty())
|
||||
{
|
||||
detected_pulses++ ;
|
||||
}
|
||||
/* don't save data from self */
|
||||
if ( hostname != hbsInv.my_hostname )
|
||||
{
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
{
|
||||
if ( iface == MGMNT_IFACE )
|
||||
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_MGMT , hbs_sock.rx_mesg[iface]);
|
||||
else
|
||||
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("skipping my hostname");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n",
|
||||
get_iface_name_str(iface), n,
|
||||
get_iface_name_str(iface), bytes,
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
@ -934,7 +1017,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
|
||||
hbs_sock.rx_mesg[iface].m) ;
|
||||
}
|
||||
}
|
||||
} while ( n > 0 ) ;
|
||||
} while ( bytes > 0 ) ;
|
||||
monitor_scheduling ( after_rx_time, before_rx_time, detected_pulses, SCHED_MONITOR__RECEIVER );
|
||||
return (detected_pulses);
|
||||
}
|
||||
@ -951,6 +1034,8 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
|
||||
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
|
||||
{
|
||||
daemon_dump_membuf_banner ();
|
||||
hbsInv.print_node_info ();
|
||||
hbs_cluster_log( hbsInv.my_hostname, "event");
|
||||
daemon_dump_membuf ();
|
||||
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
|
||||
}
|
||||
@ -1038,6 +1123,9 @@ int daemon_init ( string iface, string nodetype )
|
||||
/* Initialize the hbs control struct */
|
||||
MEMSET_ZERO ( hbs_ctrl );
|
||||
|
||||
/* init the utility module */
|
||||
hbs_utils_init ();
|
||||
|
||||
/* initialize the timer */
|
||||
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
|
||||
|
||||
@ -1091,9 +1179,123 @@ int daemon_init ( string iface, string nodetype )
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : hbs_sm_handler
|
||||
*
|
||||
* Description: Try and receive a Service Management request from sm_server_sock
|
||||
*
|
||||
* Expecting request in the following form:
|
||||
* ~66 bytes with moderate spacing
|
||||
*
|
||||
* {
|
||||
* "origin" :"sm",
|
||||
* "service":"heartbeat",
|
||||
* "request":"cluster_info"
|
||||
* "req_id" : number
|
||||
* }
|
||||
*
|
||||
* Successfully parsed request results in a call to
|
||||
* hbs_cluser_send which sends the latest snapshot of
|
||||
* the heartbeat cluser info to SM.
|
||||
*
|
||||
* Assumptions: log flooding is avoided.
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
****************************************************************************/
|
||||
static int _hbs_sm_handler_log_throttle = 0 ;
|
||||
void hbs_sm_handler ( void )
|
||||
{
|
||||
#define _MAX_MSG_LEN (80)
|
||||
#define _MAX_LOG_CNT (1000)
|
||||
|
||||
#define PRIMARY_LABEL "origin"
|
||||
#define SERVICE_LABEL "service"
|
||||
#define REQUEST_LABEL "request"
|
||||
#define REQID_LABEL "reqid"
|
||||
|
||||
#define SUPPORTED_ORIGIN "sm"
|
||||
#define SUPPERTED_SERVICE "heartbeat"
|
||||
#define SUPPORTED_REQUEST "cluster_info"
|
||||
|
||||
char sm_mesg[_MAX_MSG_LEN] ;
|
||||
MEMSET_ZERO(sm_mesg);
|
||||
int bytes = hbs_sock.sm_server_sock->read((char*)&sm_mesg, _MAX_MSG_LEN);
|
||||
if ( bytes )
|
||||
{
|
||||
/* Expecting request in the following form:
|
||||
* { "origin":"sm" ... } */
|
||||
if ( sm_mesg[0] == '{' )
|
||||
{
|
||||
int reqid = 0 ;
|
||||
string origin = "" ;
|
||||
string service = "" ;
|
||||
string request = "" ;
|
||||
if ( jsonUtil_get_key_val ( sm_mesg, PRIMARY_LABEL, origin ) != PASS )
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"missing primary label 'origin' in request.");
|
||||
}
|
||||
else if (( origin == SUPPORTED_ORIGIN ) &&
|
||||
( jsonUtil_get_key_val ( sm_mesg, SERVICE_LABEL, service ) == PASS ) &&
|
||||
( jsonUtil_get_key_val ( sm_mesg, REQUEST_LABEL, request ) == PASS ) &&
|
||||
( jsonUtil_get_key_val_int ( sm_mesg, REQID_LABEL, reqid ) == PASS ))
|
||||
{
|
||||
if (( service == SUPPERTED_SERVICE ) &&
|
||||
( request == SUPPORTED_REQUEST ))
|
||||
{
|
||||
/* success path ... */
|
||||
hbs_cluster_send( hbs_sock.sm_client_sock, reqid );
|
||||
|
||||
/* reset log throttle */
|
||||
_hbs_sm_handler_log_throttle = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"missing service or request labels in request.");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"failed to parse one or more request labels.");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"improperly formatted json string request.");
|
||||
}
|
||||
}
|
||||
else if ( bytes == -1 )
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"message receive error (%d:%s)",
|
||||
errno, strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
|
||||
"unknown error Error (rc:%d)", bytes );
|
||||
}
|
||||
dlog ("... %s", sm_mesg );
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : daemon_service_run
|
||||
*
|
||||
* Description: Daemon's main loop
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void daemon_service_run ( void )
|
||||
{
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
int exp_pulses[MAX_IFACES] ;
|
||||
#endif
|
||||
int rc = PASS ;
|
||||
int counter = 0 ;
|
||||
int goenabled_wait_log_throttle = 0 ;
|
||||
@ -1154,6 +1356,8 @@ void daemon_service_run ( void )
|
||||
daemon_exit ();
|
||||
}
|
||||
|
||||
/* set this controller as provisioned */
|
||||
hbs_manage_controller_state ( hbsInv.my_hostname , true );
|
||||
|
||||
/* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored
|
||||
*
|
||||
@ -1195,6 +1399,16 @@ void daemon_service_run ( void )
|
||||
/* enable the base level signal handler latency monitor */
|
||||
daemon_latency_monitor (true);
|
||||
|
||||
/* load this controller index number - used for cluster stuff */
|
||||
if ( hbsInv.my_hostname == CONTROLLER_0 )
|
||||
controller_number = 0 ;
|
||||
else
|
||||
controller_number = 1 ;
|
||||
|
||||
/* tell the cluster which controller this is and
|
||||
* how many networks are being monitored */
|
||||
hbs_cluster_nums (controller_number,hbsInv.infra_network_provisioned ?2:1);
|
||||
|
||||
/* Run heartbeat service forever or until stop condition */
|
||||
for ( hbsTimer.ring = false ; ; )
|
||||
{
|
||||
@ -1315,6 +1529,14 @@ void daemon_service_run ( void )
|
||||
FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds);
|
||||
}
|
||||
|
||||
/* Add the sm request receiver to the select list */
|
||||
if (( hbs_sock.sm_server_sock ) &&
|
||||
( hbs_sock.sm_server_sock->getFD()))
|
||||
{
|
||||
socks.push_front (hbs_sock.sm_server_sock->getFD());
|
||||
FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds);
|
||||
}
|
||||
|
||||
/* Add the netlink event listener to the select list */
|
||||
if ( hbs_sock.netlink_sock )
|
||||
{
|
||||
@ -1379,6 +1601,11 @@ void daemon_service_run ( void )
|
||||
hbs_sock.fired[INFRA_INTERFACE] = true ;
|
||||
}
|
||||
|
||||
if ((hbs_sock.sm_server_sock != NULL ) &&
|
||||
( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds)))
|
||||
{
|
||||
hbs_sm_handler();
|
||||
}
|
||||
if ((hbs_sock.mtc_to_hbs_sock != NULL ) &&
|
||||
( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds)))
|
||||
{
|
||||
@ -1404,7 +1631,7 @@ void daemon_service_run ( void )
|
||||
inv.nodetype = msg.parm[0];
|
||||
hbsInv.add_heartbeat_host ( inv ) ;
|
||||
hostname_inventory.push_back ( hostname );
|
||||
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype );
|
||||
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] );
|
||||
|
||||
/* clear any outstanding alarms on the ADD */
|
||||
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
|
||||
@ -1415,10 +1642,7 @@ void daemon_service_run ( void )
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_DEL_HOST )
|
||||
{
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
|
||||
}
|
||||
hbsInv.mon_host ( hostname, false, false );
|
||||
hostname_inventory.remove ( hostname );
|
||||
hbsInv.del_host ( hostname );
|
||||
ilog ("%s deleted from heartbeat service\n", hostname.c_str());
|
||||
@ -1432,27 +1656,24 @@ void daemon_service_run ( void )
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_STOP_HOST )
|
||||
{
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, (iface_enum)iface, false, true );
|
||||
}
|
||||
ilog ("%s stopping heartbeat service\n", hostname.c_str());
|
||||
hbsInv.mon_host ( hostname, false, true );
|
||||
hbs_cluster_del ( hostname );
|
||||
|
||||
ilog ("%s stopping heartbeat service\n",
|
||||
hostname.c_str());
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_START_HOST )
|
||||
{
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, (iface_enum)iface, true, true );
|
||||
}
|
||||
ilog ("%s starting heartbeat service\n", hostname.c_str());
|
||||
hbsInv.mon_host ( hostname, true, true );
|
||||
hbs_cluster_add ( hostname );
|
||||
|
||||
ilog ("%s starting heartbeat service\n",
|
||||
hostname.c_str());
|
||||
}
|
||||
else if ( msg.cmd == MTC_RESTART_HBS )
|
||||
{
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
|
||||
hbsInv.mon_host ( hostname, (iface_enum)iface, true, false );
|
||||
}
|
||||
hbsInv.mon_host ( hostname, false, false );
|
||||
hbsInv.mon_host ( hostname, true, false );
|
||||
ilog ("%s restarting heartbeat service\n", hostname.c_str());
|
||||
hbsInv.print_node_info();
|
||||
}
|
||||
@ -1616,7 +1837,9 @@ void daemon_service_run ( void )
|
||||
int rri = 0 ;
|
||||
string lf = "\n" ;
|
||||
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
mem_log ((char*)lf.data());
|
||||
#endif
|
||||
|
||||
/* Get the next Resource Reference Identifier
|
||||
* and its Resource Identifier. These values
|
||||
@ -1630,7 +1853,9 @@ void daemon_service_run ( void )
|
||||
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned == false ))
|
||||
continue ;
|
||||
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
exp_pulses[iface] =
|
||||
#endif
|
||||
hbsInv.hbs_expected_pulses[iface] =
|
||||
hbsInv.create_pulse_list((iface_enum)iface);
|
||||
|
||||
@ -1759,28 +1984,33 @@ void daemon_service_run ( void )
|
||||
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned != true ))
|
||||
continue ;
|
||||
|
||||
#define MAX_LEN 1000
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
char str[MAX_LEN] ;
|
||||
|
||||
snprintf (&str[0], MAX_LEN, "%s Histogram: %d - %s\n",
|
||||
get_iface_name_str(iface),
|
||||
exp_pulses[iface],
|
||||
arrival_histogram[iface].c_str());
|
||||
|
||||
mem_log (str);
|
||||
|
||||
if ( !unexpected_pulse_list[iface].empty() )
|
||||
{
|
||||
snprintf ( &str[0], MAX_LEN, "%s Others : %s\n",
|
||||
get_iface_name_str(iface),
|
||||
unexpected_pulse_list[iface].c_str());
|
||||
|
||||
mem_log(str);
|
||||
}
|
||||
hbsInv.lost_pulses ( (iface_enum)iface );
|
||||
#endif
|
||||
/*
|
||||
* Assume storage-0 is responding until otherwise proven
|
||||
* its not. Keep in mind that the 'lost_pulses' interface
|
||||
* only counts nodes that have not responded.
|
||||
*/
|
||||
bool storage_0_responding = true ;
|
||||
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
|
||||
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
|
||||
}
|
||||
hbsTimer.ring = false ;
|
||||
heartbeat_request = true ;
|
||||
// hbs_cluster_log ( hbsInv.my_hostname, "->") ;
|
||||
seq_num++ ;
|
||||
}
|
||||
daemon_load_fit ();
|
||||
@ -1796,7 +2026,9 @@ void daemon_dump_info ( void )
|
||||
hbsInv.print_node_info ();
|
||||
hbsInv.memDumpAllState ();
|
||||
|
||||
#ifdef WANT_HBS_MEM_LOGS
|
||||
daemon_dump_membuf (); /* write mem_logs to log file and clear log list */
|
||||
#endif
|
||||
}
|
||||
|
||||
const char MY_DATA [100] = { "eieio\n" } ;
|
||||
|
@ -27,6 +27,8 @@
|
||||
#include <signal.h>
|
||||
#include <list>
|
||||
#include "msgClass.h"
|
||||
#include "mtceHbsCluster.h"
|
||||
#include "hbsCluster.h"
|
||||
|
||||
/**
|
||||
* @addtogroup hbs_base
|
||||
@ -38,6 +40,8 @@
|
||||
#endif
|
||||
#define __AREA__ "hbs"
|
||||
|
||||
// #define WANT_CLUSTER_DEBUG
|
||||
|
||||
#define ALIGN_PACK(x) __attribute__((packed)) x
|
||||
|
||||
/** Maximum service fail count before action */
|
||||
@ -56,15 +60,18 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
|
||||
|
||||
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
|
||||
|
||||
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
|
||||
|
||||
/* Heartbeat control structure */
|
||||
typedef struct
|
||||
{
|
||||
unsigned int nodetype ;
|
||||
bool clear_alarms ;
|
||||
} hbs_ctrl_type ;
|
||||
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
|
||||
|
||||
/* A heartbeat service message
|
||||
* if this structire is changed then
|
||||
* if this structure is changed then
|
||||
* hbs_pulse_request needs to be looked at
|
||||
*/
|
||||
typedef struct
|
||||
@ -76,7 +83,7 @@ typedef struct
|
||||
unsigned int s ;
|
||||
|
||||
/* Fast Lookup Clue Info */
|
||||
unsigned int c ;
|
||||
unsigned int c ;
|
||||
|
||||
/* Status Flags
|
||||
* ------------
|
||||
@ -89,6 +96,16 @@ typedef struct
|
||||
/** message version number */
|
||||
unsigned int v ;
|
||||
|
||||
/** Heartbeat cluster information that is put into heartbeat messages.
|
||||
*
|
||||
* Pulse Request : To hbsClient: Only 1 controller with up to 2 network types history.
|
||||
* Pulse Response: From hbsClient: Can include up to 2 controllers with 2 networks each.
|
||||
*
|
||||
* This addition requires message version increment.
|
||||
*
|
||||
**/
|
||||
mtce_hbs_cluster_type cluster ;
|
||||
|
||||
} ALIGN_PACK(hbs_message_type) ;
|
||||
|
||||
|
||||
@ -104,6 +121,12 @@ typedef struct
|
||||
/** Heartbeat Service Event Transmit Interface - hbsClient -> mtcAgent */
|
||||
msgClassSock* hbs_ready_tx_sock;
|
||||
|
||||
/** Heartbeat Service SM Transmit Interface - hbsAgent -> sm */
|
||||
msgClassSock* sm_client_sock;
|
||||
|
||||
/** Heartbeat Service SM Receive Interface - sm -> hbsAgent */
|
||||
msgClassSock* sm_server_sock;
|
||||
|
||||
/** PMON Pulse Receive Interface - pmond -> hbsClient */
|
||||
msgClassSock* pmon_pulse_sock;
|
||||
|
||||
@ -166,6 +189,9 @@ int hbs_refresh_pids ( std::list<procList> & proc_list );
|
||||
int hbs_process_monitor ( std::list<procList> & pmon_list );
|
||||
int hbs_self_recovery ( unsigned int cmd );
|
||||
|
||||
/* returns this controller's number ; 0 or 1 */
|
||||
unsigned int hbs_get_controller_number ( void );
|
||||
|
||||
/* Setup the pulse messaging interfaces
|
||||
* 'p' is a bool that indicates if the infrastructure network is provisioned
|
||||
* 'p' = true means it is provisioned */
|
||||
@ -184,6 +210,93 @@ int hbs_self_recovery ( unsigned int cmd );
|
||||
} \
|
||||
}
|
||||
|
||||
/*********** Common Heartbeat Utilities in hbsUtil.cpp ***************/
|
||||
|
||||
/* module init */
|
||||
void hbs_utils_init ( void );
|
||||
|
||||
/* network enum to name lookup */
|
||||
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
|
||||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
|
||||
|
||||
/* Initialize the specified history array */
|
||||
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
|
||||
|
||||
/* Clear all history in the cluster vault */
|
||||
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
|
||||
|
||||
|
||||
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
|
||||
|
||||
/* Set the cluster vault to default state.
|
||||
* Called upon daemon init or heartbeat period change. */
|
||||
void hbs_cluster_init ( unsigned short period );
|
||||
|
||||
/* Calculate number of bytes that is unused in the cluster data structure.
|
||||
* Primarily to know how many history elements are missing. */
|
||||
unsigned short hbs_cluster_unused_bytes ( void );
|
||||
|
||||
/* Add and delete hosts from the monitored list.
|
||||
* Automatically adjusts the numbers in the cluster vault. */
|
||||
void hbs_cluster_add ( string & hostname );
|
||||
void hbs_cluster_del ( string & hostname );
|
||||
|
||||
/* Report status of storage-0 */
|
||||
void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
|
||||
|
||||
/* Look for and clog changes in cluster state */
|
||||
int hbs_cluster_cmp ( hbs_message_type & msg );
|
||||
|
||||
/* Manage the enabled state of the controllers */
|
||||
void hbs_manage_controller_state ( string & hostname, bool enabled );
|
||||
|
||||
/* Set the number of monitored hosts and this controller's
|
||||
* number in the cluster vault. */
|
||||
void hbs_cluster_nums ( unsigned short this_controller,
|
||||
unsigned short monitored_networks );
|
||||
|
||||
/* Copy/Save the peer controller's cluster info from the hbsClient's
|
||||
* pulse response into the cluster vault so its there and ready for
|
||||
* an SM cluster_info request. */
|
||||
int hbs_cluster_save ( string & hostname,
|
||||
mtce_hbs_network_enum network,
|
||||
hbs_message_type & msg );
|
||||
|
||||
/*
|
||||
* Called by the hbsAgent pulse receiver to create a network specific
|
||||
* history update entry consisting of
|
||||
*
|
||||
* 1. the number of monitored hosts
|
||||
* 2. how many of those that responded in the last heartbeat period.
|
||||
* 3. threshold storage-0 responding count and manage that state in that
|
||||
* networks history header.
|
||||
*/
|
||||
void hbs_cluster_update ( iface_enum iface,
|
||||
unsigned short not_responding_hosts,
|
||||
bool storage_0_responding );
|
||||
|
||||
/* Called by the hbsAgent pulse transmitter to append this controllers
|
||||
* running cluster view in the next multicast pulse request.
|
||||
* The hbsClient is expected to loop this data and any other like data from
|
||||
* the other controller back in its response. */
|
||||
void hbs_cluster_append ( hbs_message_type & msg );
|
||||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, string prefix );
|
||||
|
||||
/* Service SM cluster info request */
|
||||
void hbs_sm_handler ( void );
|
||||
|
||||
/* send the cluster vault to SM */
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
|
||||
|
||||
/* print the contents of the vault */
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
||||
|
||||
/**
|
||||
* @} hbs_base
|
||||
*/
|
||||
|
@ -20,7 +20,6 @@
|
||||
* daemon_files_init
|
||||
* daemon_configure
|
||||
* daemon_signal_init
|
||||
* hbs_message_init
|
||||
* hbs_socket_init
|
||||
*
|
||||
* daemon_service_run
|
||||
@ -59,7 +58,7 @@ using namespace std;
|
||||
#include "daemon_option.h" /* Common options for daemons */
|
||||
#include "nodeTimers.h" /* for ... maintenance timers */
|
||||
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
|
||||
#include "nlEvent.h" /* for ... open_netlink_socket */
|
||||
#include "nlEvent.h" /* for ... open_netlink_socket */
|
||||
#include "hbsBase.h" /* Heartbeat Base Header File */
|
||||
|
||||
extern "C"
|
||||
@ -95,8 +94,9 @@ typedef struct
|
||||
std::list<procList>::iterator proc_ptr ;
|
||||
} stallMon_type ;
|
||||
|
||||
|
||||
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
|
||||
static char my_hostname [MAX_HOST_NAME_SIZE+1];
|
||||
static char my_hostname_length ;
|
||||
static string my_macaddr = "" ;
|
||||
static string my_address = "" ;
|
||||
static unsigned int my_nodetype= CGTS_NODE_NULL ;
|
||||
@ -360,6 +360,12 @@ static int hbs_config_handler ( void * user,
|
||||
config_ptr->pmon_pulse_port = atoi(value);
|
||||
config_ptr->mask |= CONFIG_CLIENT_PULSE_PORT ;
|
||||
}
|
||||
#ifdef WANT_CLUSTER_DEBUG
|
||||
else if (MATCH("agent", "sm_client_port"))
|
||||
{
|
||||
config_ptr->sm_client_port = atoi(value);
|
||||
}
|
||||
#endif
|
||||
else
|
||||
{
|
||||
return (PASS);
|
||||
@ -446,20 +452,6 @@ int daemon_configure ( void )
|
||||
/* Initialization Utilities */
|
||||
/****************************/
|
||||
|
||||
/* Initialize the unicast pulse response message */
|
||||
/* One time thing ; tx same message all the time. */
|
||||
int hbs_message_init ( void )
|
||||
{
|
||||
/* Build the transmit pulse response message for each interface */
|
||||
for ( int i = 0 ; i < MAX_IFACES ; i++ )
|
||||
{
|
||||
memset ( &hbs_sock.tx_mesg[i], 0, sizeof (hbs_message_type));
|
||||
memcpy ( &hbs_sock.tx_mesg[i].m[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
|
||||
memcpy ( &hbs_sock.tx_mesg[i].m[HBS_HEADER_SIZE], my_hostname, strlen(my_hostname));
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* Initialize pulse messaging for the specified interface
|
||||
* This is called by a macro defined in hbsBase.h */
|
||||
int _setup_pulse_messaging ( iface_enum i, int rmem )
|
||||
@ -621,6 +613,11 @@ int hbs_socket_init ( void )
|
||||
return (FAIL_SOCKET_NOBLOCK);
|
||||
}
|
||||
|
||||
#ifdef WANT_CLUSTER_DEBUG
|
||||
hbs_sock.sm_client_sock = new msgClassRx(LOOPBACK_IP,hbs_config.sm_client_port,IPPROTO_UDP);
|
||||
if ( rc ) return (rc) ;
|
||||
hbs_sock.sm_client_sock->sock_ok(true);
|
||||
#endif
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
@ -648,7 +645,7 @@ int get_pmon_pulses ( void )
|
||||
if ( !strncmp ( &msg.hdr[0] , get_pmond_pulse_header(), MSG_HEADER_SIZE ))
|
||||
{
|
||||
pulses++ ;
|
||||
mlog ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
|
||||
mlog1 ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -710,92 +707,87 @@ static unsigned int my_rri = 0 ;
|
||||
static int rx_error_count[MAX_IFACES] = {0,0} ;
|
||||
static int tx_error_count[MAX_IFACES] = {0,0} ;
|
||||
|
||||
#define ERROR_LOG_THRESHOLD (200)
|
||||
|
||||
int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
{
|
||||
unsigned int s = 0 ; /* Sequence number */
|
||||
int n = 0 ; /* message size */
|
||||
int rc = 0 ;
|
||||
|
||||
if (( iface != MGMNT_IFACE ) && ( iface != INFRA_IFACE ))
|
||||
return (FAIL_BAD_CASE);
|
||||
|
||||
memset ( (char*) &hbs_sock.rx_mesg[iface], 0, sizeof(hbs_message_type));
|
||||
if ( ! hbs_sock.rx_sock[iface] )
|
||||
{
|
||||
elog ("cannot receive from null rx_mesg[%s] socket\n", get_iface_name_str(iface) );
|
||||
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"cannot receive from null rx_mesg[%s] socket\n",
|
||||
get_iface_name_str(iface) );
|
||||
return (FAIL_TO_RECEIVE);
|
||||
}
|
||||
else if ( hbs_sock.rx_sock[iface]->sock_ok() == false )
|
||||
else if ( ! hbs_sock.tx_sock[iface] )
|
||||
{
|
||||
elog ("cannot receive from failed rx_mesg[%s] socket\n", get_iface_name_str(iface) );
|
||||
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"cannot send to null mesg[%s] socket\n",
|
||||
get_iface_name_str(iface) );
|
||||
return (FAIL_TO_TRANSMIT);
|
||||
}
|
||||
else if ( ! hbs_sock.rx_sock[iface]->sock_ok() )
|
||||
{
|
||||
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"cannot receive from failed rx_mesg[%s] socket\n",
|
||||
get_iface_name_str(iface) );
|
||||
return (FAIL_TO_RECEIVE);
|
||||
}
|
||||
|
||||
n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
|
||||
|
||||
if( n < HBS_HEADER_SIZE )
|
||||
else if ( ! hbs_sock.tx_sock[iface]->sock_ok() )
|
||||
{
|
||||
rx_error_count[iface]++ ;
|
||||
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"cannot send to failed mesg[%s] socket\n",
|
||||
get_iface_name_str(iface) );
|
||||
return (FAIL_TO_TRANSMIT);
|
||||
}
|
||||
|
||||
/* throtle the log so that if they come back-to-back we avoid flooding */
|
||||
if ( n == -1 )
|
||||
// MEMSET_ZERO(hbs_sock.rx_mesg[iface]);
|
||||
int rx_bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
|
||||
if ( rx_bytes < HBS_HEADER_SIZE )
|
||||
{
|
||||
if ( rx_bytes == -1 )
|
||||
{
|
||||
if ( rx_error_count[iface] > 1 )
|
||||
{
|
||||
wlog_throttled ( rx_error_count[iface], 500, "%s receive error (%d:%m)\n", get_iface_name_str(iface), errno );
|
||||
}
|
||||
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"%s receive error (%d:%m)\n",
|
||||
get_iface_name_str(iface), errno );
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog_throttled ( rx_error_count[iface], 500, "%s message underrun (expected %ld but got %d)\n",
|
||||
get_iface_name_str(iface), sizeof(hbs_message_type), n );
|
||||
}
|
||||
if ( rx_error_count[iface] == 100 )
|
||||
{
|
||||
wlog ( "%s is getting a lot of receive errors (%d:%m)\n", get_iface_name_str(iface), errno );
|
||||
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"%s message underrun (expected %ld but got %d)\n",
|
||||
get_iface_name_str(iface),
|
||||
sizeof(hbs_message_type), rx_bytes );
|
||||
}
|
||||
return (FAIL_TO_RECEIVE);
|
||||
}
|
||||
|
||||
/* Clear the error count since we got a good receive */
|
||||
rx_error_count[iface] = 0 ;
|
||||
|
||||
#ifdef WANT_NO_SELF_HEARTBEAT_REPLY
|
||||
/* Don't reply to the heartbeat if the request came from myself */
|
||||
if ( ! strncmp ( my_address.data(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
MAX_CHARS_IN_IP_ADDR ))
|
||||
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
|
||||
if ( cfg_ptr->debug_msg )
|
||||
{
|
||||
ilog ("%s Refusing to send heartbeat response to self\n", hbs_sock.rx_sock[iface]->get_dst_addr()->toString());
|
||||
return (PASS);
|
||||
mlog ("\n");
|
||||
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c);
|
||||
}
|
||||
#else
|
||||
/* We use this to monitor pmond on active controller */
|
||||
#endif
|
||||
|
||||
/* Save the sequence number */
|
||||
s = hbs_sock.rx_mesg[iface].s ;
|
||||
|
||||
mlog ("\n");
|
||||
mlog ("%s Pulse Req: %s:%5d: %d: :%s RRI:%d\n", get_iface_name_str(iface),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c);
|
||||
|
||||
/* verify the message header */
|
||||
if ( strncmp ( (const char *)&hbs_sock.rx_mesg[iface].m, (const char *)&req_msg_header, HBS_HEADER_SIZE ))
|
||||
{
|
||||
wlog_throttled ( rx_error_count[iface], 200, "%s Invalid header (%d:%s)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].m );
|
||||
|
||||
mlog ("Detected: %d <%s>\n", HBS_HEADER_SIZE,hbs_sock.rx_mesg[iface].m);
|
||||
mlog ("Expected: %d <%s>\n", HBS_HEADER_SIZE,req_msg_header);
|
||||
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"%s Invalid header (%d:%s)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].m );
|
||||
return (FAIL_MSG_HEADER) ;
|
||||
}
|
||||
|
||||
|
||||
/* Manage the Resource Reference Index (RRI) "lookup clue" */
|
||||
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
@ -807,32 +799,31 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
}
|
||||
|
||||
/* Add my RRI to the response message */
|
||||
hbs_sock.tx_mesg[iface].c = my_rri ;
|
||||
hbs_sock.rx_mesg[iface].c = my_rri ;
|
||||
|
||||
/* Clear struct */
|
||||
hbs_sock.tx_mesg[iface].s = s ;
|
||||
hbs_sock.tx_mesg[iface].f = flags ;
|
||||
/* Manage OOB flags */
|
||||
hbs_sock.rx_mesg[iface].f = flags ;
|
||||
if ( pmonPulse_counter )
|
||||
{
|
||||
hbs_sock.tx_mesg[iface].f |= ( PMOND_FLAG ) ;
|
||||
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
|
||||
}
|
||||
if ( infra_network_provisioned == true )
|
||||
{
|
||||
hbs_sock.tx_mesg[iface].f |= INFRA_FLAG ;
|
||||
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
|
||||
}
|
||||
|
||||
n = (int)sizeof(hbs_message_type) ;
|
||||
|
||||
if ( ! hbs_sock.tx_sock[iface] )
|
||||
#define WANT_CLUSTER_INFO_LOG
|
||||
#ifdef WANT_CLUSTER_INFO_LOG
|
||||
/* Log the received cluster info */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
{
|
||||
elog ("cannot send to null tx_mesg[%s] socket\n", get_iface_name_str(iface) );
|
||||
return (FAIL_TO_TRANSMIT);
|
||||
}
|
||||
else if ( hbs_sock.tx_sock[iface]->sock_ok() == false )
|
||||
{
|
||||
elog ("cannot send to failed tx_mesg[%s] socket\n", get_iface_name_str(iface) );
|
||||
return (FAIL_TO_TRANSMIT);
|
||||
char str[100] ;
|
||||
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
|
||||
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
|
||||
string hostname = my_hostname ;
|
||||
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WANT_PULSE_RESPONSE_FIT
|
||||
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
|
||||
@ -848,44 +839,69 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Send pulse response message with sequence number, flags and resource referecen index */
|
||||
rc = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.tx_mesg[iface], n);
|
||||
if ( rc == -1 )
|
||||
int rc = PASS ;
|
||||
|
||||
/* replace the request header with the response header */
|
||||
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
|
||||
|
||||
/* Deal with the cluster info if it exists.
|
||||
* ... Introduced in messaging version 1 */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
{
|
||||
elog ("Failed to sendto socket %d through %s:%d len:%d (%s) (%d:%s)\n",
|
||||
hbs_sock.tx_sock[iface]->getFD(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
|
||||
get_iface_name_str(iface), errno, strerror(errno));
|
||||
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
|
||||
{
|
||||
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
|
||||
}
|
||||
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
|
||||
// {
|
||||
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
|
||||
// }
|
||||
|
||||
/* Add peer controller cluster data to this controller's response */
|
||||
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
|
||||
}
|
||||
else if ( rc != n)
|
||||
|
||||
/* send pulse response message */
|
||||
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
|
||||
if ( tx_bytes == -1 )
|
||||
{
|
||||
/* Avoid log flooding
|
||||
elog ("unicast send failed. (%d)\n", rc); */
|
||||
wlog_throttled ( tx_error_count[iface], 200,
|
||||
"%s Pulse Rsp: %d:%d bytes < %d:%s > to <%s>\n",
|
||||
get_iface_name_str(iface), n, rc,
|
||||
hbs_sock.tx_mesg[iface].s,
|
||||
&hbs_sock.tx_mesg[iface].m[0],
|
||||
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"pulse tx failed %d:%s:%d len:%d (%s) (%d:%s)\n",
|
||||
hbs_sock.tx_sock[iface]->getFD(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
|
||||
get_iface_name_str(iface), errno, strerror(errno));
|
||||
}
|
||||
else if ( tx_bytes != rx_bytes)
|
||||
{
|
||||
wlog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
|
||||
"%s Pulse Rsp: %d:%d bytes < %d:%s >",
|
||||
get_iface_name_str(iface), rx_bytes, tx_bytes,
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
&hbs_sock.rx_mesg[iface].m[0]);
|
||||
return (rc);
|
||||
rc = FAIL_DATA_SIZE ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d)\n",
|
||||
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.tx_mesg[iface].s,
|
||||
hbs_sock.tx_mesg[iface].f,
|
||||
hbs_sock.tx_mesg[iface].m,
|
||||
hbs_sock.tx_mesg[iface].c,
|
||||
pmonPulse_counter);
|
||||
/* Clear the error count since we got a good transmit */
|
||||
tx_error_count[iface] = 0 ;
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
pmonPulse_counter, rx_bytes, tx_bytes);
|
||||
}
|
||||
return PASS;
|
||||
|
||||
/* Clear the error count since we got a good receive */
|
||||
if ( rx_error_count[iface] )
|
||||
rx_error_count[iface] = 0 ;
|
||||
if ( tx_error_count[iface] )
|
||||
tx_error_count[iface] = 0 ;
|
||||
|
||||
return rc ;
|
||||
}
|
||||
|
||||
#ifdef WANT_FIT_TESTING
|
||||
@ -968,6 +984,9 @@ int daemon_init ( string iface, string nodeType_str )
|
||||
/* Initialize socket construct and pointer to it */
|
||||
memset ( &hbs_sock, 0, sizeof(hbs_sock));
|
||||
|
||||
/* init the utility module */
|
||||
hbs_utils_init ();
|
||||
|
||||
/* Defaults */
|
||||
hbs_config.stall_pmon_thld = -1 ;
|
||||
hbs_config.stall_mon_period = MTC_HRS_8 ;
|
||||
@ -1025,12 +1044,6 @@ int daemon_init ( string iface, string nodeType_str )
|
||||
rc = FAIL_DAEMON_CONFIG ;
|
||||
}
|
||||
|
||||
/* Init the heartbeat transmit pulse response message */
|
||||
else if ( hbs_message_init () != PASS )
|
||||
{
|
||||
elog ("Failed to initialize pulse response message\n");
|
||||
rc = FAIL_MESSAGE_INIT ;
|
||||
}
|
||||
/* Setup the heartbeat service messaging sockets */
|
||||
else if ( hbs_socket_init () != PASS )
|
||||
{
|
||||
@ -1119,6 +1132,11 @@ void daemon_service_run ( void )
|
||||
ilog ("Sending Heartbeat Ready Event\n");
|
||||
hbs_send_event ( MTC_EVENT_MONITOR_READY );
|
||||
|
||||
my_hostname_length = strlen(my_hostname) ;
|
||||
memset ( &pulse_resp_tx_hdr[0], 0, HBS_MAX_MSG );
|
||||
memcpy ( &pulse_resp_tx_hdr[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
|
||||
memcpy ( &pulse_resp_tx_hdr[HBS_HEADER_SIZE], my_hostname, my_hostname_length );
|
||||
|
||||
/* Run heartbeat service forever or until stop condition */
|
||||
for ( ; ; )
|
||||
{
|
||||
@ -1153,7 +1171,9 @@ void daemon_service_run ( void )
|
||||
FD_SET(hbs_sock.pmon_pulse_sock->getFD(),&hbs_sock.readfds);
|
||||
FD_SET(hbs_sock.amon_socket, &hbs_sock.readfds);
|
||||
FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds);
|
||||
|
||||
#ifdef WANT_CLUSTER_DEBUG
|
||||
FD_SET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds);
|
||||
#endif
|
||||
rc = select( socks.back()+1,
|
||||
&hbs_sock.readfds, NULL, NULL,
|
||||
&hbs_sock.waitd);
|
||||
@ -1176,6 +1196,19 @@ void daemon_service_run ( void )
|
||||
/* Only service sockets for the rc > 0 case */
|
||||
else if ( rc )
|
||||
{
|
||||
#ifdef WANT_CLUSTER_DEBUG
|
||||
if ( hbs_sock.sm_client_sock && FD_ISSET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds ) )
|
||||
{
|
||||
mtce_hbs_cluster_type msg ;
|
||||
/* Receive event messages */
|
||||
memset ( &msg , 0, sizeof(mtce_hbs_cluster_type));
|
||||
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
|
||||
if ( bytes )
|
||||
{
|
||||
hbs_cluster_dump (msg);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (hbs_sock.rx_sock[MGMNT_IFACE]&&FD_ISSET(hbs_sock.rx_sock[MGMNT_IFACE]->getFD(), &hbs_sock.readfds))
|
||||
{
|
||||
/* Receive pulse request and send a response */
|
||||
|
748
mtce/src/heartbeat/hbsCluster.cpp
Normal file
748
mtce/src/heartbeat/hbsCluster.cpp
Normal file
@ -0,0 +1,748 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* @file Maintenance Heartbeat Agent Cluster Manager Module
|
||||
*
|
||||
*************************************************************************
|
||||
*
|
||||
* This module provides the heartbeat cluster implementation member
|
||||
* functions that the hbsAgent service calls to collect, store and
|
||||
* send heartbeat cluster information to SM upon request.
|
||||
*
|
||||
* See mtceHbsCluster.h for formal API between SM and Mtce.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "nodeBase.h" /* common maintenance constructs and definitions */
|
||||
#include "daemon_common.h" /* common daemon constructs and definitions */
|
||||
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
|
||||
|
||||
/* Error log throttle counter. */
|
||||
#define THROTTLE_COUNT (500)
|
||||
|
||||
/* Private Heartbeat Cluster Control Structure. */
|
||||
typedef struct
|
||||
{
|
||||
/* Contains the controller number (0 or 1) for this controller. */
|
||||
unsigned short this_controller ;
|
||||
|
||||
/* Preserves which controllers are enabled. */
|
||||
bool controller_0_enabled ;
|
||||
bool controller_1_enabled ;
|
||||
#ifdef THREE_CONTROLLER_SYSTEM
|
||||
bool controller_2_enabled ;
|
||||
#endif
|
||||
|
||||
/* Used to prevent log flooding in presence of back to back errors. */
|
||||
unsigned int log_throttle ;
|
||||
|
||||
/* Used to threshold storage-0 not responding state */
|
||||
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
|
||||
|
||||
/* Contains the number of monitored networks in the system.
|
||||
* Management only = 1
|
||||
* Management and Inrastructure = 2 */
|
||||
unsigned short monitored_networks ;
|
||||
|
||||
/* This contains the current number of heartbeat enabled hosts.
|
||||
*
|
||||
* Used to improve performance.
|
||||
*
|
||||
* Performance: This value is included in each history entry so
|
||||
* rather than do the size calculation of monitored_hostname_list
|
||||
* each time, this variable is updated from monitored_hostname_list
|
||||
* after each add/del operation. */
|
||||
unsigned short monitored_hosts ;
|
||||
|
||||
/* List of host names being monitored. */
|
||||
std::list<string>monitored_hostname_list ;
|
||||
|
||||
/* The working heartbeat cluster data vault. */
|
||||
mtce_hbs_cluster_type cluster ;
|
||||
|
||||
} hbs_cluster_ctrl_type ;
|
||||
|
||||
/* Cluster control structire construct allocation. */
|
||||
static hbs_cluster_ctrl_type ctrl ;
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_init
|
||||
*
|
||||
* Description : Initialize the cluster structure to default values.
|
||||
*
|
||||
* Assumptions : Called by hbsAgent.cpp before entering the main loop.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_init ( unsigned short period )
|
||||
{
|
||||
ctrl.monitored_hosts = 0;
|
||||
ctrl.monitored_hostname_list.clear();
|
||||
|
||||
/* Init the cluster - header. */
|
||||
ctrl.cluster.version = MTCE_HBS_CLUSTER_VERSION ;
|
||||
ctrl.cluster.revision = MTCE_HBS_CLUSTER_REVISION ;
|
||||
ctrl.cluster.magic_number = MTCE_HBS_MAGIC_NUMBER ;
|
||||
|
||||
/* Init the cluster - global / dynamic data. */
|
||||
ctrl.cluster.reqid = 0 ;
|
||||
ctrl.cluster.period_msec = period ;
|
||||
ctrl.cluster.storage0_enabled = false ;
|
||||
ctrl.cluster.histories = 0 ;
|
||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
||||
|
||||
/* The storage-0 thresholding counter for each network. */
|
||||
for ( int n = 0 ; n < MTCE_HBS_NETWORKS ; n++ )
|
||||
ctrl.storage_0_not_responding_count[n] = 0 ;
|
||||
|
||||
for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
|
||||
hbs_cluster_history_init ( ctrl.cluster.history[h] );
|
||||
|
||||
ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
|
||||
ctrl.cluster.version,
|
||||
ctrl.cluster.revision,
|
||||
ctrl.cluster.magic_number,
|
||||
ctrl.cluster.bytes,
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_nums
|
||||
*
|
||||
* Description : Set this controller number and the number of monitored
|
||||
* networks in this system.
|
||||
*
|
||||
* These values do not change without a process restart.
|
||||
*
|
||||
* Assumptions : Called by hbsAgent.cpp before entering the main loop.
|
||||
*
|
||||
* Returns : None
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_nums ( unsigned short this_controller,
|
||||
unsigned short monitored_networks )
|
||||
{
|
||||
ctrl.this_controller = this_controller ;
|
||||
ctrl.monitored_networks = monitored_networks ;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : log_monitored_hosts_list
|
||||
*
|
||||
* Description : Log the list of monitored hosts.
|
||||
* Typically done on a list change.
|
||||
*
|
||||
* Returns : None
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void log_monitored_hosts_list ( void )
|
||||
{
|
||||
std::list<string>::iterator iter_ptr ;
|
||||
string list = "" ;
|
||||
for ( iter_ptr = ctrl.monitored_hostname_list.begin() ;
|
||||
iter_ptr != ctrl.monitored_hostname_list.end() ;
|
||||
iter_ptr++ )
|
||||
{
|
||||
list.append (*(iter_ptr));
|
||||
list.append (" ");
|
||||
}
|
||||
ilog ("cluster of %ld: %s",
|
||||
ctrl.monitored_hostname_list.size(),
|
||||
list.c_str());
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : cluster_storage0_state
|
||||
*
|
||||
* Description : Record the heartbeat monitoring state of storage-0.
|
||||
*
|
||||
* Parameters : true if storage-0 heartbeating is in the 'started' state.
|
||||
* false if storage-0 heartbeating is in the 'stopped' state.
|
||||
*
|
||||
* Returns : None
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void cluster_storage0_state ( bool enabled )
|
||||
{
|
||||
if ( ctrl.cluster.storage0_enabled != enabled )
|
||||
{
|
||||
ctrl.cluster.storage0_enabled = enabled ;
|
||||
ilog ("storage-0 heartbeat state changed to %s",
|
||||
enabled ? "enabled" : "disabled" );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_manage_controller_state
|
||||
*
|
||||
* Description : Track the monitored enabled state of the controllers.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
    /* Select the enabled flag that belongs to the named controller.
     * Stays NULL when the hostname is not a known controller. */
    bool * enabled_ptr = NULL ;

    if ( hostname == CONTROLLER_0 )
    {
        enabled_ptr = &ctrl.controller_0_enabled ;
    }
    else if ( hostname == CONTROLLER_1 )
    {
        enabled_ptr = &ctrl.controller_1_enabled ;
    }
#ifdef THREE_CONTROLLER_SYSTEM
    else if ( hostname == CONTROLLER_2 )
    {
        enabled_ptr = &ctrl.controller_2_enabled ;
    }
#endif

    /* Non-controller hosts leave the controller state untouched. */
    if ( enabled_ptr != NULL )
    {
        *enabled_ptr = enabled ;
    }
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_add
|
||||
*
|
||||
* Description : Add the specified hostname to the enabled hosts list.
|
||||
*
|
||||
* Updates : hostname is added to monitored_hostname_list
|
||||
*
|
||||
* If added host is storage-0 then update its enabled status.
|
||||
* if added host is a controller then update controller state.
|
||||
*
|
||||
* Parameters : hostname string
|
||||
*
|
||||
* Updates : monitored_hostname_list
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_add ( string & hostname )
{
    /* Remove before add so the hostname is never listed twice.
     * Consider using 'unique' after instead of remove before update. */
    ctrl.monitored_hostname_list.remove(hostname) ;
    ctrl.monitored_hostname_list.push_back(hostname) ;

    /* Refresh the cached host count from the list. */
    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();

    /* Manage storage-0 state */
    if ( hostname == STORAGE_0 )
    {
        cluster_storage0_state ( true );
    }

    /* Note: there is deliberately no 'start fresh on 0 monitored hosts'
     * handling here. The list holds at least the host just added, so a
     * re-init could only trigger on a count wrap and would then wipe
     * the host that was just added. hbs_cluster_del handles the
     * 'down to 0 hosts' case. */

    /* Manage controller state ; true means enabled in this case. */
    hbs_manage_controller_state ( hostname, true );

    ilog ("%s added to cluster", hostname.c_str());

    log_monitored_hosts_list ();
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_del
|
||||
*
|
||||
* Description : Delete the specified hostname from the enabled hosts list.
|
||||
*
|
||||
* Updates : hostname is removed from monitored_hostname_list
|
||||
*
|
||||
* If deleted host is storage-0 then update its enabled status.
|
||||
* If deleted host is a controller then update controller state.
|
||||
*
|
||||
* Parameters : hostname string
|
||||
*
|
||||
* Updates : monitored_hostname_list
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_del ( string & hostname )
{
    /* Drop the host from the list and refresh the cached host count. */
    ctrl.monitored_hostname_list.remove(hostname) ;
    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();

    /* storage-0 is no longer heartbeat enabled once it is deleted. */
    const bool is_storage_0 = ( hostname == STORAGE_0 ) ;
    if ( is_storage_0 )
    {
        cluster_storage0_state ( false );
    }

    /* With no monitored hosts left, re-initialize the cluster
     * keeping the current heartbeat period. */
    if ( ! ctrl.monitored_hosts )
    {
        hbs_cluster_init ( ctrl.cluster.period_msec );
    }

    /* Controller state ; false means not enabled in this case. */
    hbs_manage_controller_state ( hostname , false );

    ilog ("%s deleted from cluster", hostname.c_str());

    log_monitored_hosts_list ();
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_update
|
||||
*
|
||||
* Description : Update this controller's cluster info for the specified
|
||||
* network with
|
||||
*
|
||||
* 1. The number of enabled hosts.
|
||||
* 2. The number of responding hosts.
|
||||
* 3. The oldest history index in the rotational history fifo.
|
||||
* 4. Maintain a back to back non-responding count for storage-0.
|
||||
* Once the count reaches the minimum threshold of
|
||||
* STORAGE_0_NR_THRESHOLD then the specific network history
|
||||
* is updated to indicate storage-0 is not responding. Once
|
||||
* storage-0 starts responding again with a single response
|
||||
* then that network history is updated to indicate storage-0
|
||||
* is responding.
|
||||
*
|
||||
* Assumptions : Converts heartbeat interface number to cluster network number.
|
||||
*
|
||||
* Parameters : heartbeat interface number ( iface_enum )
|
||||
* network index
|
||||
* number of not responding hosts for this interval
|
||||
*
|
||||
* Updates : This and last history as well as storage-0 not responding
|
||||
* count.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#define STORAGE_0_NR_THRESHOLD (4)
|
||||
|
||||
void hbs_cluster_update ( iface_enum iface,
                          unsigned short not_responding_hosts,
                          bool storage_0_responding )
{
    /* Nothing to record while no hosts are being monitored. */
    if ( ctrl.monitored_hosts == 0 )
        return ;

    /* convert heartbeat iface enum to cluster network enum. */
    mtce_hbs_network_enum n ;
    if ( iface == MGMNT_IFACE )
        n = MTCE_HBS_NETWORK_MGMT ;
    else if ( iface == INFRA_IFACE )
        n = MTCE_HBS_NETWORK_INFRA ;
#ifdef MONITORED_OAM_NETWORK
    else if ( iface == OAM_IFACE )
        n = MTCE_HBS_NETWORK_OAM ;
#endif
    else
        return ;

    if ( not_responding_hosts )
    {
        clog1 ("controller-%d %s enabled:%d not responding:%d",
                ctrl.this_controller,
                hbs_cluster_network_name(n).c_str(),
                ctrl.monitored_hosts,
                not_responding_hosts);
    }
    else
    {
        clog1 ("controller-%d %s has %d monitored hosts and all are responding",
                ctrl.this_controller,
                hbs_cluster_network_name(n).c_str(),
                ctrl.monitored_hosts);
    }

    /* The responding count is the enabled count less the misses.
     * Clamp at zero so that a not-responding count that exceeds the
     * cached host count cannot wrap the unsigned subtraction. */
    unsigned short responding_hosts = 0 ;
    if ( not_responding_hosts < ctrl.monitored_hosts )
        responding_hosts = ctrl.monitored_hosts - not_responding_hosts ;

    /* Look-up active history array for this network combination.
     * NOTE(review): GET_CLUSTER_HISTORY_PTR is defined outside this file
     * view ; it presumably assigns the local 'history_ptr' by name, so
     * that local must keep its name - confirm against the macro. */
    mtce_hbs_cluster_history_type * history_ptr = NULL ;
    GET_CLUSTER_HISTORY_PTR(ctrl.cluster, ctrl.this_controller ,n);
    if ( history_ptr == NULL )
    {
        if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
        {
            /* Should never happen but if it does then log without flooding */
            wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
                             "Unable to store history beyond %d ",
                             ctrl.cluster.histories );
            return ;
        }
        else
        {
            /* Adding a new history slot. */
            history_ptr = &ctrl.cluster.history[ctrl.cluster.histories] ;
            ctrl.cluster.histories++ ;
            ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
            history_ptr->controller = ctrl.this_controller ;
            history_ptr->network = n ;

            /* Log new network history as its being started. */
            ilog ("controller-%d %s network history add",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str());
        }
    }

    /* Manage storage-0 status. */
    if ( ctrl.cluster.storage0_enabled )
    {
        /* Handle storage-0 status change from not responding to responding. */
        if ( storage_0_responding == true )
        {
            if (history_ptr->storage0_responding == false)
            {
                history_ptr->storage0_responding = true ;
                ilog ("controller-%d %s heartbeat ; storage-0 is ok",
                       ctrl.this_controller,
                       hbs_cluster_network_name(n).c_str());
            }
            /* A single response clears the back to back miss count. */
            if (ctrl.storage_0_not_responding_count[n])
                ctrl.storage_0_not_responding_count[n] = 0 ;
        }
        /* Count the storage-0 not responding case for this network. */
        else
        {
            ctrl.storage_0_not_responding_count[n]++ ;
            if ( ctrl.storage_0_not_responding_count[n] == 2 )
            {
                ilog ("controller-%d %s heartbeat ; storage-0 has 2 misses",
                       ctrl.this_controller,
                       hbs_cluster_network_name(n).c_str() );
            }
        }

        /* Handle storage-0 status change from responding to not responding. */
        if (( history_ptr->storage0_responding == true ) &&
            ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
        {
            history_ptr->storage0_responding = false ;
            ilog ("controller-%d %s heartbeat ; storage-0 is not responding",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str() );
        }
    }
    else
    {
        /* Typical path for storage-0 disabled or normal non-storage system case */
        if ( history_ptr->storage0_responding == true )
            history_ptr->storage0_responding = false ;

        /* Handle clearing threshold count when storage-0 is not enabled. */
        if ( ctrl.storage_0_not_responding_count[n] )
            ctrl.storage_0_not_responding_count[n] = 0 ;
    }

    /*
     * Manage the history entry index.
     *
     * 'this_entry_index' is the oldest entry ; the one overwritten now.
     * 'last_entry_index' is the entry before it in the rotational fifo ;
     * it is used below to detect a change since the previous update.
     */
    unsigned short this_entry_index = history_ptr->oldest_entry_index ;
    unsigned short last_entry_index ;
    if ( this_entry_index == 0 )
    {
        /* Go to the end of the array. */
        last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
    }
    else
    {
        /* Otherwise, the previous index in the array */
        last_entry_index = this_entry_index - 1 ;
    }

    /* Update the history with this data. */
    history_ptr->entry[this_entry_index].hosts_enabled    = ctrl.monitored_hosts ;
    history_ptr->entry[this_entry_index].hosts_responding = responding_hosts ;

    if (( history_ptr->entry[this_entry_index].hosts_enabled !=
          history_ptr->entry[last_entry_index].hosts_enabled ) ||
        ( history_ptr->entry[this_entry_index].hosts_responding !=
          history_ptr->entry[last_entry_index].hosts_responding))
    {
        /* Only log on change events. */
        if ( history_ptr->entry[this_entry_index].hosts_enabled ==
             history_ptr->entry[this_entry_index].hosts_responding )
        {
            ilog ("controller-%d %s cluster of %d is healthy",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str(),
                   history_ptr->entry[this_entry_index].hosts_enabled);
        }
        else
        {
            ilog ("controller-%d %s cluster of %d with %d responding",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str(),
                   history_ptr->entry[this_entry_index].hosts_enabled,
                   history_ptr->entry[this_entry_index].hosts_responding);
        }
    }

    /* Increment the entries count till it reaches the max. */
    if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
        history_ptr->entries++ ;

    /* Manage the next entry update index ; aka the oldest index. */
    if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
        history_ptr->oldest_entry_index = 0 ;
    else
        history_ptr->oldest_entry_index++ ;

    /* clear the log throttle if we are updating history ok. */
    ctrl.log_throttle = 0 ;
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_append
|
||||
*
|
||||
* Description : Add this controller's cluster info to this pulse
|
||||
* request message.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_append ( hbs_message_type & msg )
{
    unsigned short c = ctrl.this_controller ;

    CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);

    /* Cluster header. */
    msg.cluster.magic_number = ctrl.cluster.magic_number ;
    msg.cluster.version      = ctrl.cluster.version ;
    msg.cluster.revision     = ctrl.cluster.revision ;

    /* Cluster global data. */
    msg.cluster.histories        = ctrl.cluster.histories ;
    msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;
    msg.cluster.period_msec      = ctrl.cluster.period_msec ;

    int copy_bytes = BYTES_IN_CLUSTER_VAULT(ctrl.monitored_networks);

    clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
            c, ctrl.monitored_networks, ctrl.cluster.histories, copy_bytes );

    /* Copy the cluster history into the message.
     * NOTE(review): the copy starts at history[c], i.e. it is indexed by
     * controller number, and its length comes from BYTES_IN_CLUSTER_VAULT
     * which is defined outside this file view. Confirm that the source
     * index and the length stay within the history array. */
    memcpy( &msg.cluster.history[0], &ctrl.cluster.history[c], copy_bytes);
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_unused_bytes
|
||||
*
|
||||
* Description : Used to set how much data to send in the heartbeat pulse
|
||||
* requests.
|
||||
*
|
||||
* Returns : The number of bytes that are not used in the full
|
||||
* history array cluster structure.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
unsigned short hbs_cluster_unused_bytes ( void )
|
||||
{
|
||||
if ( ctrl.cluster.histories <= MTCE_HBS_MAX_HISTORY_ELEMENTS )
|
||||
{
|
||||
unsigned short tmp = MTCE_HBS_MAX_HISTORY_ELEMENTS - ctrl.cluster.histories ;
|
||||
return((unsigned short)(sizeof(mtce_hbs_cluster_history_type)*tmp)) ;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_send
|
||||
*
|
||||
* Description: Send the cluster vault to SM.
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/* NOTE: All code wrapped in this directive will be removed once
|
||||
* active/active heartbeating is delivered in next update */
|
||||
#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
{

#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

    /* To assist SM with duplex integration ...
     *
     * This code emulates heartbeat redundancy by duplicating
     * controller history up to the number of provisioned
     * controllers until active-active heartbeat is delivered.
     */
    int  peer_controller ;
    bool copy_cluster = false ;
    if ( ctrl.this_controller == 0 )
    {
        peer_controller = 1 ;
        if ( ctrl.controller_1_enabled )
        {
            copy_cluster = true ;
        }
    }
    else
    {
        peer_controller = 0 ;
        if ( ctrl.controller_0_enabled )
        {
            copy_cluster = true ;
        }
    }

    /* Number of emulated peer histories actually added below ;
     * exactly this many are removed again after the send. */
    int copied = 0 ;
    if ( copy_cluster )
    {
        int networks = ctrl.cluster.histories ;
        for ( int n = 0 ; n < networks ; n++ )
        {
            /* Never write beyond the end of the history array. */
            if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
            {
                wlog ("Unable to store history beyond %d ",
                       ctrl.cluster.histories );
                break ;
            }

            /* copy this controller history to create peer controller */
            ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ;

            /* update the controller */
            ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ;
            ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
            ctrl.cluster.histories++ ;
            copied++ ;
        }
    }

#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

    /* Stamp the vault with the caller's request id. */
    ctrl.cluster.reqid = (unsigned short)reqid ;
    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
    {
        /* Only send the used portion of the vault. */
        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
        if ( bytes <= 0 )
        {
            elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                   bytes , errno, strerror(errno));
        }
        else
        {
            ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
            hbs_cluster_dump ( ctrl.cluster );
        }
    }

#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

    /* Remove the emulated peer controller histories again, last one
     * first, so the vault only holds this controller's data. */
    for ( ; copied > 0 ; copied-- )
    {
        hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]);
        ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
        ctrl.cluster.histories-- ;
    }

#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

}
|
||||
|
||||
/* Convenience overload ; logs this module's own working cluster vault
 * by forwarding it to the three argument hbs_cluster_log. */
void hbs_cluster_log ( string & hostname, string prefix )
{
    hbs_cluster_log ( hostname,
                      ctrl.cluster,
                      prefix );
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Active Active Heartbeating and Debug Member Functions
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_cmp
|
||||
*
|
||||
* Description : Performs a sanity check over the cluster structure.
|
||||
*
|
||||
* Assumptions : Debug tool, not called at runtime.
|
||||
*
|
||||
* Returns : PASS or FAIL
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
int hbs_cluster_cmp( hbs_message_type & msg )
|
||||
{
|
||||
if ( msg.cluster.version < ctrl.cluster.version )
|
||||
{
|
||||
wlog ("Unexpected version (%d:%d)",
|
||||
msg.cluster.version, ctrl.cluster.version );
|
||||
}
|
||||
else if ( msg.cluster.revision != ctrl.cluster.revision )
|
||||
{
|
||||
wlog ("Unexpected revision (%d:%d)",
|
||||
msg.cluster.revision, ctrl.cluster.revision );
|
||||
}
|
||||
else if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
|
||||
{
|
||||
wlog ("Unexpected magic number (%d:%d)",
|
||||
msg.cluster.magic_number, ctrl.cluster.magic_number );
|
||||
}
|
||||
else if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
|
||||
{
|
||||
wlog ("Cluster Heartbeat period delta (%d:%d)",
|
||||
msg.cluster.period_msec, ctrl.cluster.period_msec );
|
||||
}
|
||||
else if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
|
||||
{
|
||||
wlog ("Cluster storage0 enabled state delta (%d:%d)",
|
||||
msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
|
||||
}
|
||||
else
|
||||
{
|
||||
return (PASS);
|
||||
}
|
||||
return (FAIL);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_save
|
||||
*
|
||||
* Descrition : Copies the other controllers information from msg into
|
||||
* the cluster.
|
||||
*
|
||||
* NOTE: Does not do that right now.
|
||||
*
|
||||
* Assumptions : Place holder until active/active heartbeating is implemented.
|
||||
*
|
||||
* Returns : PASS or FAIL
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
int hbs_cluster_save ( string & hostname,
|
||||
mtce_hbs_network_enum network,
|
||||
hbs_message_type & msg )
|
||||
{
|
||||
// clog ("Add cluster info from peer controller");
|
||||
if ( ctrl.monitored_hosts )
|
||||
{
|
||||
/* compare cluster info and log deltas */
|
||||
// hbs_cluster_cmp( msg );
|
||||
UNUSED(msg);
|
||||
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
|
||||
}
|
||||
return (PASS);
|
||||
}
|
86
mtce/src/heartbeat/hbsCluster.h
Normal file
86
mtce/src/heartbeat/hbsCluster.h
Normal file
@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
|
||||
*
|
||||
*************************************************************************
|
||||
*
|
||||
* This module provides API for the hbsAgent service to call to
|
||||
* collect, store and send heartbeat cluster information to SM
|
||||
* upon request. See hbsCluster.h for formal API.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
#ifndef __HBSCLUSTER_H__
|
||||
#define __HBSCLUSTER_H__
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "mtceHbsCluster.h" /* for ... the public API */
|
||||
|
||||
/****************************************************************************
 *
 * Name        : BYTES_IN_CLUSTER_VAULT
 *
 * Description : Calculates the number of bytes in the cluster vault based on
 *               the number of valid history array elements included.
 *
 *               The result is the full size of mtce_hbs_cluster_type minus
 *               the size of the history elements that are not included.
 *
 * Parameters  : e - number of history elements included in the vault.
 *                   May be any integer expression ; it is parenthesized in
 *                   the expansion so that operator precedence cannot change
 *                   the result.
 *
 ***************************************************************************/

#define BYTES_IN_CLUSTER_VAULT(e) \
    (sizeof(mtce_hbs_cluster_type)-(sizeof(mtce_hbs_cluster_history_type)*(MTCE_HBS_MAX_HISTORY_ELEMENTS-(e))))
|
||||
|
||||
/****************************************************************************
 *
 * Name        : CHECK_CTRL_NTWK_PARMS
 *
 * Description : Parameter check for a controller / network pair.
 *               When c is greater than MTCE_HBS_MAX_CONTROLLERS or n is
 *               greater than MTCE_HBS_NETWORKS the pair is logged with slog
 *               and the calling function returns.
 *
 *               Expands to a bare 'return ;' so it can only be used inside
 *               functions that return void.
 *
 *               NOTE(review): the checks use '>' so a value equal to the
 *               limit is accepted. If c and n are used as zero based array
 *               indexes by callers this likely needs to be '>=' ; confirm
 *               against the callers before changing.
 *
 * Parameters  : c - controller number
 *               n - network number
 *
 ***************************************************************************/

#define CHECK_CTRL_NTWK_PARMS(c,n) \
    if (( (c) > MTCE_HBS_MAX_CONTROLLERS ) || \
        ( (n) > MTCE_HBS_NETWORKS )) \
    { \
        slog ("Invalid parameter: %d:%d", (c), (n)); \
        return ; \
    }
|
||||
|
||||
/****************************************************************************
 *
 * Name        : GET_CLUSTER_HISTORY_PTR
 *
 * Description : Searches the populated history elements of a cluster for
 *               the element whose controller and network fields match c
 *               and n. On a match the caller's 'history_ptr' variable is
 *               set to point to that element.
 *
 *               The caller must have a pointer variable named 'history_ptr'
 *               in scope. It is left untouched when nothing matches, so it
 *               should be initialized before the macro is used. When more
 *               than one element matches, the last match wins.
 *
 * Parameters  : cluster - the cluster structure to search (not a pointer)
 *               c       - controller number to match
 *               n       - network number to match
 *
 ***************************************************************************/

#define GET_CLUSTER_HISTORY_PTR(cluster, c,n) \
    for ( int hist_idx_ = 0 ; hist_idx_ < (cluster).histories ; hist_idx_++ ) \
    { \
        if (( (cluster).history[hist_idx_].controller == (c) ) && \
            ( (cluster).history[hist_idx_].network == (n) )) \
        { \
            history_ptr = &(cluster).history[hist_idx_] ; \
        } \
    }
|
||||
|
||||
|
||||
/****************************************************************************
 *
 * Name        : SET_CONTROLLER_HOSTNAME
 *
 * Description : Assigns the caller's 'controller' variable from a controller
 *               number ; 0, 1 and 2 select CONTROLLER_0, CONTROLLER_1 and
 *               CONTROLLER_2, anything else selects "unknown".
 *
 *               The caller must have a variable named 'controller' in scope
 *               and must terminate the invocation with a semicolon.
 *
 * Parameters  : c - controller number ; may be any integer expression
 *
 ***************************************************************************/

#define SET_CONTROLLER_HOSTNAME(c) \
    do \
    { \
        if ( (c) == 0 ) \
            controller = CONTROLLER_0 ; \
        else if ( (c) == 1 ) \
            controller = CONTROLLER_1 ; \
        else if ( (c) == 2 ) \
            controller = CONTROLLER_2 ; \
        else \
            controller = "unknown" ; \
    } while (0)
|
||||
|
||||
#endif // __HBSCLUSTER_H__
|
346
mtce/src/heartbeat/hbsUtil.cpp
Normal file
346
mtce/src/heartbeat/hbsUtil.cpp
Normal file
@ -0,0 +1,346 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* @file Maintenance Heartbeat Utilities Module
|
||||
*
|
||||
*************************************************************************
|
||||
*
|
||||
* This module provides heartbeat utilities that are common to both
|
||||
* hbsAgent and hbsClient.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "daemon_common.h" /* common daemon constructs and definitions */
|
||||
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
|
||||
|
||||
/* hbs_cluster_log utility support. log control arrays.
 * Both are indexed by history element and zeroed by hbs_utils_init. */
|
||||
bool first_log[MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* set once hbs_cluster_log has logged this history a first time */
|
||||
bool was_diff [MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* set by hbs_cluster_log when an entry differed from the one before it */
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_utils_init
|
||||
*
|
||||
* Description : Module Init function
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_utils_init ( void )
|
||||
{
|
||||
MEMSET_ZERO ( first_log );
|
||||
MEMSET_ZERO ( was_diff );
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_history_init
|
||||
*
|
||||
* Description : Initialize a cluster history element.
|
||||
*
|
||||
* Parameters : Reference to a mtce_hbs_cluster_history_type (history element)
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/* Resets one history element and then records the entry array capacity. */
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history )
|
||||
{
|
||||
/* presumably zeroes the whole element - confirm against the MEMSET_ZERO definition */
MEMSET_ZERO(history);
|
||||
/* must follow the reset above so that the capacity is not wiped */
history.entries_max = MTCE_HBS_HISTORY_ENTRIES ;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_history_clear
|
||||
*
|
||||
* Description : Clear all history in the cluster vault.
|
||||
*
|
||||
* Parameters : mtce_hbs_cluster_type instance : the vault.
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_history_clear ( mtce_hbs_cluster_type & cluster )
|
||||
{
|
||||
if ( cluster.histories )
|
||||
{
|
||||
for ( int h = 0 ; h < cluster.histories ; h++ )
|
||||
hbs_cluster_history_init ( cluster.history[h] ) ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : cluster_network_name
|
||||
*
|
||||
* Description : converts what is a heartbeat cluster network id to
|
||||
* network name.
|
||||
*
|
||||
* Parameters : network id
|
||||
*
|
||||
* Returns : network name as a string
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
string hbs_cluster_network_name ( mtce_hbs_network_enum network )
|
||||
{
|
||||
switch ( network )
|
||||
{
|
||||
case MTCE_HBS_NETWORK_MGMT:
|
||||
return ("Mgmnt");
|
||||
case MTCE_HBS_NETWORK_INFRA:
|
||||
return ("Infra");
|
||||
|
||||
#ifdef MONITORED_OAM_NETWORK
|
||||
case MTCE_HBS_NETWORK_OAM:
|
||||
return ("Oam");
|
||||
#endif
|
||||
|
||||
default:
|
||||
slog ("invalid network enum (%d)", network );
|
||||
return ("unknown");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_log
|
||||
*
|
||||
* Description : logs changes to the heartbeat cluster
|
||||
*
|
||||
* Parameters : The heartbeat cluster structure
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_log ( string & hostname,
|
||||
mtce_hbs_cluster_type & cluster,
|
||||
string log_prefix )
|
||||
{
|
||||
// bool want_log = false ;
|
||||
|
||||
clog1 ("log %d histories", cluster.histories );
|
||||
for ( int h = 0 ; h < cluster.histories ; h++ )
|
||||
{
|
||||
if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
|
||||
{
|
||||
#define MAX_CLUSTER_LINE_LEN 100
|
||||
#define MAX_ENTRY_STR_LEN 10 /* "9999:9999 " */
|
||||
mtce_hbs_cluster_entry_type e = { 0, 0 } ;
|
||||
char str[MAX_CLUSTER_LINE_LEN] ;
|
||||
string line = "";
|
||||
int start = 0 ;
|
||||
int stop = 0 ;
|
||||
bool newline = false ;
|
||||
bool logit = false ;
|
||||
bool first = false ;
|
||||
string controller = "" ;
|
||||
|
||||
mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
|
||||
|
||||
clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
history_ptr->entries,
|
||||
history_ptr->controller,
|
||||
log_prefix.c_str());
|
||||
|
||||
|
||||
/* Manage local this_index for log display.
|
||||
* Display oldest to newest ; left to right
|
||||
*
|
||||
* */
|
||||
int this_index = history_ptr->oldest_entry_index ;
|
||||
for ( int count = 0 ; count < history_ptr->entries ; count++ )
|
||||
{
|
||||
if (( line.length() + MAX_ENTRY_STR_LEN ) >=
|
||||
MAX_CLUSTER_LINE_LEN )
|
||||
{
|
||||
newline = true ;
|
||||
}
|
||||
|
||||
#ifdef WANT_MINIMAL_LOGS
|
||||
/* TODO: enable in final update */
|
||||
if (( first_log[h] == true ) && ( newline == false ) &&
|
||||
( history_ptr->entry[this_index].hosts_enabled ==
|
||||
history_ptr->entry[this_index].hosts_responding ))
|
||||
{
|
||||
line.append(". ");
|
||||
continue ;
|
||||
}
|
||||
#endif
|
||||
|
||||
// want_log = true ;
|
||||
|
||||
if ( count == 0 )
|
||||
{
|
||||
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
|
||||
history_ptr->entry[this_index].hosts_enabled,
|
||||
history_ptr->entry[this_index].hosts_responding ); // , this_index );
|
||||
line.append (str);
|
||||
str[0] = '\0' ;
|
||||
}
|
||||
//#ifdef WANT_DOTS
|
||||
else if (( history_ptr->entry[this_index].hosts_enabled ==
|
||||
e.hosts_enabled ) &&
|
||||
( history_ptr->entry[this_index].hosts_responding ==
|
||||
e.hosts_responding ))
|
||||
{
|
||||
line.append(". ");
|
||||
}
|
||||
//#endif
|
||||
else
|
||||
{
|
||||
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
|
||||
history_ptr->entry[this_index].hosts_enabled,
|
||||
history_ptr->entry[this_index].hosts_responding ); // , this_index );
|
||||
line.append (str);
|
||||
str[0] = '\0' ;
|
||||
logit = true ;
|
||||
was_diff[h] = true ;
|
||||
}
|
||||
if (( logit == false ) && ( first_log[h] == false ))
|
||||
{
|
||||
first_log[h] = true ;
|
||||
logit = true ;
|
||||
}
|
||||
stop++ ;
|
||||
if ( newline == true )
|
||||
{
|
||||
if ( logit )
|
||||
{
|
||||
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
|
||||
if ( hostname == controller )
|
||||
{
|
||||
clog ("%s view %s %s %02d..%02d: %s,",
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
clog ("%s view from %s %s %s %02d..%02d: %s,",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
}
|
||||
start = stop + 1 ;
|
||||
line.clear();
|
||||
first = true ;
|
||||
newline = false ;
|
||||
}
|
||||
e = history_ptr->entry[this_index] ;
|
||||
|
||||
/* manage index tracking */
|
||||
if ( this_index == (MTCE_HBS_HISTORY_ENTRIES-1))
|
||||
this_index = 0 ;
|
||||
else
|
||||
this_index++ ;
|
||||
}
|
||||
if (( newline == false ) && ( line.length() ))
|
||||
{
|
||||
// ERIC
|
||||
if (( logit == false ) && ( was_diff[h] == true ))
|
||||
{
|
||||
logit = true ;
|
||||
was_diff[h] = false ;
|
||||
}
|
||||
|
||||
if ( logit )
|
||||
{
|
||||
if ( first )
|
||||
{
|
||||
clog ("............ %s %s %02d..%02d: %s",
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
|
||||
if ( hostname == controller )
|
||||
{
|
||||
clog ("%s view %s %s %02d..%02d: %s",
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
clog ("%s view from %s %s %s %02d..%02d: %s",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(), /* Infra <- */
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
was_diff[h] = false ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* name : hbs_cluster_dump
|
||||
*
|
||||
* Description: Formatted dump of the vault contents to the log file.
|
||||
*
|
||||
***************************************************************************/
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
|
||||
syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
|
||||
vault.version,
|
||||
vault.revision,
|
||||
vault.period_msec,
|
||||
vault.reqid,
|
||||
vault.storage0_enabled ? "enabled" : "disabled",
|
||||
vault.histories,
|
||||
vault.bytes );
|
||||
for ( int h = 0 ; h < vault.histories ; h++ )
|
||||
{
|
||||
#define MAX_LINE_LEN (500)
|
||||
char str[MAX_LINE_LEN] ;
|
||||
int i = 0 ;
|
||||
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
|
||||
{
|
||||
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
|
||||
vault.history[h].oldest_entry_index==e ? '>' : ' ',
|
||||
vault.history[h].entry[e].hosts_enabled,
|
||||
vault.history[h].entry[e].hosts_responding);
|
||||
i = strlen(str) ;
|
||||
}
|
||||
syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
|
||||
vault.history[h].controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
|
||||
vault.storage0_enabled ? "y" : "n",
|
||||
vault.history[h].storage0_responding ? "y" : "n",
|
||||
vault.history[h].entries_max,
|
||||
vault.history[h].entries,
|
||||
str);
|
||||
}
|
||||
// dump_memory ( &vault, 16, vault.bytes );
|
||||
}
|
||||
|
||||
|
109
mtce/src/heartbeat/mtceHbsCluster.h
Normal file
109
mtce/src/heartbeat/mtceHbsCluster.h
Normal file
@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
|
||||
*
|
||||
*************************************************************************
|
||||
*
|
||||
* This module provides API for the hbsAgent service to call to
|
||||
* collect, store and send heartbeat cluster information to SM
|
||||
* upon request. See hbsCluster.h for formal API.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
#ifndef __MTCEHBSCLUSTER_H__
|
||||
#define __MTCEHBSCLUSTER_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
/**************************************************************
|
||||
* Implementation Structure
|
||||
*************************************************************/
|
||||
|
||||
#define MTCE_HBS_CLUSTER_VERSION (1)
|
||||
#define MTCE_HBS_CLUSTER_REVISION (0)
|
||||
#define MTCE_HBS_MAGIC_NUMBER (0x5aa5)
|
||||
|
||||
/* Heartbeat network ids. The id is stored in the 4 bit 'network' field of
 * mtce_hbs_cluster_history_type. */
typedef enum
|
||||
{
|
||||
MTCE_HBS_NETWORK_MGMT = 0,
|
||||
MTCE_HBS_NETWORK_INFRA = 1,
|
||||
#ifdef MONITORED_OAM_NETWORK
|
||||
MTCE_HBS_NETWORK_OAM,
|
||||
#endif
|
||||
MTCE_HBS_NETWORKS /* number of networks ; not a network id. Sizes MTCE_HBS_MAX_HISTORY_ELEMENTS */
|
||||
} mtce_hbs_network_enum ;
|
||||
|
||||
#ifdef THREE_CONTROLLER_SYSTEM
|
||||
#define MTCE_HBS_MAX_CONTROLLERS (3)
|
||||
#else
|
||||
#define MTCE_HBS_MAX_CONTROLLERS (2)
|
||||
#endif
|
||||
|
||||
#ifdef MONITORED_OAM_NETWORK
|
||||
#define MTCE_HBS_MAX_NETWORKS (3)
|
||||
#else
|
||||
#define MTCE_HBS_MAX_NETWORKS (2)
|
||||
#endif
|
||||
|
||||
/* value of 20 at 100 msec period is 2 seconds of history */
|
||||
#define MTCE_HBS_HISTORY_ENTRIES (20)
|
||||
|
||||
/* maximum number of history elements permitted in a cluster history summary */
|
||||
#define MTCE_HBS_MAX_HISTORY_ELEMENTS ((MTCE_HBS_MAX_CONTROLLERS)*(MTCE_HBS_NETWORKS))
|
||||
|
||||
#ifndef ALIGN_PACK
|
||||
#define ALIGN_PACK(x) __attribute__((packed)) x
|
||||
#endif
|
||||
|
||||
/* A single element of Heartbeat Cluster History for one heartbeat period */
|
||||
typedef struct
|
||||
{
|
||||
unsigned short hosts_enabled ; /* # of hosts being hb monitored */
|
||||
unsigned short hosts_responding ; /* # of hosts that responded to hb */
|
||||
} ALIGN_PACK(mtce_hbs_cluster_entry_type);
|
||||
|
||||
|
||||
/* Heartbeat Cluster History for one controller on one monitored network */
|
||||
typedef struct
|
||||
{
|
||||
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
|
||||
unsigned short network :4 ; /* see mtce_hbs_network_enum */
|
||||
unsigned short reserved_bits :7 ; /* future - initialized to 0 */
|
||||
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
|
||||
unsigned short entries ; /* # of valid values in .entry */
|
||||
unsigned short entries_max ; /* max size of the entry array */
|
||||
unsigned short oldest_entry_index ; /* the oldest entry in the array */
|
||||
|
||||
/* historical array of entries for a specific network */
|
||||
mtce_hbs_cluster_entry_type entry [MTCE_HBS_HISTORY_ENTRIES] ;
|
||||
|
||||
} ALIGN_PACK(mtce_hbs_cluster_history_type) ;
|
||||
|
||||
/* Heartbeat Cluster History for all monitored networks of all Controllers */
|
||||
typedef struct
|
||||
{
|
||||
/* Header - Static Data - 4 bytes */
|
||||
unsigned char version ; /* public API MTCE_HBS_CLUSTER_VERSION */
|
||||
unsigned char revision ; /* public API MTCE_HBS_CLUSTER_REVISION */
|
||||
unsigned short magic_number ; /* public API MTCE_HBS_MAGIC_NUMBER */
|
||||
|
||||
/* Control - Dynamic Data - 8 bytes */
|
||||
unsigned short reqid ; /* added from SM cluster request */
|
||||
unsigned short period_msec ; /* heartbeat period in milliseconds */
|
||||
unsigned short bytes ; /* total struct size self check */
|
||||
unsigned char storage0_enabled; /* bool containing true or false */
|
||||
unsigned char histories ; /* How many history elements follow */
|
||||
|
||||
/* Array of Cluster History
 *
 * - histories above specifies how many
 *   elements of this array are populated.
 */
|
||||
mtce_hbs_cluster_history_type history [MTCE_HBS_MAX_HISTORY_ELEMENTS] ;
|
||||
|
||||
} ALIGN_PACK(mtce_hbs_cluster_type) ;
|
||||
|
||||
#endif // __MTCEHBSCLUSTER_H__
|
@ -23,6 +23,7 @@ SRCS += mtcKeyApi.cpp
|
||||
SRCS += mtcCmdHdlr.cpp
|
||||
SRCS += mtcNodeMnfa.cpp
|
||||
SRCS += mtcVimApi.cpp
|
||||
SRCS += mtcStubs.cpp
|
||||
|
||||
COMPUTE_OBJS = mtcNodeComp.o
|
||||
COMPUTE_OBJS += mtcCompMsg.o
|
||||
|
@ -1935,8 +1935,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
* the host has not reset yet we have disabled services
|
||||
* then now we need to reset the host to prevent VM duplication
|
||||
* by forcing a full enable */
|
||||
if (( node_ptr->uptime_save != 0 ) &&
|
||||
( node_ptr->uptime >= node_ptr->uptime_save ))
|
||||
if ((( node_ptr->uptime_save != 0 ) &&
|
||||
( node_ptr->uptime >= node_ptr->uptime_save )) ||
|
||||
(( node_ptr->uptime_save == 0 ) &&
|
||||
( node_ptr->uptime > MTC_MINS_15 )))
|
||||
{
|
||||
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
|
17
mtce/src/maintenance/mtcStubs.cpp
Normal file
17
mtce/src/maintenance/mtcStubs.cpp
Normal file
@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Maintenance Agent Stubs
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "nodeClass.h" /* The main link class */
|
||||
|
||||
/* Empty stub ; takes no arguments and does nothing.
 * NOTE(review): presumably present to satisfy a reference to hbs_cluster_log
 * from the maintenance agent build ; confirm it is still needed. */
void hbs_cluster_log ( void ) { }
|
40
mtce/src/scripts/hbs-query
Executable file
40
mtce/src/scripts/hbs-query
Executable file
@ -0,0 +1,40 @@
|
||||
#!/bin/bash

# Copyright (c) 2013-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Debug utility that sends a heartbeat cluster info query.
#
# This script sends a json string containing a "cluster_info" request,
# with origin "sm" and service "heartbeat", as a UDP datagram over the
# loopback interface to the port configured as sm_server_port in
# /etc/mtc.ini.
#
# Usage: hbs-query [reqid]
#
#   reqid - request id put in the query ; defaults to 123
#
# Linux Standard Base (LSB) Error Codes
RETVAL=0
GENERIC_ERROR=1
INVALID_ARGS=2
UNSUPPORTED_FEATURE=3
NOT_INSTALLED=5
NOT_RUNNING=7

PROTOCOL="UDP4-DATAGRAM"
ADDRESS="127.0.0.1"
CONFIG_FILE="/etc/mtc.ini"

socat_exec=$(which socat 2> /dev/null)

# quoted so that the test is well formed for an empty value
if [ -z "${socat_exec}" ] ; then
    logger "Error: $0 cannot find socat exec"
    exit ${NOT_INSTALLED}
fi

# use the caller supplied request id if there is one
reqid=${1:-123}

port=$(awk '{if ($1 == "sm_server_port") { print $3; }}' "${CONFIG_FILE}" 2> /dev/null)

# without a port there is no address to send the query to
if [ -z "${port}" ] ; then
    logger "Error: $0 cannot find sm_server_port in ${CONFIG_FILE}"
    exit ${GENERIC_ERROR}
fi

echo "{\"origin\":\"sm\", \"service\":\"heartbeat\", \"request\":\"cluster_info\", \"reqid\": $reqid }" | socat - ${PROTOCOL}:${ADDRESS}:${port}

# report a failed send to the caller rather than always exiting 0
if [ $? -ne 0 ] ; then
    logger "Error: $0 failed to send cluster_info request to port ${port}"
    RETVAL=${GENERIC_ERROR}
fi

exit ${RETVAL}
|
Loading…
x
Reference in New Issue
Block a user