Merge "Collectd+InfluxDb-RMON Replacement(ALL METRICS) P1"
This commit is contained in:
commit
4a4c540a3c
@ -421,11 +421,6 @@ install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-restart %{buildroot}/%{lo
|
||||
install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-start %{buildroot}/%{local_sbindir}/pmon-start
|
||||
install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-stop %{buildroot}/%{local_sbindir}/pmon-stop
|
||||
|
||||
# test tools
|
||||
install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp360 %{buildroot}/%{_sbindir}/show_hp360
|
||||
install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp380 %{buildroot}/%{_sbindir}/show_hp380
|
||||
install -m 755 %{_buildsubdir}/hwmon/scripts/show_quanta %{buildroot}/%{_sbindir}/show_quanta
|
||||
|
||||
# init script files
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/mtcClient %{buildroot}%{_sysconfdir}/init.d/mtcClient
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/hbsClient %{buildroot}%{_sysconfdir}/init.d/hbsClient
|
||||
@ -498,9 +493,6 @@ install -m 755 -d %{buildroot}%{_sysconfdir}/rmonapi.d
|
||||
install -m 755 -d %{buildroot}%{_sysconfdir}/rmonfiles.d
|
||||
install -m 755 -d %{buildroot}%{_sysconfdir}/rmon_interfaces.d
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/remotelogging_resource.conf %{buildroot}%{local_etc_rmond}/remotelogging_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cpu_resource.conf %{buildroot}%{local_etc_rmond}/cpu_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/memory_resource.conf %{buildroot}%{local_etc_rmond}/memory_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/filesystem_resource.conf %{buildroot}%{local_etc_rmond}/filesystem_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cinder_virtual_resource.conf %{buildroot}%{local_etc_rmond}/cinder_virtual_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/nova_virtual_resource.conf %{buildroot}%{local_etc_rmond}/nova_virtual_resource.conf
|
||||
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/oam_resource.conf %{buildroot}%{_sysconfdir}/rmon_interfaces.d/oam_resource.conf
|
||||
@ -676,10 +668,7 @@ install -m 755 -d %{buildroot}/var/run
|
||||
%{local_etc_logrotated}/rmon.logrotate
|
||||
%{_unitdir}/rmon.service
|
||||
|
||||
%{local_etc_rmond}/filesystem_resource.conf
|
||||
%{local_etc_rmond}/cpu_resource.conf
|
||||
%{local_etc_rmond}/remotelogging_resource.conf
|
||||
%{local_etc_rmond}/memory_resource.conf
|
||||
%{local_etc_rmond}/cinder_virtual_resource.conf
|
||||
%{local_etc_rmond}/nova_virtual_resource.conf
|
||||
|
||||
@ -713,10 +702,6 @@ install -m 755 -d %{buildroot}/var/run
|
||||
%{local_etc_logrotated}/hwmon.logrotate
|
||||
%{ocf_resourced}/platform/hwmon
|
||||
|
||||
%{_sbindir}/show_hp380
|
||||
%{_sbindir}/show_hp360
|
||||
%{_sbindir}/show_quanta
|
||||
|
||||
%{_sysconfdir}/init.d/hwmon
|
||||
%{local_bindir}/hwmond
|
||||
|
||||
|
@ -255,12 +255,38 @@ const char * get_mtcNodeCommand_str ( int cmd )
|
||||
}
|
||||
|
||||
|
||||
void print_mtc_message ( string hostname, int direction, mtc_message_type & msg , const char * iface, bool force )
|
||||
void print_mtc_message ( string hostname,
|
||||
int direction,
|
||||
mtc_message_type & msg,
|
||||
const char * iface,
|
||||
bool force )
|
||||
{
|
||||
/* Handle raw json string messages differently.
|
||||
* Those messages just have a json string that starts at the header */
|
||||
if ( msg.hdr[0] == '{' )
|
||||
{
|
||||
if ( force )
|
||||
{
|
||||
ilog ("%s %s (%s network) - %s\n",
|
||||
hostname.c_str(),
|
||||
direction ? "rx <-" : "tx ->" ,
|
||||
iface,
|
||||
msg.hdr);
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog1 ("%s %s (%s network) - %s\n",
|
||||
hostname.c_str(),
|
||||
direction ? "rx <-" : "tx ->" ,
|
||||
iface,
|
||||
msg.hdr);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
string str = "-" ;
|
||||
if ( msg.buf[0] )
|
||||
str = msg.buf ;
|
||||
|
||||
if ( force )
|
||||
{
|
||||
ilog ("%s %s %s (%s network) %d.%d %x:%x:%x.%x.%x.%x [%s] %s\n",
|
||||
|
@ -92,6 +92,9 @@ void daemon_exit ( void );
|
||||
#define NODE_HEALTHY (1)
|
||||
#define NODE_UNHEALTHY (2)
|
||||
|
||||
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
|
||||
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
|
||||
|
||||
#define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host")
|
||||
|
||||
/** Configuration Pass/Fail Flag File */
|
||||
@ -146,10 +149,6 @@ void daemon_exit ( void );
|
||||
|
||||
#define BM_DNSMASQ_FILENAME ((const char *)"dnsmasq.bmc_hosts")
|
||||
|
||||
/* Added for Centos */
|
||||
#define CENTOS_RELEASE_FILE ((const char *)"/etc/centos-release")
|
||||
#define SYSTEMD_SERVICE_FILE_DIR ((const char *)"/usr/lib/systemd/system")
|
||||
|
||||
#define THREAD_NAME__IPMITOOL ((const char *)("ipmitool"))
|
||||
|
||||
#define IPMITOOL_PATH_AND_FILENAME ((const char *)("/usr/bin/ipmitool"))
|
||||
@ -970,7 +969,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
|
||||
#define DEGRADE_MASK_SUBF 0x00000100
|
||||
#define DEGRADE_MASK_SM 0x00000200
|
||||
#define DEGRADE_MASK_CONFIG 0x00000400
|
||||
#define DEGRADE_MASK_RES2 0x00000800
|
||||
#define DEGRADE_MASK_COLLECTD 0x00000800
|
||||
#define DEGRADE_MASK_ENABLE 0x00001000
|
||||
#define DEGRADE_MASK_RES4 0x00002000
|
||||
#define DEGRADE_MASK_RES5 0x00004000
|
||||
|
@ -662,8 +662,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->log_throttle = 0 ;
|
||||
ptr->no_work_log_throttle = 0 ;
|
||||
|
||||
/* Clear the degrade control structs */
|
||||
ptr->degrade_mask = DEGRADE_MASK_NONE ;
|
||||
ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ;
|
||||
|
||||
ptr->degraded_resources_list.clear () ;
|
||||
ptr->pmond_ready = false ;
|
||||
ptr->rmond_ready = false ;
|
||||
@ -4561,16 +4561,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
|
||||
}
|
||||
|
||||
hbs_minor_clear ( node_ptr, iface );
|
||||
|
||||
/* Set the host available if the degrade mask is now
|
||||
* cleared and we are degraded */
|
||||
if ( node_ptr->degrade_mask == 0 )
|
||||
{
|
||||
if ( get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )
|
||||
{
|
||||
set_availStatus ( hostname, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
}
|
||||
else if ( this->mtcTimer_dor.tid )
|
||||
{
|
||||
@ -4602,12 +4592,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_HEARTBEAT_INFRA ;
|
||||
}
|
||||
}
|
||||
|
||||
/* No point in changing if we are already degraded */
|
||||
if ( nodeLinkClass::get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE )
|
||||
{
|
||||
set_availStatus ( hostname, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4621,7 +4605,7 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
|
||||
wlog ("%s Unknown host\n", hostname.c_str());
|
||||
return ;
|
||||
}
|
||||
|
||||
|
||||
/* is this a clear event ? */
|
||||
if ( clear_event == true )
|
||||
{
|
||||
@ -4639,15 +4623,15 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
|
||||
|
||||
else if ( node_ptr->hbs_minor[iface] != true )
|
||||
{
|
||||
mnfa_add_host ( node_ptr, iface );
|
||||
mnfa_add_host ( node_ptr, iface );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Interface to declare that a key service on the
|
||||
/** Interface to declare that a key service on the
|
||||
* specified host is up, running and ready */
|
||||
int nodeLinkClass::declare_service_ready ( string & hostname,
|
||||
int nodeLinkClass::declare_service_ready ( string & hostname,
|
||||
unsigned int service )
|
||||
{
|
||||
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
@ -4661,18 +4645,11 @@ int nodeLinkClass::declare_service_ready ( string & hostname,
|
||||
node_ptr->pmond_ready = true ;
|
||||
plog ("%s got pmond ready event\n", hostname.c_str());
|
||||
|
||||
/* A ready event means that pmond process has started.
|
||||
* Any previous history is gone. Cleanup mtce.
|
||||
/* A ready event means that pmond process has started.
|
||||
* Any previous history is gone. Cleanup mtce.
|
||||
* If there are still process issues on this host then
|
||||
* they will be reported again.*/
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
|
||||
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
|
||||
{
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
else if ( service == MTC_SERVICE_HWMOND )
|
||||
@ -4719,14 +4696,6 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname )
|
||||
if ( node_ptr->degrade_mask )
|
||||
{
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
|
||||
|
||||
if ( !node_ptr->degrade_mask )
|
||||
{
|
||||
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The only detectable inservice failures are process failures */
|
||||
@ -4735,15 +4704,65 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname )
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* This private API handles event messages from collectd */
|
||||
int nodeLinkClass::collectd_notify_handler ( string & hostname,
|
||||
string & resource,
|
||||
string & state )
|
||||
{
|
||||
int rc = PASS ;
|
||||
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr == NULL )
|
||||
{
|
||||
wlog ("%s Unknown Host\n", hostname.c_str());
|
||||
return (FAIL_UNKNOWN_HOSTNAME) ;
|
||||
}
|
||||
if ( state == "clear" )
|
||||
{
|
||||
if ( node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD )
|
||||
{
|
||||
ilog("%s collectd degrade state change ; assert -> clear (%s)",
|
||||
hostname.c_str(), resource.c_str());
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_COLLECTD ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog3("%s collectd degrade 'clear' request (%s)",
|
||||
hostname.c_str(), resource.c_str());
|
||||
}
|
||||
}
|
||||
else if ( state == "assert" )
|
||||
{
|
||||
if ( (node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD) == 0 )
|
||||
{
|
||||
ilog("%s collectd degrade state change ; clear -> assert (due to %s)",
|
||||
hostname.c_str(), resource.c_str());
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_COLLECTD ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog3("%s collectd degrade 'assert' request (%s)",
|
||||
hostname.c_str(), resource.c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s collectd degrade state unknown (%s)\n",
|
||||
hostname.c_str(),
|
||||
state.c_str());
|
||||
rc = FAIL_OPERATION ;
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/** Resource Monitor 'Clear' Event handler.
|
||||
*
|
||||
*
|
||||
* The resource specified will be removed from the
|
||||
* 'degraded_resources_list' for specified host.
|
||||
* if there are no other degraded resources or other
|
||||
* degraded services/reasons against that host then
|
||||
* this handler will clear the degrade state for the
|
||||
* specified host all together. */
|
||||
int nodeLinkClass::degrade_resource_clear ( string & hostname,
|
||||
int nodeLinkClass::degrade_resource_clear ( string & hostname,
|
||||
string & resource )
|
||||
{
|
||||
/* lr - Log Prefix Rmon */
|
||||
@ -4788,18 +4807,6 @@ int nodeLinkClass::degrade_resource_clear ( string & hostname,
|
||||
if ( node_ptr->degraded_resources_list.empty() )
|
||||
{
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_RESMON ; ;
|
||||
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
|
||||
{
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s Remains Degraded - Reason Mask:0x%08x\n",
|
||||
hostname.c_str(), node_ptr->degrade_mask );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -4874,30 +4881,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s
|
||||
|
||||
/* clear the mask regardless of host state */
|
||||
node_ptr->degrade_mask &= ~service_flag ;
|
||||
|
||||
/* only applies if host is unlocked-enabled-degraded and
|
||||
* there are no other degrade flags in the degrade mask */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
||||
{
|
||||
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* TODO: convert mask to a string of services and print that string */
|
||||
wlog ("%s remains degraded - degrade mask:0x%08x\n",
|
||||
hostname.c_str(),
|
||||
node_ptr->degrade_mask );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dlog ("%s unexpected degrade clear for '%s' service\n",
|
||||
hostname.c_str(), service.c_str() );
|
||||
}
|
||||
rc = PASS ;
|
||||
break ;
|
||||
}
|
||||
@ -4910,13 +4893,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s
|
||||
wlog ("%s degrade 'assert' from '%s'\n", hostname.c_str(), service.c_str() );
|
||||
node_ptr->degrade_mask |= service_flag ;
|
||||
}
|
||||
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
rc = PASS ;
|
||||
break ;
|
||||
}
|
||||
@ -5232,10 +5208,6 @@ int nodeLinkClass::degrade_process_raise ( string & hostname,
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_PMON ;
|
||||
wlog ("%s is degraded due to '%s' process failure\n", hostname.c_str(), process.c_str());
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
}
|
||||
}
|
||||
return (PASS);
|
||||
@ -5412,11 +5384,6 @@ int nodeLinkClass::degrade_resource_raise ( string & hostname,
|
||||
{
|
||||
dlog ("%s '%s' Degraded (again)\n", lr.c_str(), resource.c_str());
|
||||
}
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
@ -7039,9 +7006,6 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
|
||||
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
|
||||
|
||||
void autorecovery_clear ( string hostname )
|
||||
{
|
||||
string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
|
@ -585,6 +585,7 @@ private:
|
||||
|
||||
/* Bit mask of degrade reasons */
|
||||
unsigned int degrade_mask ;
|
||||
unsigned int degrade_mask_save ;
|
||||
|
||||
/** Process Monitor Daemon Flag Missing count */
|
||||
int pmon_missing_count ;
|
||||
@ -785,6 +786,7 @@ private:
|
||||
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int stress_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int bm_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int uptime_handler ( void );
|
||||
|
||||
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
@ -1731,6 +1733,11 @@ public:
|
||||
/** Calculates and returns the mnfa threshold based on enabled hosts */
|
||||
int mnfa_calculate_threshold ( string hostname );
|
||||
|
||||
/* collectd event handler */
|
||||
int collectd_notify_handler ( string & hostname,
|
||||
string & resource,
|
||||
string & state );
|
||||
|
||||
/*****************************************
|
||||
** Process Monitor Event Utilities API **
|
||||
*****************************************/
|
||||
|
@ -68,9 +68,9 @@ string daemon_read_file ( const char * filename );
|
||||
void daemon_logfile_close ( void );
|
||||
void daemon_logfile_open ( void );
|
||||
|
||||
void daemon_log ( const char * filename , const char * str );
|
||||
void daemon_log_value ( const char * filename , int val );
|
||||
void daemon_log_value ( const char * filename , const char * str, int val );
|
||||
int daemon_log ( const char * filename , const char * str );
|
||||
int daemon_log_value ( const char * filename , int val );
|
||||
int daemon_log_value ( const char * filename , const char * str, int val );
|
||||
|
||||
/* reads the first line of a file and if it contains a string
|
||||
* that represents an integer value then return it */
|
||||
|
@ -103,7 +103,7 @@ void daemon_healthcheck ( const char * sig )
|
||||
|
||||
#define BUFFER 1024
|
||||
|
||||
void daemon_log_value ( const char * filename , const char * str, int val )
|
||||
int daemon_log_value ( const char * filename , const char * str, int val )
|
||||
{
|
||||
FILE * file_stream = fopen (filename, "a" ) ;
|
||||
if ( file_stream != NULL )
|
||||
@ -111,10 +111,12 @@ void daemon_log_value ( const char * filename , const char * str, int val )
|
||||
fprintf ( file_stream,"%s %d\n", str, val );
|
||||
fflush (file_stream);
|
||||
fclose (file_stream);
|
||||
return (PASS);
|
||||
}
|
||||
return (FAIL_FILE_OPEN);
|
||||
}
|
||||
|
||||
void daemon_log_value ( const char * filename , int val )
|
||||
int daemon_log_value ( const char * filename , int val )
|
||||
{
|
||||
FILE * file_stream = fopen (filename, "w" ) ;
|
||||
if ( file_stream != NULL )
|
||||
@ -122,10 +124,12 @@ void daemon_log_value ( const char * filename , int val )
|
||||
fprintf ( file_stream,"%d\n", val );
|
||||
fflush (file_stream);
|
||||
fclose (file_stream);
|
||||
return (PASS);
|
||||
}
|
||||
return (FAIL_FILE_OPEN);
|
||||
}
|
||||
|
||||
void daemon_log ( const char * filename , const char * str )
|
||||
int daemon_log ( const char * filename , const char * str )
|
||||
{
|
||||
FILE * file_stream = fopen (filename, "a" ) ;
|
||||
if ( file_stream != NULL )
|
||||
@ -133,7 +137,9 @@ void daemon_log ( const char * filename , const char * str )
|
||||
fprintf ( file_stream,"%s\n", str );
|
||||
fflush (file_stream);
|
||||
fclose (file_stream);
|
||||
return (PASS);
|
||||
}
|
||||
return (FAIL_FILE_OPEN);
|
||||
}
|
||||
|
||||
/* reads the first line of a file and if it contains a string
|
||||
|
@ -191,8 +191,50 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
|
||||
|
||||
print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), false );
|
||||
|
||||
if ( msg.hdr[0] == '{' )
|
||||
{
|
||||
int rc1 ;
|
||||
string service ;
|
||||
|
||||
mlog1 ("%s\n", &msg.hdr[0] );
|
||||
|
||||
rc1 = jsonUtil_get_key_val(&msg.hdr[0],"service", service );
|
||||
if ( rc1 == PASS )
|
||||
{
|
||||
if ( service == "collectd_notifier" )
|
||||
{
|
||||
int rc1,rc2,rc3 ;
|
||||
string hostname,resource,state ;
|
||||
|
||||
rc1 = jsonUtil_get_key_val(&msg.hdr[0],"hostname", hostname );
|
||||
rc2 = jsonUtil_get_key_val(&msg.hdr[0],"resource", resource );
|
||||
rc3 = jsonUtil_get_key_val(&msg.hdr[0],"degrade", state );
|
||||
if ( rc1|rc2|rc3 )
|
||||
{
|
||||
elog ("failed to parse '%s' message\n", service.c_str());
|
||||
wlog ("... %s\n", &msg.hdr[0] );
|
||||
}
|
||||
else
|
||||
{
|
||||
obj_ptr->collectd_notify_handler ( hostname,
|
||||
resource,
|
||||
state );
|
||||
}
|
||||
}
|
||||
/* future service requests */
|
||||
else
|
||||
{
|
||||
wlog ("Unexpected service request: '%s'\n", service.c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog("Unexpected json message: %s\n", &msg.hdr[0] );
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for response messages */
|
||||
if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) )
|
||||
else if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) )
|
||||
{
|
||||
obj_ptr->set_cmd_resp ( hostname , msg ) ;
|
||||
}
|
||||
|
@ -74,6 +74,9 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
/* manage the host connected state and board management alarms */
|
||||
nodeLinkClass::bm_handler ( node_ptr );
|
||||
|
||||
/* manage host's degrade state */
|
||||
nodeLinkClass::degrade_handler ( node_ptr );
|
||||
|
||||
/*
|
||||
* Always run the offline handler
|
||||
*
|
||||
|
@ -5599,15 +5599,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
|
||||
/* handle coming out of the ADD in a degraded state */
|
||||
if (( node_ptr->degrade_mask != 0 ) &&
|
||||
(( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
|
||||
node_ptr->mtcAlive_gate = false ;
|
||||
node_ptr->addStage = MTC_ADD__DONE ;
|
||||
break;
|
||||
@ -6111,22 +6102,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
alarm_compute_clear ( node_ptr, false );
|
||||
}
|
||||
|
||||
/************************************************************
|
||||
* Manage host degrade based on degrade mask *
|
||||
***********************************************************/
|
||||
if (( node_ptr->degrade_mask == 0 ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
|
||||
/* expected degrade audit */
|
||||
else if (( node_ptr->degrade_mask ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -6461,12 +6436,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_SM ;
|
||||
|
||||
ilog ("%s sm degrade\n", node_ptr->hostname.c_str());
|
||||
|
||||
/* degrade the host if not already degraded */
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
}
|
||||
|
||||
/* Manage de-asserting degrade due to Software Management */
|
||||
@ -6477,16 +6446,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;
|
||||
|
||||
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
|
||||
|
||||
/* if the degrade mask is now clear then consider clearing the degrade state */
|
||||
if ( node_ptr->degrade_mask == 0 )
|
||||
{
|
||||
/* ... but only if we are degraded */
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
|
||||
@ -6502,10 +6461,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
|
||||
/* threshold is reached so raise the config alarm if it is not already raised */
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
@ -6554,6 +6509,30 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/************************************************************
|
||||
* Manage host degrade state based on degrade mask *
|
||||
* The availability state of degrade only applies when the *
|
||||
* host is unlocked-enabled. *
|
||||
***********************************************************/
|
||||
int nodeLinkClass::degrade_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
if (( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
|
||||
else if (( node_ptr->degrade_mask ) &&
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
|
||||
{
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
|
@ -1127,6 +1127,7 @@ void read_fs_file ( vector<string> & dynamic_resources )
|
||||
*****************************************************************************/
|
||||
void add_dynamic_fs_resource ( bool send_response )
|
||||
{
|
||||
#ifdef WANT_FS_MONITORING
|
||||
char resource[50];
|
||||
char temp_resource[50];
|
||||
char device [50];
|
||||
@ -1206,10 +1207,14 @@ void add_dynamic_fs_resource ( bool send_response )
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
if (send_response)
|
||||
{
|
||||
#ifdef WANT_FS_MONITORING
|
||||
ilog ("sending response to dynamic FS add, to the rmon client\n");
|
||||
#else
|
||||
ilog("dynamic filesystem monitoring moved to collectd\n");
|
||||
#endif
|
||||
/* let the rmon client know that we are done with the file */
|
||||
rmon_resource_response(_rmon_ctrl_ptr->clients);
|
||||
}
|
||||
@ -4650,6 +4655,8 @@ void rmon_service (rmon_ctrl_type * ctrl_ptr)
|
||||
|
||||
ilog ("registered clients: %d\n", _rmon_ctrl_ptr->clients);
|
||||
|
||||
#ifdef WANT_FS_MONITORING
|
||||
|
||||
/* Initialize the resource specific configuration */
|
||||
for (int j=0; j<_rmon_ctrl_ptr->resources; j++)
|
||||
{
|
||||
@ -4669,6 +4676,9 @@ void rmon_service (rmon_ctrl_type * ctrl_ptr)
|
||||
|
||||
/* add any dynamic resources from before */
|
||||
add_dynamic_fs_resource(false);
|
||||
#else
|
||||
ilog("static filesystem monitoring moved to collectd\n");
|
||||
#endif
|
||||
|
||||
/* Clear any stale dynamic alarms that can be caused by dynamic resources. */
|
||||
/* An alarm becomes stale, for example, if it was raised against a local volume group (lvg) and */
|
||||
|
@ -1,16 +0,0 @@
|
||||
[resource]
|
||||
resource = Platform CPU Usage
|
||||
debounce = 20 ; number of seconds to wait before degrade clear
|
||||
severity = critical ; minor, major, critical
|
||||
minor_threshold = 80 ; minor cpu utilization threshold percentage
|
||||
major_threshold = 90 ; major cpu utilization threshold percentage
|
||||
critical_threshold = 95 ; critical cpu utilization threshold percentage (use 101 if unused)
|
||||
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
|
||||
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
|
||||
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0
|
||||
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
|
||||
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
|
||||
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
|
||||
num_tries = 2 ; number of tries before the alarm is raised
|
||||
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
|
||||
percent = 1 ; Always use 1 for this resource (thresholds by percentage)
|
@ -1,16 +0,0 @@
|
||||
[resource]
|
||||
resource = Platform Filesystem Usage
|
||||
debounce = 20 ; number of seconds to wait before degrade clear
|
||||
severity = critical ; minor, major, critical
|
||||
minor_threshold = 70 ; minor filesystem utilization threshold percentage
|
||||
major_threshold = 80 ; major filesystem utilization threshold percentage
|
||||
critical_threshold = 90 ; critical filesystem utilization threshold percentage (use 101 if unused)
|
||||
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
|
||||
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
|
||||
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused)
|
||||
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
|
||||
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
|
||||
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
|
||||
num_tries = 2 ; number of tries before the alarm is raised
|
||||
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
|
||||
percent = 1 ; 1 for percentage used, 0 for absolute value (file system available in MiB) (default is 1)
|
@ -1,16 +0,0 @@
|
||||
[resource]
|
||||
resource = Platform Memory Usage
|
||||
debounce = 20 ; number of seconds to wait before degrade clear
|
||||
severity = critical ; minor, major, critical
|
||||
minor_threshold = 70 ; minor memory utilization threshold percentage
|
||||
major_threshold = 80 ; major memory utilization threshold percentage
|
||||
critical_threshold = 90 ; critical memory utilization threshold percentage (use 101 if unused)
|
||||
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
|
||||
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
|
||||
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused)
|
||||
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
|
||||
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
|
||||
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
|
||||
num_tries = 2 ; number of tries before the alarm is raised
|
||||
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
|
||||
percent = 1 ; 1 for percentage used, 0 for absolute value (memory available in MiB) (default is 1)
|
Loading…
Reference in New Issue
Block a user