From c038b1a9a76d57fed90378ffa6df8eb0828c314c Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 14 May 2018 16:12:16 -0400 Subject: [PATCH] Collectd+InfluxDb-RMON Replacement(ALL METRICS) P1 This update adds Maintenance support for receiving host degrade assert and clear messages from collectd. This update also disables platform memory, cpu and file system resource monitoring in the maintenance resource monitor process rmon. These disabled resources are now monitored by collectd and therefore should not be monitored by rmond any longer. Change-Id: I13fd033bb1d14f299dcb97fa80296641c958d0a9 Signed-off-by: Jack Ding --- mtce-common/centos/cgts-mtce-common.spec | 15 -- .../cgts-mtce-common-1.0/common/nodeBase.cpp | 30 +++- .../cgts-mtce-common-1.0/common/nodeBase.h | 9 +- .../cgts-mtce-common-1.0/common/nodeClass.cpp | 156 +++++++----------- .../cgts-mtce-common-1.0/common/nodeClass.h | 7 + .../daemon/daemon_common.h | 6 +- .../daemon/daemon_files.cpp | 12 +- .../maintenance/mtcCtrlMsg.cpp | 44 ++++- .../maintenance/mtcNodeFsm.cpp | 3 + .../maintenance/mtcNodeHdlrs.cpp | 69 +++----- .../cgts-mtce-common-1.0/rmon/rmonHdlr.cpp | 12 +- .../rmon/scripts/cpu_resource.conf | 16 -- .../rmon/scripts/filesystem_resource.conf | 16 -- .../rmon/scripts/memory_resource.conf | 16 -- 14 files changed, 192 insertions(+), 219 deletions(-) delete mode 100644 mtce-common/cgts-mtce-common-1.0/rmon/scripts/cpu_resource.conf delete mode 100644 mtce-common/cgts-mtce-common-1.0/rmon/scripts/filesystem_resource.conf delete mode 100644 mtce-common/cgts-mtce-common-1.0/rmon/scripts/memory_resource.conf diff --git a/mtce-common/centos/cgts-mtce-common.spec b/mtce-common/centos/cgts-mtce-common.spec index 126b677a..429cfa79 100644 --- a/mtce-common/centos/cgts-mtce-common.spec +++ b/mtce-common/centos/cgts-mtce-common.spec @@ -421,11 +421,6 @@ install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-restart %{buildroot}/%{lo install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-start 
%{buildroot}/%{local_sbindir}/pmon-start install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-stop %{buildroot}/%{local_sbindir}/pmon-stop -# test tools -install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp360 %{buildroot}/%{_sbindir}/show_hp360 -install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp380 %{buildroot}/%{_sbindir}/show_hp380 -install -m 755 %{_buildsubdir}/hwmon/scripts/show_quanta %{buildroot}/%{_sbindir}/show_quanta - # init script files install -m 755 -p -D %{_buildsubdir}/scripts/mtcClient %{buildroot}%{_sysconfdir}/init.d/mtcClient install -m 755 -p -D %{_buildsubdir}/scripts/hbsClient %{buildroot}%{_sysconfdir}/init.d/hbsClient @@ -498,9 +493,6 @@ install -m 755 -d %{buildroot}%{_sysconfdir}/rmonapi.d install -m 755 -d %{buildroot}%{_sysconfdir}/rmonfiles.d install -m 755 -d %{buildroot}%{_sysconfdir}/rmon_interfaces.d install -m 644 -p -D %{_buildsubdir}/rmon/scripts/remotelogging_resource.conf %{buildroot}%{local_etc_rmond}/remotelogging_resource.conf -install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cpu_resource.conf %{buildroot}%{local_etc_rmond}/cpu_resource.conf -install -m 644 -p -D %{_buildsubdir}/rmon/scripts/memory_resource.conf %{buildroot}%{local_etc_rmond}/memory_resource.conf -install -m 644 -p -D %{_buildsubdir}/rmon/scripts/filesystem_resource.conf %{buildroot}%{local_etc_rmond}/filesystem_resource.conf install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cinder_virtual_resource.conf %{buildroot}%{local_etc_rmond}/cinder_virtual_resource.conf install -m 644 -p -D %{_buildsubdir}/rmon/scripts/nova_virtual_resource.conf %{buildroot}%{local_etc_rmond}/nova_virtual_resource.conf install -m 644 -p -D %{_buildsubdir}/rmon/scripts/oam_resource.conf %{buildroot}%{_sysconfdir}/rmon_interfaces.d/oam_resource.conf @@ -676,10 +668,7 @@ install -m 755 -d %{buildroot}/var/run %{local_etc_logrotated}/rmon.logrotate %{_unitdir}/rmon.service -%{local_etc_rmond}/filesystem_resource.conf -%{local_etc_rmond}/cpu_resource.conf 
%{local_etc_rmond}/remotelogging_resource.conf -%{local_etc_rmond}/memory_resource.conf %{local_etc_rmond}/cinder_virtual_resource.conf %{local_etc_rmond}/nova_virtual_resource.conf @@ -713,10 +702,6 @@ install -m 755 -d %{buildroot}/var/run %{local_etc_logrotated}/hwmon.logrotate %{ocf_resourced}/platform/hwmon -%{_sbindir}/show_hp380 -%{_sbindir}/show_hp360 -%{_sbindir}/show_quanta - %{_sysconfdir}/init.d/hwmon %{local_bindir}/hwmond diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.cpp index c3480638..f7410580 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.cpp @@ -255,12 +255,38 @@ const char * get_mtcNodeCommand_str ( int cmd ) } -void print_mtc_message ( string hostname, int direction, mtc_message_type & msg , const char * iface, bool force ) +void print_mtc_message ( string hostname, + int direction, + mtc_message_type & msg, + const char * iface, + bool force ) { + /* Handle raw json string messages differently. + * Those messages just have a json string that starts at the header */ + if ( msg.hdr[0] == '{' ) + { + if ( force ) + { + ilog ("%s %s (%s network) - %s\n", + hostname.c_str(), + direction ? "rx <-" : "tx ->" , + iface, + msg.hdr); + } + else + { + mlog1 ("%s %s (%s network) - %s\n", + hostname.c_str(), + direction ? 
"rx <-" : "tx ->" , + iface, + msg.hdr); + } + return ; + } + string str = "-" ; if ( msg.buf[0] ) str = msg.buf ; - if ( force ) { ilog ("%s %s %s (%s network) %d.%d %x:%x:%x.%x.%x.%x [%s] %s\n", diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h index 390d5ca5..332e4041 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h @@ -92,6 +92,9 @@ void daemon_exit ( void ); #define NODE_HEALTHY (1) #define NODE_UNHEALTHY (2) +#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count") +#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/") + #define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host") /** Configuration Pass/Fail Flag File */ @@ -146,10 +149,6 @@ void daemon_exit ( void ); #define BM_DNSMASQ_FILENAME ((const char *)"dnsmasq.bmc_hosts") -/* Added for Centos */ -#define CENTOS_RELEASE_FILE ((const char *)"/etc/centos-release") -#define SYSTEMD_SERVICE_FILE_DIR ((const char *)"/usr/lib/systemd/system") - #define THREAD_NAME__IPMITOOL ((const char *)("ipmitool")) #define IPMITOOL_PATH_AND_FILENAME ((const char *)("/usr/bin/ipmitool")) @@ -970,7 +969,7 @@ string get_configStages_str ( mtc_configStages_enum stage ); #define DEGRADE_MASK_SUBF 0x00000100 #define DEGRADE_MASK_SM 0x00000200 #define DEGRADE_MASK_CONFIG 0x00000400 -#define DEGRADE_MASK_RES2 0x00000800 +#define DEGRADE_MASK_COLLECTD 0x00000800 #define DEGRADE_MASK_ENABLE 0x00001000 #define DEGRADE_MASK_RES4 0x00002000 #define DEGRADE_MASK_RES5 0x00004000 diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp index 1dfc57fc..e10f2528 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp @@ -662,8 +662,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->log_throttle = 0 ; ptr->no_work_log_throttle = 0 ; - /* 
Clear the degrade control structs */ - ptr->degrade_mask = DEGRADE_MASK_NONE ; + ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ; + ptr->degraded_resources_list.clear () ; ptr->pmond_ready = false ; ptr->rmond_ready = false ; @@ -4561,16 +4561,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface } hbs_minor_clear ( node_ptr, iface ); - - /* Set the host available if the degrade mask is now - * cleared and we are degraded */ - if ( node_ptr->degrade_mask == 0 ) - { - if ( get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ) - { - set_availStatus ( hostname, MTC_AVAIL_STATUS__AVAILABLE ); - } - } } else if ( this->mtcTimer_dor.tid ) { @@ -4602,12 +4592,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface node_ptr->degrade_mask |= DEGRADE_MASK_HEARTBEAT_INFRA ; } } - - /* No point in changing if we are already degraded */ - if ( nodeLinkClass::get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) - { - set_availStatus ( hostname, MTC_AVAIL_STATUS__DEGRADED ); - } } } } @@ -4621,7 +4605,7 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, wlog ("%s Unknown host\n", hostname.c_str()); return ; } - + /* is this a clear event ? 
*/ if ( clear_event == true ) { @@ -4639,15 +4623,15 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, else if ( node_ptr->hbs_minor[iface] != true ) { - mnfa_add_host ( node_ptr, iface ); + mnfa_add_host ( node_ptr, iface ); } } } -/** Interface to declare that a key service on the +/** Interface to declare that a key service on the * specified host is up, running and ready */ -int nodeLinkClass::declare_service_ready ( string & hostname, +int nodeLinkClass::declare_service_ready ( string & hostname, unsigned int service ) { nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); @@ -4661,18 +4645,11 @@ int nodeLinkClass::declare_service_ready ( string & hostname, node_ptr->pmond_ready = true ; plog ("%s got pmond ready event\n", hostname.c_str()); - /* A ready event means that pmond pocess has started. - * Any previous history is gone. Cleanup mtce. + /* A ready event means that pmond process has started. + * Any previous history is gone. Cleanup mtce. 
* If there are still process issues on this host then * they will be reported again.*/ node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ; - if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) - { - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } return (PASS); } else if ( service == MTC_SERVICE_HWMOND ) @@ -4719,14 +4696,6 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname ) if ( node_ptr->degrade_mask ) { node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ; - - if ( !node_ptr->degrade_mask ) - { - if ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } } /* The only detectable inservice failures are process failures */ @@ -4735,15 +4704,65 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname ) return (PASS); } +/* This public API handles degrade 'assert' and 'clear' event messages from collectd */ +int nodeLinkClass::collectd_notify_handler ( string & hostname, + string & resource, + string & state ) +{ + int rc = PASS ; + nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr == NULL ) + { + wlog ("%s Unknown Host\n", hostname.c_str()); + return (FAIL_UNKNOWN_HOSTNAME) ; + } + if ( state == "clear" ) + { + if ( node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD ) + { + ilog("%s collectd degrade state change ; assert -> clear (%s)", + hostname.c_str(), resource.c_str()); + node_ptr->degrade_mask &= ~DEGRADE_MASK_COLLECTD ; + } + else + { + mlog3("%s collectd degrade 'clear' request (%s)", + hostname.c_str(), resource.c_str()); + } + } + else if ( state == "assert" ) + { + if ( (node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD) == 0 ) + { + ilog("%s collectd degrade state change ; clear -> assert (due to %s)", + hostname.c_str(), resource.c_str()); + node_ptr->degrade_mask |= DEGRADE_MASK_COLLECTD ; + } + else + { + mlog3("%s collectd degrade 'assert' request (%s)", + hostname.c_str(), 
resource.c_str()); + } + } + else + { + wlog ("%s collectd degrade state unknown (%s)\n", + hostname.c_str(), + state.c_str()); + rc = FAIL_OPERATION ; + } + return (rc); +} + /** Resource Monitor 'Clear' Event handler. - * + * * The resource specified will be removed from the * 'degraded_resources_list' for specified host. * if there are no other degraded resources or other * degraded services/reasons against that host then * this handler will clear the degrade state for the * specified host all together. */ -int nodeLinkClass::degrade_resource_clear ( string & hostname, +int nodeLinkClass::degrade_resource_clear ( string & hostname, string & resource ) { /* lr - Log Prefix Rmon */ @@ -4788,18 +4807,6 @@ int nodeLinkClass::degrade_resource_clear ( string & hostname, if ( node_ptr->degraded_resources_list.empty() ) { node_ptr->degrade_mask &= ~DEGRADE_MASK_RESMON ; ; - if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) - { - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } - else - { - wlog ("%s Remains Degraded - Reason Mask:0x%08x\n", - hostname.c_str(), node_ptr->degrade_mask ); - } } else { @@ -4874,30 +4881,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s /* clear the mask regardless of host state */ node_ptr->degrade_mask &= ~service_flag ; - - /* only applies if host is unlocked-enabled-degraded and - * there are no other degrade flags in the degrade mask */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) - { - if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - else - { - /* TODO: convert lask to a sring or services and print that string */ - wlog ("%s remains degraded - degrade mask:0x%08x\n", - hostname.c_str(), - node_ptr->degrade_mask 
); - } - } - else - { - dlog ("%s unexpected degrade clear for '%s' service\n", - hostname.c_str(), service.c_str() ); - } rc = PASS ; break ; } @@ -4910,13 +4893,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s wlog ("%s degrade 'assert' from '%s'\n", hostname.c_str(), service.c_str() ); node_ptr->degrade_mask |= service_flag ; } - - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } rc = PASS ; break ; } @@ -5232,10 +5208,6 @@ int nodeLinkClass::degrade_process_raise ( string & hostname, { node_ptr->degrade_mask |= DEGRADE_MASK_PMON ; wlog ("%s is degraded due to '%s' process failure\n", hostname.c_str(), process.c_str()); - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } } } return (PASS); @@ -5412,11 +5384,6 @@ int nodeLinkClass::degrade_resource_raise ( string & hostname, { dlog ("%s '%s' Degraded (again)\n", lr.c_str(), resource.c_str()); } - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } - } return (PASS); } @@ -7039,9 +7006,6 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid ) * *****************************************************************************/ -#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/") -#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count") - void autorecovery_clear ( string hostname ) { string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ; diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h index 2b9b72ad..444e618b 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h +++ 
b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h @@ -585,6 +585,7 @@ private: /* Bit mask of degrade reasons */ unsigned int degrade_mask ; + unsigned int degrade_mask_save ; /** Process Monitor Daemon Flag Missing count */ int pmon_missing_count ; @@ -785,6 +786,7 @@ private: int insv_test_handler ( struct nodeLinkClass::node * node_ptr ); int stress_handler ( struct nodeLinkClass::node * node_ptr ); int bm_handler ( struct nodeLinkClass::node * node_ptr ); + int degrade_handler ( struct nodeLinkClass::node * node_ptr ); int uptime_handler ( void ); int host_services_handler ( struct nodeLinkClass::node * node_ptr ); @@ -1731,6 +1733,11 @@ public: /** Calculates and returns the mnfa threshold based on enabled hosts */ int mnfa_calculate_threshold ( string hostname ); + /* collectd event handler */ + int collectd_notify_handler ( string & hostname, + string & resource, + string & state ); + /***************************************** ** Process Monitor Event Utilities API ** *****************************************/ diff --git a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h index f804b849..e7e6c85d 100755 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h @@ -68,9 +68,9 @@ string daemon_read_file ( const char * filename ); void daemon_logfile_close ( void ); void daemon_logfile_open ( void ); -void daemon_log ( const char * filename , const char * str ); -void daemon_log_value ( const char * filename , int val ); -void daemon_log_value ( const char * filename , const char * str, int val ); +int daemon_log ( const char * filename , const char * str ); +int daemon_log_value ( const char * filename , int val ); +int daemon_log_value ( const char * filename , const char * str, int val ); /* reads the first line of a file and if it contains a string * that represents an integer value then return it */ diff --git 
a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp index 89408f46..002c4db4 100755 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp @@ -103,7 +103,7 @@ void daemon_healthcheck ( const char * sig ) #define BUFFER 1024 -void daemon_log_value ( const char * filename , const char * str, int val ) +int daemon_log_value ( const char * filename , const char * str, int val ) { FILE * file_stream = fopen (filename, "a" ) ; if ( file_stream != NULL ) @@ -111,10 +111,12 @@ void daemon_log_value ( const char * filename , const char * str, int val ) fprintf ( file_stream,"%s %d\n", str, val ); fflush (file_stream); fclose (file_stream); + return (PASS); } + return (FAIL_FILE_OPEN); } -void daemon_log_value ( const char * filename , int val ) +int daemon_log_value ( const char * filename , int val ) { FILE * file_stream = fopen (filename, "w" ) ; if ( file_stream != NULL ) @@ -122,10 +124,12 @@ void daemon_log_value ( const char * filename , int val ) fprintf ( file_stream,"%d\n", val ); fflush (file_stream); fclose (file_stream); + return (PASS); } + return (FAIL_FILE_OPEN); } -void daemon_log ( const char * filename , const char * str ) +int daemon_log ( const char * filename , const char * str ) { FILE * file_stream = fopen (filename, "a" ) ; if ( file_stream != NULL ) @@ -133,7 +137,9 @@ void daemon_log ( const char * filename , const char * str ) fprintf ( file_stream,"%s\n", str ); fflush (file_stream); fclose (file_stream); + return (PASS); } + return (FAIL_FILE_OPEN); } /* reads the first line of a file and if it contains a string diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp index 8a1f5291..bcc9b5f5 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp @@ 
-191,8 +191,50 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), false ); + if ( msg.hdr[0] == '{' ) + { + int rc1 ; + string service ; + + mlog1 ("%s\n", &msg.hdr[0] ); + + rc1 = jsonUtil_get_key_val(&msg.hdr[0],"service", service ); + if ( rc1 == PASS ) + { + if ( service == "collectd_notifier" ) + { + int rc1,rc2,rc3 ; + string hostname,resource,state ; + + rc1 = jsonUtil_get_key_val(&msg.hdr[0],"hostname", hostname ); + rc2 = jsonUtil_get_key_val(&msg.hdr[0],"resource", resource ); + rc3 = jsonUtil_get_key_val(&msg.hdr[0],"degrade", state ); + if ( rc1|rc2|rc3 ) + { + elog ("failed to parse '%s' message\n", service.c_str()); + wlog ("... %s\n", &msg.hdr[0] ); + } + else + { + obj_ptr->collectd_notify_handler ( hostname, + resource, + state ); + } + } + /* future service requests */ + else + { + wlog ("Unexpected service request: '%s'\n", service.c_str()); + } + } + else + { + wlog("Unexpected json message: %s\n", &msg.hdr[0] ); + } + } + /* Check for response messages */ - if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) ) + else if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) ) { obj_ptr->set_cmd_resp ( hostname , msg ) ; } diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeFsm.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeFsm.cpp index 8ee70593..20fad599 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeFsm.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeFsm.cpp @@ -74,6 +74,9 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) /* manage the host connected state and board management alarms */ nodeLinkClass::bm_handler ( node_ptr ); + /* manage host's degrade state */ + nodeLinkClass::degrade_handler ( node_ptr ); + /* * Always run the offline handler * diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp index 
fc13c5c1..6a38d5e7 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp @@ -5599,15 +5599,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } - /* handle coming out of the ADD in a degraded state */ - if (( node_ptr->degrade_mask != 0 ) && - (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } - node_ptr->mtcAlive_gate = false ; node_ptr->addStage = MTC_ADD__DONE ; break; @@ -6111,22 +6102,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) { alarm_compute_clear ( node_ptr, false ); } - - /************************************************************ - * Manage host degrade based on degrade mask * - ***********************************************************/ - if (( node_ptr->degrade_mask == 0 ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - - /* expected degrade audit */ - else if (( node_ptr->degrade_mask ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } } break ; } @@ -6461,12 +6436,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->degrade_mask |= DEGRADE_MASK_SM ; ilog ("%s sm degrade\n", node_ptr->hostname.c_str()); - - /* degrade the host if not already degraded */ - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } } /* Manage de-asserting degrade due to Software Management */ @@ -6477,16 +6446,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * 
node_ptr ) node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ; ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str()); - - /* if the degrade mask is now clear then consider clearing the degrade state */ - if ( node_ptr->degrade_mask == 0 ) - { - /* ... but only if we are degraded */ - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } } if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) @@ -6502,10 +6461,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD ) { node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) - { - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); - } /* threshold is reached so raise the config alarm if it is not already raised */ if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL ) @@ -6554,6 +6509,30 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) return (PASS); } +/************************************************************ + * Manage host degrade state based on degrade mask * + * The availability state of degrade only applies when the * + * host is unlocked-enabled. 
* + ***********************************************************/ +int nodeLinkClass::degrade_handler ( struct nodeLinkClass::node * node_ptr ) +{ + if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) + { + if (( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) && + ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) + { + availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE ); + } + + else if (( node_ptr->degrade_mask ) && + ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )) + { + availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED ); + } + } + return (PASS); +} int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr ) { diff --git a/mtce-common/cgts-mtce-common-1.0/rmon/rmonHdlr.cpp b/mtce-common/cgts-mtce-common-1.0/rmon/rmonHdlr.cpp index 71f1bf16..c1357a99 100644 --- a/mtce-common/cgts-mtce-common-1.0/rmon/rmonHdlr.cpp +++ b/mtce-common/cgts-mtce-common-1.0/rmon/rmonHdlr.cpp @@ -1127,6 +1127,7 @@ void read_fs_file ( vector & dynamic_resources ) *****************************************************************************/ void add_dynamic_fs_resource ( bool send_response ) { +#ifdef WANT_FS_MONITORING char resource[50]; char temp_resource[50]; char device [50]; @@ -1206,10 +1207,14 @@ void add_dynamic_fs_resource ( bool send_response ) } } } - +#endif if (send_response) { +#ifdef WANT_FS_MONITORING ilog ("sending response to dynamic FS add, to the rmon client\n"); +#else + ilog("dynamic filesystem monitoring moved to collectd\n"); +#endif /* let the rmon client know that we are done with the file */ rmon_resource_response(_rmon_ctrl_ptr->clients); } @@ -4650,6 +4655,8 @@ void rmon_service (rmon_ctrl_type * ctrl_ptr) ilog ("registered clients: %d\n", _rmon_ctrl_ptr->clients); +#ifdef WANT_FS_MONITORING + /* Initialize the resource specific configuration */ for (int j=0; j<_rmon_ctrl_ptr->resources; j++) { @@ -4669,6 +4676,9 @@ void rmon_service (rmon_ctrl_type * 
ctrl_ptr) /* add any dynamic resources from before */ add_dynamic_fs_resource(false); +#else + ilog("static filesystem monitoring moved to collectd\n"); +#endif /* Clear any stale dynamic alarms that can be caused by dynamic resources. */ /* An alarm become stale for example if it was raised against a local volumn group (lvg) and */ diff --git a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/cpu_resource.conf b/mtce-common/cgts-mtce-common-1.0/rmon/scripts/cpu_resource.conf deleted file mode 100644 index dc0ab8dd..00000000 --- a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/cpu_resource.conf +++ /dev/null @@ -1,16 +0,0 @@ -[resource] -resource = Platform CPU Usage -debounce = 20 ; number of seconds to wait before degrade clear -severity = critical ; minor, major, critical -minor_threshold = 80 ; minor cpu utilization threshold percentage -major_threshold = 90 ; major cpu utilization threshold percentage -critical_threshold = 95 ; critical cpu utilization threshold percentage (use 101 if unused) -minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0 -major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0 -critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 -minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1 -major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1 -critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1 -num_tries = 2 ; number of tries before the alarm is raised -alarm_on = 1 ; 1 for alarm on, 0 for alarm off -percent = 1 ; Always use 1 for this resource (thresholds by percentage) diff --git a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/filesystem_resource.conf b/mtce-common/cgts-mtce-common-1.0/rmon/scripts/filesystem_resource.conf deleted file mode 100644 index e8496b54..00000000 --- 
a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/filesystem_resource.conf +++ /dev/null @@ -1,16 +0,0 @@ -[resource] -resource = Platform Filesystem Usage -debounce = 20 ; number of seconds to wait before degrade clear -severity = critical ; minor, major, critical -minor_threshold = 70 ; minor filesystem utilization threshold percentage -major_threshold = 80 ; major filesystem utilization threshold percentage -critical_threshold = 90 ; critical filesystem utilization threshold percentage (use 101 if unused) -minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0 -major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0 -critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused) -minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1 -major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1 -critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1 -num_tries = 2 ; number of tries before the alarm is raised -alarm_on = 1 ; 1 for alarm on, 0 for alarm off -percent = 1 ; 1 for percentage used, 0 for absolute value (file system available in MiB) (default is 1) diff --git a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/memory_resource.conf b/mtce-common/cgts-mtce-common-1.0/rmon/scripts/memory_resource.conf deleted file mode 100644 index 926e28cd..00000000 --- a/mtce-common/cgts-mtce-common-1.0/rmon/scripts/memory_resource.conf +++ /dev/null @@ -1,16 +0,0 @@ -[resource] -resource = Platform Memory Usage -debounce = 20 ; number of seconds to wait before degrade clear -severity = critical ; minor, major, critical -minor_threshold = 70 ; minor memory utilization threshold percentage -major_threshold = 80 ; major memory utilization threshold percentage -critical_threshold = 90 ; critical memory utilization threshold percentage (use 101 if unsed) -minor_threshold_abs_node0 = 512 ; 
absolute minor threshold value MiB processor node 0 -major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0 -critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused) -minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1 -major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1 -critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1 -num_tries = 2 ; number of tries before the alarm is raised -alarm_on = 1 ; 1 for alarm on, 0 for alarm off -percent = 1 ; 1 for percentage used, 0 for absolute value (memory available in MiB) (default is 1)