diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data
index 52d21566f..e5b3c5046 100644
--- a/monitoring/collectd-extensions/centos/build_srpm.data
+++ b/monitoring/collectd-extensions/centos/build_srpm.data
@@ -16,4 +16,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
-TIS_PATCH_VER=4
+TIS_PATCH_VER=5
diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py
index 058c9aeb7..73b7916bc 100755
--- a/monitoring/collectd-extensions/src/fm_notifier.py
+++ b/monitoring/collectd-extensions/src/fm_notifier.py
@@ -870,6 +870,9 @@ def _database_setup(database):
(PLUGIN, database, retention))
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
+ else:
+ collectd.error("%s influxdb:%s setup %s" %
+ (PLUGIN, database, error_str))
def _clear_alarm_for_missing_filesystems():
diff --git a/monitoring/collectd-extensions/src/ntpq.conf b/monitoring/collectd-extensions/src/ntpq.conf
index f7e3c26ce..02aebc127 100644
--- a/monitoring/collectd-extensions/src/ntpq.conf
+++ b/monitoring/collectd-extensions/src/ntpq.conf
@@ -1,16 +1,12 @@
-#
-# Interval 60
-#
-
- Instance "state"
+ Instance "reachable"
Persist true
PersistOK true
WarningMin 1
FailureMin 0
-# Hits 2
+ Hits 2
Invert false
diff --git a/monitoring/collectd-extensions/src/ntpq.py b/monitoring/collectd-extensions/src/ntpq.py
index 7a984304e..7b6f343db 100755
--- a/monitoring/collectd-extensions/src/ntpq.py
+++ b/monitoring/collectd-extensions/src/ntpq.py
@@ -1,9 +1,58 @@
-
-# Copyright (c) 2018 Wind River Systems, Inc.
+############################################################################
+# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
+#############################################################################
#
+# This is the NTP connectivity monitor plugin for collectd.
+#
+# This plugin uses the industry standard ntpq exec to query NTP attributes.
+#
+# This plugin executes 'ntpq -np' to determine which provisioned servers
+# are reachable. The ntpq output includes a Tally Code, which is
+# represented by the first character in each server's line item.
+#
+# The only ntpq output looked at by this plugin are the Tally Codes and
+# associated IPs.
+#
+# Tally Code Summary:
+#
+# A server is considered reachable only when the Tally Code is a * or a +.
+# A server is considered unreachable if the Tally Code is a ' ' (space)
+# A server with a '*' Tally Code is the 'selected' server.
+#
+# Here is an example of the ntpq command output
+#
+# remote refid st t when poll reach delay offset jitter
+# =============================================================================
+# +192.168.204.104 206.108.0.133 2 u 203 1024 377 0.226 -3.443 1.137
+# +97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624
+# 192.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124
+# -97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624
+# *182.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124
+#
+# The local controller node is not to be considered a reachable server and is
+# never alarmed if it is not reachable.
+#
+# Normal running modes with no alarms include
+#
+# 0 - All NTP servers are reachable and one is selected
+# 1 - No NTP servers are provisioned
+#
+# Failure modes that warrant alarms include
+#
+# 2 - None of the NTP servers are reachable - major alarm
+# 3 - Some NTP servers reachable and one is selected - server IP minor alarm
+# 4 - Some NTP servers reachable but none is selected - major alarm
+#
+# None of these failures result in a host being degraded.
+#
+# This script will only be run on the controller nodes.
+#
+# This script logs to daemon.log with the 'collectd' process label
+#
+###############################################################################
import os
import subprocess
@@ -16,149 +65,106 @@ import tsconfig.tsconfig as tsc
api = fm_api.FaultAPIs()
PLUGIN = 'NTP query plugin'
-
-PLUGIN_SCRIPT = '/etc/rmonfiles.d/query_ntp_servers.sh'
-PLUGIN_RESULT = '/tmp/ntpq_server_info'
-
-# static variables
-ALARM_ID__NTPQ = "100.114"
+PLUGIN_INTERVAL = 600 # audit interval in secs
+PLUGIN_CONF = '/etc/ntp.conf'
+PLUGIN_EXEC = '/usr/sbin/ntpq'
+PLUGIN_EXEC_OPTIONS = '-pn'
+PLUGIN_ALARMID = "100.114"
# define a class here that will persist over read calls
class NtpqObject:
- hostname = ''
- base_eid = ''
- severity = 'clear'
+
+ # Container for this plugin's state ; every field is a class-level
+ # attribute. NOTE: the list fields below are class-level objects that are
+ # modified in place (append / remove), so they are shared by all instances.
+
+ # static variables set in init
+ hostname = '' # the name of this host
+ base_eid = '' # the eid for the major alarm
+ config_complete = False # set to true once config is complete
+ alarm_raised = False # True when the major alarm is asserted
+
+ server_list_conf = [] # list of servers in the /etc/ntp.conf file
+ server_list_ntpq = [] # list of servers in the ntpq -np output
+ unreachable_servers = [] # list of unreachable servers
+ reachable_servers = [] # list of reachable servers
+ # NOTE: the initial value of the next two fields is the string 'None',
+ # not the None object.
+ selected_server = 'None' # the ip address of the selected server
+ selected_server_save = 'None' # the last selected server ; note change
+
+ # variables used to raise alarms to FM
suppression = True
service_affecting = False
- status = 0
- last_result = ''
- this_result = ''
- id = ALARM_ID__NTPQ
name = "NTP"
alarm_type = fm_constants.FM_ALARM_TYPE_1
cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN
repair = "Monitor and if condition persists, "
repair += "contact next level of support."
-
+# This plugin's class object - persists over read calls
obj = NtpqObject()
-def is_uuid_like(val):
- """Returns validation of a value as a UUID."""
- try:
- return str(uuid.UUID(val)) == val
- except (TypeError, ValueError, AttributeError):
- return False
+###############################################################################
+#
+# Name : _add_unreachable_server
+#
+# Description: This private interface is used to add an ip to the
+# unreachable servers list.
+#
+# Parameters : IP address
+#
+###############################################################################
+def _add_unreachable_server(ip=None):
+ """ Add ip to unreachable_servers list """
-# The config function - called once on collectd process startup
-def config_func(config):
- """
- Configure the plugin
- """
+ if ip:
+ if ip not in obj.unreachable_servers:
+ collectd.debug("%s adding '%s' to unreachable servers list: %s" %
+ (PLUGIN, ip, obj.unreachable_servers))
- collectd.debug('%s config function' % PLUGIN)
- return 0
+ obj.unreachable_servers.append(ip)
-
-# The init function - called once on collectd process startup
-def init_func():
-
- # ntp query is for controllers only
- if tsc.nodetype != 'controller':
- return 0
-
- # get current hostname
- obj.hostname = os.uname()[1]
- obj.base_eid = 'host=' + obj.hostname + '.ntp'
- collectd.info("%s on %s with entity id '%s'" % PLUGIN, obj.hostname, obj.base_eid)
- return 0
-
-
-# The sample read function - called on every audit interval
-def read_func():
-
- # ntp query is for controllers only
- if tsc.nodetype != 'controller':
- return 0
-
- result = int(0)
- # Query ntp
- try:
- result = os.system(PLUGIN_SCRIPT)
- except Exception as e:
- collectd.error("%s Could not run '%s' (%s)" %
- (PLUGIN, e))
- return 0
-
- obj.status = int(result)/0x100
-
- collectd.info("%s Query Result: %s" % (PLUGIN, obj.status))
-
- if os.path.exists(PLUGIN_RESULT) is False:
- collectd.error("%s produced no result file '%s'" %
- (PLUGIN, PLUGIN_RESULT))
- return 0
-
- # read the query result file.
- # format is in the PLUGIN_SCRIPT file.
- # This code only wants the second line.
- # It contains list of unreachable ntp servers that need alarm management.
- count = 0
- with open(PLUGIN_RESULT, 'r') as infile:
- for line in infile:
- count += 1
- collectd.info("%s Query Result: %s" % (PLUGIN, line))
- if count == 0:
- collectd.error("%s produced empty result file '%s'" %
- (PLUGIN, PLUGIN_RESULT))
- return 0
-
- sample = 1
-
- # Dispatch usage value to collectd
- val = collectd.Values(host=obj.hostname)
- val.plugin = 'ntpq'
- val.plugin_instance = 'some.ntp.server.ip'
- val.type = 'absolute'
- val.type_instance = 'state'
- val.dispatch(values=[sample])
-
- severity = 'clear'
- obj.severity = 'clear'
-
- # if there is no severity change then consider exiting
- if obj.severity == severity:
-
- # unless the current severity is 'minor'
- if severity == 'minor':
- # TODO: check to see if the failing IP address is changed
- collectd.info("%s NEED TO CHECK IP ADDRESSES" % (PLUGIN))
+ collectd.info("%s added '%s' to unreachable servers list: %s" %
+ (PLUGIN, ip, obj.unreachable_servers))
else:
- return 0
+ collectd.debug("%s ip '%s' already in unreachable_servers list" %
+ (PLUGIN, ip))
+ else:
+ collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN)
- # if current severity is clear but previous severity is not then
- # prepare to clear the alarms
- if severity == 'clear':
- _alarm_state = fm_constants.FM_ALARM_STATE_CLEAR
- # TODO: loop over all raises alarms and clear them
- collectd.info("%s NEED CLEAR ALL ALARMS" % (PLUGIN))
- if api.clear_fault(obj.id, obj.base_eid) is False:
- collectd.error("%s %s:%s clear_fault failed" %
- (PLUGIN, obj.id, obj.base_eid))
- return 0
+###############################################################################
+#
+# Name : _raise_alarm
+#
+# Description: This private interface is used to raise NTP alarms.
+#
+# Parameters : Optional IP address
+#
+# If called with no or empty IP then a generic major alarm is raised.
+# If called with an IP then an IP specific minor alarm is raised.
+#
+# Returns : Error indication.
+#
+# True : is error. FM call failed to set the
+# alarm and needs to be retried.
+#
+# False: no error. FM call succeeds
+#
+###############################################################################
+
+def _raise_alarm(ip=None):
+ """ Assert an NTP alarm """
+
+ if not ip:
+ # Don't re-raise the alarm if its already raised
+ if obj.alarm_raised is True:
+ return False
- elif severity == 'major':
reason = "NTP configuration does not contain any valid "
reason += "or reachable NTP servers."
eid = obj.base_eid
fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
+
else:
- # TODO: There can be up to 3 inacessable servers
- ip = 'some.server.ip.addr'
reason = "NTP address "
reason += ip
reason += " is not a valid or a reachable NTP server."
@@ -166,7 +172,7 @@ def read_func():
fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR
fault = fm_api.Fault(
- alarm_id=obj.id,
+ alarm_id=PLUGIN_ALARMID,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=eid,
@@ -179,12 +185,593 @@ def read_func():
suppression=obj.suppression)
alarm_uuid = api.set_fault(fault)
- if is_uuid_like(alarm_uuid) is False:
+ if _is_uuid_like(alarm_uuid) is False:
+
+ # Don't _add_unreachable_server list if the fm call failed.
+ # That way it will be retried at a later time.
collectd.error("%s %s:%s set_fault failed:%s" %
- (PLUGIN, obj.id, eid, alarm_uuid))
+ (PLUGIN, PLUGIN_ALARMID, eid, alarm_uuid))
+ return True
+ else:
+ collectd.info("%s raised alarm %s:%s" % (PLUGIN, PLUGIN_ALARMID, eid))
+ if ip:
+ _add_unreachable_server(ip)
+ else:
+ obj.alarm_raised = True
+
+ return False
+
+
+###############################################################################
+#
+# Name : _clear_base_alarm
+#
+# Description: This private interface is used to clear the NTP base alarm.
+#
+# Parameters : None
+#
+# Returns : Error indication.
+#
+# True : is error. FM call failed to clear the
+# alarm and needs to be retried.
+#
+# False: no error. FM call succeeds
+#
+###############################################################################
+
+def _clear_base_alarm():
+ """
+ Clear the NTP base alarm.
+
+ Returns True when the FM clear call fails, otherwise False.
+ """
+
+ # only ask FM to clear the alarm when FM reports it as present
+ if api.get_fault(PLUGIN_ALARMID, obj.base_eid) is not None:
+ if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False:
+ collectd.error("%s failed to clear alarm %s:%s" %
+ (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
+ # error ; alarm_raised is left untouched on this path
+ return True
+ else:
+ collectd.info("%s cleared alarm %s:%s" %
+ (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
+ # NOTE(review): confirm whether this reset is meant to run only after a
+ # successful clear or also when FM reports no alarm at all.
+ obj.alarm_raised = False
+
+ return False
+
+
+###############################################################################
+#
+# Name : _remove_ip_from_unreachable_list
+#
+# Description: This private interface is used to remove the specified IP
+# from the unreachable servers list and clear its alarm if raised.
+#
+# Parameters : IP address
+#
+# Returns : Error indication.
+#
+# True : is error. FM call failed to clear the
+# alarm and needs to be retried.
+#
+# False: no error. FM call succeeds
+#
+###############################################################################
+
+def _remove_ip_from_unreachable_list(ip):
+    """
+    Stop tracking 'ip' as unreachable, clearing its NTP alarm if FM has one.
+
+    Returns True only when FM fails to clear an existing alarm ; in that
+    case the ip stays in the unreachable list. Returns False otherwise,
+    including when the ip is empty or is not being tracked.
+    """
+
+    # nothing to do unless this ip is currently tracked as unreachable
+    if not ip or ip not in obj.unreachable_servers:
+        return False
+
+    eid = obj.base_eid + '=' + ip
+    collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid))
+
+    # no alarm in FM for this ip ; just drop it from the list
+    if api.get_fault(PLUGIN_ALARMID, eid) is None:
+        obj.unreachable_servers.remove(ip)
+        collectd.info("%s alarm %s not raised" % (PLUGIN, eid))
+        return False
+
+    if api.clear_fault(PLUGIN_ALARMID, eid) is not True:
+        # Handle clear failure by not removing the IP from the list.
+        # It will retry on next audit.
+        # Error should only occur if FM is not running at the time
+        # this get or clear is called
+        collectd.error("%s failed alarm clear %s:%s" %
+                       (PLUGIN, PLUGIN_ALARMID, eid))
+        return True
+
+    collectd.info("%s cleared %s:%s alarm" %
+                  (PLUGIN, PLUGIN_ALARMID, eid))
+    obj.unreachable_servers.remove(ip)
+
+    return False
+
+
+###############################################################################
+#
+# Name : _add_ip_to_ntpq_server_list
+#
+# Description: This private interface is used to create a list of servers
+# found in the ntpq output.
+#
+# This list is used to detect and handle servers that might come
+# and go between readings that might otherwise result in stuck
+# alarms.
+#
+# Parameters : IP address
+#
+# Returns : nothing
+#
+###############################################################################
+
+def _add_ip_to_ntpq_server_list(ip):
+    """ Record 'ip' as a server seen in the ntpq output ; no duplicates. """
+
+    # already recorded by an earlier line of this ntpq reading
+    if ip in obj.server_list_ntpq:
+        return
+
+    obj.server_list_ntpq.append(ip)
+
+
+##############################################################################
+#
+# Name : _cleanup_stale_servers
+#
+# Description: This private interface walks through each server tracking list
+# removing any that it finds that are not in the ntpq server list.
+#
+# Alarms are cleared as needed to avoid stale alarms
+#
+# Parameters : None
+#
+# Returns : nothing
+#
+###############################################################################
+
+def _cleanup_stale_servers():
+    """
+    Cleanup the server IP tracking lists.
+
+    Any ip in the reachable or unreachable list that is not in the current
+    ntpq server list is dropped. Unreachable entries are dropped through
+    _remove_ip_from_unreachable_list so that their alarm is cleared too.
+    """
+
+    collectd.debug("%s CLEANUP REACHABLE: %s %s" %
+                   (PLUGIN, obj.server_list_ntpq, obj.reachable_servers))
+    # Walk a snapshot of the list. Removing from a list while iterating it
+    # makes the loop skip the entry that follows each removal, which would
+    # leave stale servers behind.
+    for ip in list(obj.reachable_servers):
+        if ip not in obj.server_list_ntpq:
+            collectd.info("%s removing missing '%s' server from reachable "
+                          "server list" % (PLUGIN, ip))
+            obj.reachable_servers.remove(ip)
+
+    collectd.debug("%s CLEANUP UNREACHABLE: %s %s" %
+                   (PLUGIN, obj.server_list_ntpq, obj.unreachable_servers))
+    # Snapshot again ; the helper removes the ip from the live list.
+    for ip in list(obj.unreachable_servers):
+        if ip not in obj.server_list_ntpq:
+            collectd.info("%s removing missing '%s' server from unreachable "
+                          "server list" % (PLUGIN, ip))
+            _remove_ip_from_unreachable_list(ip)
+
+
+###############################################################################
+#
+# Name : _get_ntp_servers
+#
+# Description: This private interface reads the list of ntp servers from the
+# ntp.conf file
+#
+# Parameters : None
+#
+# Returns : nothing
+#
+# Updates : server_list_conf
+#
+###############################################################################
+
+def _get_ntp_servers():
+    """
+    Read the provisioned servers from the ntp conf file.
+
+    Rebuilds obj.server_list_conf from the 'server' lines of PLUGIN_CONF.
+    When no servers are provisioned there is nothing to alarm on, so the
+    base alarm (if raised) and the alarms of all tracked unreachable
+    servers are cleared.
+    """
+
+    servers = []
+    with open(PLUGIN_CONF, 'r') as infile:
+        for line in infile:
+            # Expected form is 'server <address> [options]'. Split on any
+            # whitespace so tabs or repeated spaces cannot yield an empty
+            # ip, and skip a 'server' keyword that has no address.
+            fields = line.split()
+            if len(fields) >= 2 and fields[0] == 'server':
+                if fields[1] not in servers:
+                    servers.append(fields[1])
+
+    # Replace rather than append to the previous list ; otherwise a server
+    # removed from the conf file would stay listed as provisioned and the
+    # 'not provisioned' handling below could never run after the first read.
+    obj.server_list_conf = servers
+
+    if obj.server_list_conf:
+        collectd.info("%s server list: %s" %
+                      (PLUGIN, obj.server_list_conf))
+    else:
+        ##################################################################
+        #
+        # Handle NTP_NOT_PROVISIONED (1) case
+        #
+        # There is no alarming for this case.
+        # Clear any that may have been raised.
+        #
+        ##################################################################
+        collectd.info("%s No NTP servers are provisioned" % PLUGIN)
+
+        # clear all alarms
+        if obj.alarm_raised:
+            _clear_base_alarm()
+
+        # Walk a snapshot ; the helper removes entries from the live list
+        # and removing while iterating would skip every other entry.
+        for ip in list(obj.unreachable_servers):
+            _remove_ip_from_unreachable_list(ip)
+
+
+###############################################################################
+#
+# Name : _is_controller
+#
+# Description: This private interface returns a True if the specified ip is
+# associated with a local controller.
+#
+# Parameters : IP address
+#
+# Returns : True or False
+#
+###############################################################################
+
+def _is_controller(ip):
+    """
+    Return True when /etc/hosts has an entry whose first field is exactly
+    'ip' and whose line contains the text 'controller' ; otherwise False.
+    """
+
+    collectd.debug("%s check if '%s' is a controller ip" % (PLUGIN, ip))
+    with open('/etc/hosts', 'r') as hosts_file:
+        for entry in hosts_file:
+            # comment lines (prefixed with '#') carry no host mapping
+            if entry.startswith('#'):
+                continue
+
+            # an entry is 'ip' 'name' .... ; anything shorter is ignored
+            fields = entry.split()
+            if len(fields) < 2:
+                continue
+
+            if fields[0] == ip and 'controller' in entry:
+                collectd.debug("%s %s is a controller" % (PLUGIN, ip))
+                return True
+
+    return False
+
+
+###############################################################################
+#
+# Name : _is_uuid_like
+#
+# Description: This private interface returns a True if the specified value is
+# a valid uuid.
+#
+# Parameters : val is a uuid string
+#
+# Returns : True or False
+#
+###############################################################################
+
+def _is_uuid_like(val):
+    """
+    Return True when 'val' equals its own canonical UUID string form.
+
+    Values that uuid.UUID rejects yield False.
+    """
+    try:
+        canonical = str(uuid.UUID(val))
+    except (TypeError, ValueError, AttributeError):
+        return False
+    return canonical == val
+
+
+###############################################################################
+#
+# Name : config_func
+#
+# Description: The configuration interface this plugin publishes to collectd.
+#
+# collectd calls this interface one time on its process startup
+# when it loads this plugin.
+#
+# There is currently no specific configuration options to parse
+# for this plugin.
+#
+# Parameters : collectd config object
+#
+# Returns : zero
+#
+###############################################################################
+
+def config_func(config):
+    """
+    collectd configuration callback.
+
+    The config object is not parsed ; the call is only logged at debug
+    level. Always returns 0.
+    """
+
+    message = '%s config function' % PLUGIN
+    collectd.debug(message)
+    return 0
+
+
+###############################################################################
+#
+# Name : init_func
+#
+# Description: The initialization interface this plugin publishes to collectd.
+#
+# collectd calls this interface one time on its process startup
+# when it loads this plugin.
+#
+# 1. get hostname
+# 2. build base entity id for the NTP alarm
+# 3. query FM for existing NTP alarms
+# - base alarm is maintained and state loaded if it exists
+# - ntp ip minor alarms are cleared on init. This is done to
+# auto correct ntp server IP address changes over process
+# restart ; avoid stuck alarms.
+#
+# Parameters : None
+#
+# Returns : zero
+#
+###############################################################################
+
+def init_func():
+ """
+ collectd init callback ; also called by read_func while
+ obj.config_complete is still False.
+
+ Returns 0, or 1 when the hostname cannot be determined.
+ """
+
+ # ntp query is for controllers only
+ if tsc.nodetype != 'controller':
return 0
- # TODO: clear the object alarm state
+ # do nothing till config is complete.
+ # init_func will be called again by read_func once config is complete.
+ if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is False:
+ return 0
+
+ # get current hostname
+ obj.hostname = os.uname()[1]
+ if not obj.hostname:
+ collectd.error("%s failed to get hostname" % PLUGIN)
+ return 1
+
+ # per-server alarms use this base eid with '=<ip>' appended
+ obj.base_eid = 'host=' + obj.hostname + '.ntp'
+ collectd.debug("%s on %s with entity id '%s'" %
+ (PLUGIN, obj.hostname, obj.base_eid))
+
+ # get a list of provisioned ntp servers
+ _get_ntp_servers()
+
+ # manage existing alarms.
+ alarms = api.get_faults_by_id(PLUGIN_ALARMID)
+ if alarms:
+ for alarm in alarms:
+ eid = alarm.entity_instance_id
+ # ignore alarms not for this host
+ # NOTE(review): this is a substring test, so a hostname contained in
+ # another host's eid would also match — confirm that cannot happen.
+ if obj.hostname not in eid:
+ continue
+
+ # maintain only the base alarm.
+ if alarm.entity_instance_id != obj.base_eid:
+ # clear any ntp server specific alarms over process restart
+ # this is done to avoid the potential for stuck ntp ip alarms
+ collectd.info("%s clearing found startup alarm '%s'" %
+ (PLUGIN, alarm.entity_instance_id))
+ rc = api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id)
+ if rc is False:
+ # if we can't clear the alarm now then lets load it and
+ # manage it like it just happened. When the server starts
+ # responding then the alarm will get cleared at that time.
+ collectd.error("%s failed to clear alarm %s:%s" %
+ (PLUGIN, PLUGIN_ALARMID,
+ alarm.entity_instance_id))
+
+ # a per-server eid is 'host=<hostname>.ntp=<ip>' ; field [2] of the
+ # '=' split is the ip. An eid with fewer fields raises IndexError.
+ ip = alarm.entity_instance_id.split('=')[2]
+ if ip and ip not in obj.unreachable_servers:
+ _add_unreachable_server(ip)
+ else:
+ obj.alarm_raised = True
+ collectd.info("%s found alarm %s:%s" %
+ (PLUGIN,
+ PLUGIN_ALARMID,
+ alarm.entity_instance_id))
+
+ # ensure the base alarm is cleared if there are no
+ # provisioned servers.
+ if not obj.server_list_conf:
+ _clear_base_alarm()
+
+ else:
+ collectd.info("%s no major startup alarms found" % PLUGIN)
+
+ # from here on read_func stops re-invoking init_func
+ obj.config_complete = True
+
+ return 0
+
+
+###############################################################################
+#
+# Name : read_func
+#
+# Description: The sample read interface this plugin publishes to collectd.
+#
+# collectd calls this interface every audit interval.
+#
+# Runs ntpq -np to query NTP status and manages alarms based on
+# the result.
+#
+# See file header (above) for more specific behavioral detail.
+#
+# Should only run on a controller ; both
+#
+# Parameters : None
+#
+# Returns : zero or non-zero on significant error
+#
+###############################################################################
+
+def read_func():
+ """
+ collectd read callback ; audits NTP server reachability with ntpq,
+ manages the NTP alarms and dispatches the 'reachable' sample.
+
+ Returns 0, or 1 when the deferred init_func call fails or the ntpq
+ query produces no output.
+ """
+
+ # ntp query is for controllers only
+ if tsc.nodetype != 'controller':
+ return 0
+
+ if obj.config_complete is False:
+ if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is False:
+ return 0
+ else:
+ collectd.info("%s controller config complete ; "
+ "invoking init_func" % PLUGIN)
+ if init_func() != 0:
+ return 1
+
+ # get a list of provisioned ntp servers
+ _get_ntp_servers()
+
+ # nothing to do while there are no provisioned NTP servers
+ if len(obj.server_list_conf) == 0:
+ return 0
+
+ # Do NTP Query
+ # NOTE(review): check_output raises (e.g. CalledProcessError on a non-zero
+ # ntpq exit status) and nothing here catches it — confirm that is intended.
+ data = subprocess.check_output([PLUGIN_EXEC, PLUGIN_EXEC_OPTIONS])
+
+ # Keep this FIT test code but make it commented out for security
+ #
+ # if os.path.exists('/var/run/fit/ntpq_data'):
+ # data = ''
+ # collectd.info("%s using ntpq FIT data" % PLUGIN)
+ # with open('/var/run/fit/ntpq_data', 'r') as infile:
+ # for line in infile:
+ # data += line
+
+ if not data:
+ collectd.error("%s no data from query" % PLUGIN)
+ return 1
+
+ # Get the ntp query output into a list of lines
+ # (obj.ntpq is created here ; it is not declared in NtpqObject)
+ obj.ntpq = data.split('\n')
+
+ # keep track of changes ; only log on changes
+ reachable_list_changed = False
+ unreachable_list_changed = False
+
+ # Manage the selected server name
+ #
+ # save the old value so we can print a log if the selected server changes
+ if obj.selected_server:
+ obj.selected_server_save = obj.selected_server
+ # always assume no selected server ; till its learned
+ obj.selected_server = ''
+
+ # start with a fresh empty list for this new run to populate
+ obj.server_list_ntpq = []
+
+ # Loop through the ntpq output.
+ # Ignore the first 2 lines ; just header data.
+ for i in range(2, len(obj.ntpq)):
+
+ # ignore empty or lines that are not long enough
+ if len(obj.ntpq[i]) < 10:
+ continue
+
+ # log the ntpq output ; minus the 2 lines of header
+ collectd.info("NTPQ: %s" % obj.ntpq[i])
+
+ # Unreachable servers are ones whose line start with a space
+ ip = ''
+ if obj.ntpq[i][0] == ' ':
+ # get the ip address
+ # example format of line:['', '132.163.4.102', '', '', '.INIT.',
+ # get ip from index [1] of the list
+ unreachable = obj.ntpq[i].split(' ')[1]
+ if unreachable:
+ # check to see if its a controller ip
+ # we skip over controller ips
+ if _is_controller(unreachable) is False:
+ _add_ip_to_ntpq_server_list(unreachable)
+ if unreachable not in obj.unreachable_servers:
+ if _raise_alarm(unreachable) is False:
+ unreachable_list_changed = True
+ # if the FM call to raise the alarm worked then
+ # add this ip to the unreachable list if its not
+ # already in it
+ # NOTE(review): _raise_alarm already adds the ip on
+ # success, so this call looks redundant — confirm.
+ _add_unreachable_server(unreachable)
+
+ # Reachable servers are ones whose line start with a '+'
+ elif obj.ntpq[i][0] == '+':
+ # remove the '+' and get the ip
+ ip = obj.ntpq[i].split(' ')[0][1:]
+
+ elif obj.ntpq[i][0] == '*':
+ # remove the '*' and get the ip
+ ip = obj.ntpq[i].split(' ')[0][1:]
+ if ip:
+ if _is_controller(ip) is False:
+ if obj.selected_server:
+ # don't update the selected server if more selections
+ # are found. go with the first one found.
+ collectd.info("%s additional selected server found"
+ " '%s'; current selection is '%s'" %
+ (PLUGIN, ip, obj.selected_server))
+ else:
+ # record the selected server
+ obj.selected_server = ip
+ collectd.debug("%s selected server is '%s'" %
+ (PLUGIN, obj.selected_server))
+ else:
+ collectd.debug("%s local controller '%s' marked "
+ "as selected server ; ignoring" %
+ (PLUGIN, ip))
+
+ # anything else is unreachable
+ else:
+ # drop the leading tally character and take the first field as the ip
+ unreachable = obj.ntpq[i][1:].split(' ')[0]
+ if _is_controller(unreachable) is False:
+ _add_ip_to_ntpq_server_list(unreachable)
+ if unreachable not in obj.unreachable_servers:
+ if _raise_alarm(unreachable) is False:
+ unreachable_list_changed = True
+ # if the FM call to raise the alarm worked then
+ # add this ip to the unreachable list if its not
+ # already in it
+ _add_unreachable_server(unreachable)
+
+ # ip is only non-empty for '+' and '*' lines
+ if ip:
+ # if the ip is valid then manage it
+ if _is_controller(ip) is False:
+ _add_ip_to_ntpq_server_list(ip)
+ # add the ip to the reachable servers list
+ # if its not already there
+ if ip not in obj.reachable_servers:
+ obj.reachable_servers.append(ip)
+ reachable_list_changed = True
+ # make sure this IP is no longer in the unreachable
+ # list and that alarms for it are cleared
+ _remove_ip_from_unreachable_list(ip)
+
+ _cleanup_stale_servers()
+
+ if obj.selected_server:
+ if obj.selected_server != obj.selected_server_save:
+ collectd.info("%s selected server changed from '%s' to '%s'" %
+ (PLUGIN,
+ obj.selected_server_save,
+ obj.selected_server))
+ obj.selected_server_save = obj.selected_server
+ if obj.alarm_raised is True:
+ _clear_base_alarm()
+
+ elif obj.alarm_raised is False:
+ collectd.error("%s no selected server" % PLUGIN)
+ # called with no ip ; this is the base alarm
+ if _raise_alarm() is False:
+ obj.selected_server_save = 'None'
+
+ # only log and act on changes
+ if reachable_list_changed is True:
+ if obj.reachable_servers:
+ collectd.info("%s reachable servers: %s" %
+ (PLUGIN, obj.reachable_servers))
+ if obj.alarm_raised is True:
+ if obj.selected_server and obj.reachable_servers:
+ _clear_base_alarm()
+ else:
+ collectd.error("%s no reachable servers" % PLUGIN)
+ _raise_alarm()
+
+ # only log changes
+ if unreachable_list_changed is True:
+ if obj.unreachable_servers:
+ collectd.info("%s unreachable servers: %s" %
+ (PLUGIN, obj.unreachable_servers))
+ else:
+ collectd.info("%s all servers are reachable" % PLUGIN)
+
+ # The sample published to the database is simply the number
+ # of reachable servers if one is selected
+ if not obj.selected_server:
+ sample = 0
+ else:
+ sample = len(obj.reachable_servers)
+
+ # Dispatch usage value to collectd
+ val = collectd.Values(host=obj.hostname)
+ val.plugin = 'ntpq'
+ val.type = 'absolute'
+ val.type_instance = 'reachable'
+ val.dispatch(values=[sample])
return 0
@@ -192,4 +779,4 @@ def read_func():
# register the config, init and read functions
collectd.register_config(config_func)
collectd.register_init(init_func)
-collectd.register_read(read_func)
+collectd.register_read(read_func, interval=PLUGIN_INTERVAL)
diff --git a/monitoring/collectd-extensions/src/python_plugins.conf b/monitoring/collectd-extensions/src/python_plugins.conf
index 8d8c979cc..52aa763d0 100644
--- a/monitoring/collectd-extensions/src/python_plugins.conf
+++ b/monitoring/collectd-extensions/src/python_plugins.conf
@@ -9,12 +9,7 @@ LoadPlugin python
Path "/proc/meminfo"
- # Import "example"
- #
- # Data "1 50"
- #
- # Import "interface"
- # Import "ntpq"
+ Import "ntpq"
LogTraces = true
Encoding "utf-8"