Add network interface monitoring plugin to collectd
This update introduces interface monitoring for oam, mgmt and infra networks as a collectd plugin. The interface plugin runs and queries the new maintenance Link Monitor daemon for Link Model and Information every 10 seconds. The plugin then manages port and interface alarms based on the link model, similar to how rmon did in the past. Severity (Interface and Port levels): Interface alarm - Minor: N/A, Major: one of lag pair is Up, Critical: all Interface ports are Down. Port alarm - Minor: N/A, Major: Physical Link is Down, Critical: N/A. Degrade support for interface monitoring is added to the mtce degrade notifier. Any link down condition results in a host degrade condition, as was the case in rmon. Sample Data: represented as % of total links Up for that network interface. 100 or 100% percent used - all links of interface are up. 50 or 50% percent used - one of lag pair is Up and the other is Down. 0 or 0% percent used - all ports for that network are Down. The plugin documents all of this in its header. This update also 1. Adds the new lmond process to syslog-ng config file. 2. Adds the new lmond process to the mtce patch script. 3. Modifies the cpu, df and memory threshold settings by -1. rmon thresholds were precise whereas collectd requires that the samples cross the thresholds, not just meet them. So for example, in terms of a 90% usage action the threshold needs to be 89.
Test Plan: (WIP but almost complete) PASS: Verify interface plugin startup PASS: Verify interface plugin logging PASS: Verify interface plugin Link Status Query and response handling PASS: Verify monitor, sample storage and grafana display PASS: verify port and interface alarm matches what rmon produced PASS: Verify lmon port config from manifest configured plugin PASS: Verify lmon port config from lmon.conf PASS: Verify single interface failure handling and recovery PASS: Verify lagged interface failure handling and recovery PASS: Verify link loss of lagged interface shared between mgmt and oam (hp380) PASS: Verify network interface failure handling ; single port PASS: Verify network interface degrade handling ; lagged interface PEND: Verify network interface degrade handling ; vlan interface PASS: Verify HTTP request timeout period and handling PASS: Verify link status query failure handling - invalid uri (timeout) PASS: Verify link status query failure handling - missing uri (timeout) PASS: Verify link status query failure handling - status fail PASS: Verify link status query failure handling - bad json resp Change-Id: I2e2dfe6ddfa06a46770245540c7153d330bdf196 Story: 2002823 Task: 28635 Depends-On: https://review.openstack.org/#/c/633264 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
34bc8404f0
commit
e8c9676d98
@ -1,2 +1,2 @@
|
||||
SRC_DIR="files"
|
||||
TIS_PATCH_VER=0
|
||||
TIS_PATCH_VER=1
|
||||
|
@ -107,6 +107,7 @@ destination d_sm { file("/var/log/sm.log"); };
|
||||
destination d_rmon { file("/var/log/rmond.log" template(t_mtc)); };
|
||||
destination d_rmon_notify { file("/var/log/rmond_notify.log" template(t_mtc)); };
|
||||
destination d_pmon { file("/var/log/pmond.log" template(t_mtc)); };
|
||||
destination d_lmon { file("/var/log/lmond.log" template(t_mtc)); };
|
||||
destination d_hostwd { file("/var/log/hostwd.log" template(t_mtc)); };
|
||||
destination d_fsmon { file("/var/log/fsmond.log" template(t_mtc)); };
|
||||
destination d_hwmon { file("/var/log/hwmond.log" template(t_mtc)); };
|
||||
@ -352,6 +353,7 @@ filter f_local7 { facility(local7); };
|
||||
filter f_rmon { facility(local5) and program(rmond); };
|
||||
filter f_rmon_notify { facility(local5) and program(rmon_resource_notify); };
|
||||
filter f_pmon { facility(local5) and program(pmond); };
|
||||
filter f_lmon { facility(local5) and program(lmond); };
|
||||
filter f_hostw { facility(local5) and program(hostwd); };
|
||||
filter f_fsmon { facility(local5) and program(fsmond); };
|
||||
filter f_hwmon { facility(local5) and program(hwmond); };
|
||||
@ -472,6 +474,7 @@ log { source(s_src); filter(f_local3); destination(d_sm); };
|
||||
log { source(s_src); filter(f_rmon); destination(d_rmon); };
|
||||
log { source(s_src); filter(f_rmon_notify); destination(d_rmon_notify); };
|
||||
log { source(s_src); filter(f_pmon); destination(d_pmon); };
|
||||
log { source(s_src); filter(f_lmon); destination(d_lmon); };
|
||||
log { source(s_src); filter(f_hostw); destination(d_hostwd); };
|
||||
log { source(s_src); filter(f_fsmon); destination(d_fsmon); };
|
||||
log { source(s_src); filter(f_hwmon); destination(d_hwmon); };
|
||||
|
@ -5,6 +5,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
|
||||
$PKG_BASE/src/collectd.service \
|
||||
$PKG_BASE/src/fm_notifier.py \
|
||||
$PKG_BASE/src/mtce_notifier.py \
|
||||
$PKG_BASE/src/plugin_common.py \
|
||||
$PKG_BASE/src/python_plugins.conf \
|
||||
$PKG_BASE/src/cpu.py \
|
||||
$PKG_BASE/src/cpu.conf \
|
||||
@ -13,7 +14,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
|
||||
$PKG_BASE/src/df.conf \
|
||||
$PKG_BASE/src/ntpq.py \
|
||||
$PKG_BASE/src/ntpq.conf \
|
||||
$PKG_BASE/src/interface.py \
|
||||
$PKG_BASE/src/interface.conf \
|
||||
$PKG_BASE/src/example.py \
|
||||
$PKG_BASE/src/example.conf"
|
||||
|
||||
TIS_PATCH_VER=6
|
||||
TIS_PATCH_VER=7
|
||||
|
@ -15,12 +15,14 @@ Source2: collectd.conf.pmon
|
||||
# collectd python plugin files - notifiers
|
||||
Source3: fm_notifier.py
|
||||
Source4: mtce_notifier.py
|
||||
Source5: plugin_common.py
|
||||
|
||||
# collectd python plugin files - resource plugins
|
||||
Source11: cpu.py
|
||||
Source12: memory.py
|
||||
Source14: example.py
|
||||
Source15: ntpq.py
|
||||
Source16: interface.py
|
||||
|
||||
# collectd plugin conf files into /etc/collectd.d
|
||||
Source100: python_plugins.conf
|
||||
@ -29,6 +31,7 @@ Source102: memory.conf
|
||||
Source103: df.conf
|
||||
Source104: example.conf
|
||||
Source105: ntpq.conf
|
||||
Source106: interface.conf
|
||||
|
||||
BuildRequires: systemd-devel
|
||||
|
||||
@ -64,12 +67,15 @@ install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir}
|
||||
# collectd python plugin files - notifiers
|
||||
install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir}
|
||||
|
||||
# collectd python plugin files - resource plugins
|
||||
install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
|
||||
|
||||
|
||||
# collectd plugin conf files into /etc/collectd.d
|
||||
install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir}
|
||||
@ -78,6 +84,7 @@ install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
|
||||
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
@ -13,8 +13,8 @@
|
||||
Instance "used"
|
||||
Persist true
|
||||
PersistOK true
|
||||
WarningMax 90.00
|
||||
FailureMax 95.00
|
||||
WarningMax 89.00
|
||||
FailureMax 94.00
|
||||
Hits 2
|
||||
Invert false
|
||||
</Type>
|
||||
|
@ -13,6 +13,7 @@
|
||||
MountPoint "/var/lock"
|
||||
MountPoint "/boot"
|
||||
MountPoint "/scratch"
|
||||
MountPoint "/opt/etcd"
|
||||
MountPoint "/opt/cgcs"
|
||||
MountPoint "/opt/platform"
|
||||
MountPoint "/opt/extension"
|
||||
@ -27,8 +28,8 @@
|
||||
<Plugin "df">
|
||||
<Type "percent_bytes">
|
||||
Instance "used"
|
||||
WarningMax 80.00
|
||||
FailureMax 90.00
|
||||
WarningMax 79.00
|
||||
FailureMax 89.00
|
||||
Persist true
|
||||
PersistOK true
|
||||
Hits 2
|
||||
|
@ -4,8 +4,8 @@
|
||||
Instance "used"
|
||||
Persist true
|
||||
PersistOK true
|
||||
WarningMax 51.00
|
||||
FailureMax 75.00
|
||||
WarningMax 49.00
|
||||
FailureMax 74.00
|
||||
Hits 1
|
||||
Invert false
|
||||
</Type>
|
||||
|
@ -90,6 +90,7 @@ from threading import RLock as Lock
|
||||
from fm_api import constants as fm_constants
|
||||
from fm_api import fm_api
|
||||
import tsconfig.tsconfig as tsc
|
||||
import plugin_common as pc
|
||||
|
||||
# only load influxdb on the controller
|
||||
if tsc.nodetype == 'controller':
|
||||
@ -865,16 +866,19 @@ def _get_base_object(alarm_id):
|
||||
return None
|
||||
|
||||
|
||||
def is_uuid_like(val):
|
||||
"""Returns validation of a value as a UUID.
|
||||
|
||||
For our purposes, a UUID is a canonical form string:
|
||||
aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa
|
||||
def _get_object(alarm_id, eid):
|
||||
"""
|
||||
try:
|
||||
return str(uuid.UUID(val)) == val
|
||||
except (TypeError, ValueError, AttributeError):
|
||||
return False
|
||||
Get the plugin object for the specified alarm id and eid
|
||||
"""
|
||||
|
||||
base_obj = _get_base_object(alarm_id)
|
||||
if len(base_obj.instance_objects):
|
||||
try:
|
||||
return(base_obj.instance_objects[eid])
|
||||
except:
|
||||
collectd.debug("%s %s has no instance objects" %
|
||||
(PLUGIN, base_obj.plugin))
|
||||
return base_obj
|
||||
|
||||
|
||||
def _build_entity_id(plugin, plugin_instance):
|
||||
@ -1530,7 +1534,7 @@ def notifier_func(nObject):
|
||||
suppression=base_obj.suppression)
|
||||
|
||||
alarm_uuid = api.set_fault(fault)
|
||||
if is_uuid_like(alarm_uuid) is False:
|
||||
if pc.is_uuid_like(alarm_uuid) is False:
|
||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
||||
(PLUGIN, base_obj.id, obj.entity_id, alarm_uuid))
|
||||
return 0
|
||||
|
@ -1,11 +1,11 @@
|
||||
<Plugin "threshold">
|
||||
<Plugin "interface">
|
||||
<Type "absolute">
|
||||
Instance "state"
|
||||
<Type "percent">
|
||||
Instance "used"
|
||||
Persist true
|
||||
PersistOK true
|
||||
WarningMin 50
|
||||
FailureMin 0
|
||||
WarningMin 51
|
||||
FailureMin 1
|
||||
# Hits 2
|
||||
Invert false
|
||||
</Type>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -12,8 +12,8 @@
|
||||
Instance "used"
|
||||
Persist true
|
||||
PersistOK true
|
||||
WarningMax 80.00
|
||||
FailureMax 90.00
|
||||
WarningMax 79.00
|
||||
FailureMax 89.00
|
||||
Hits 2
|
||||
Invert false
|
||||
</Type>
|
||||
|
@ -103,7 +103,7 @@ class collectdMtceNotifierObject:
|
||||
PLUGIN__VSWITCH_IFACE,
|
||||
PLUGIN_INTERFACE,
|
||||
PLUGIN__EXAMPLE]
|
||||
self.degrade_list__warning = []
|
||||
self.degrade_list__warning = [PLUGIN_INTERFACE]
|
||||
|
||||
# the running list of resources that require degrade.
|
||||
# a degrade clear message is sent whenever this list is empty.
|
||||
|
255
monitoring/collectd-extensions/src/plugin_common.py
Normal file
255
monitoring/collectd-extensions/src/plugin_common.py
Normal file
@ -0,0 +1,255 @@
|
||||
#
|
||||
# Copyright (c) 2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
############################################################################
|
||||
#
|
||||
# This file contains common collectd plugin constructs and utilities
|
||||
#
|
||||
############################################################################
|
||||
|
||||
import collectd
|
||||
import json
|
||||
import uuid
|
||||
import httplib2
|
||||
import socket
|
||||
import os
|
||||
from fm_api import constants as fm_constants
|
||||
import tsconfig.tsconfig as tsc
|
||||
|
||||
# http request constants
|
||||
PLUGIN_TIMEOUT = 10
|
||||
PLUGIN_HTTP_HEADERS = {'Accept': 'application/json', 'Connection': 'close'}
|
||||
|
||||
MIN_AUDITS_B4_FIRST_QUERY = 2
|
||||
|
||||
|
||||
class PluginObject(object):
    """Common state and helper methods shared by collectd python plugins.

    An instance carries the plugin name, the target url used for http
    queries, the last recorded usage value and a few gate / log-throttle
    controls. The helper methods report failure through their return
    value rather than by raising.
    """

    def __init__(self, plugin, url):
        """Initialize the common plugin state.

        :param plugin: the name of the plugin; used as the log prefix
        :param url: default target url used by make_http_request
        """

        # static variables set in init_func
        self.plugin = plugin             # the name of this plugin
        self.hostname = ''               # the name of this host
        self.port = 0                    # the port number for this plugin

        # dynamic gate variables
        self.config_complete = False     # set to True once config is complete
        self.config_done = False         # set true if config_func completed ok
        self.init_done = False           # set true if init_func completed ok

        # dynamic variables set in read_func
        self.usage = float(0)            # last usage value recorded as float
        self.audits = 0                  # number of audit since init

        # http and json specific variables
        self.url = url                   # target url
        self.jresp = None                # used to store the json response
        self.resp = ''                   # body of the last http response

        # Log controls
        self.config_logged = False       # used to log once the plugin config
        self.error_logged = False        # used to prevent log flooding
        self.log_throttle_count = 0      # used to count throttle logs
        self.INIT_LOG_THROTTLE = 10      # the init log throttle threshold

        collectd.debug("%s Common PluginObject constructor [%s]" %
                       (plugin, url))

    ###########################################################################
    #
    # Name       : init_ready
    #
    # Description: Test for init ready condition
    #
    # Parameters : none (uses self.plugin for logging)
    #
    # Returns    : False if initial config complete is not done
    #              True  if initial config complete is done
    #
    # Updates    : self.log_throttle_count
    #
    ###########################################################################

    def init_ready(self):
        """Test for the system init ready state.

        The state is ready once the file named by
        tsc.INITIAL_CONFIG_COMPLETE_FLAG exists.

        :returns: True when the flag file exists, False otherwise
        """

        if os.path.exists(tsc.INITIAL_CONFIG_COMPLETE_FLAG) is False:
            # only log once every INIT_LOG_THROTTLE + 1 failed checks
            # so that a long wait does not flood the log
            self.log_throttle_count += 1
            if self.log_throttle_count > self.INIT_LOG_THROTTLE:
                collectd.info("%s initialization needs retry" % self.plugin)
                self.log_throttle_count = 0
            return False
        else:
            self.log_throttle_count = 0

        return True

    ###########################################################################
    #
    # Name       : gethostname
    #
    # Description: load the hostname
    #
    # Parameters : none (uses self.plugin for logging)
    #
    # Returns    : Success - hostname
    #              Failure - None
    #
    # Updates    : nothing ; the caller is expected to store the result
    #
    ###########################################################################
    def gethostname(self):
        """Fetch the hostname.

        :returns: the non-empty hostname string reported by
                  socket.gethostname(), or None if the lookup raised or
                  returned an empty value
        """

        # get current hostname
        try:
            hostname = socket.gethostname()
            if hostname:
                return hostname
        except socket.error:
            # only trap lookup failures ; a bare except would also
            # swallow SystemExit and KeyboardInterrupt
            collectd.error("%s failed to get hostname" % self.plugin)

        return None

    ###########################################################################
    #
    # Name       : check_for_fit
    #
    # Description: load FIT data if it is present
    #
    # Fit Format : unit data -> 0 89
    #              - instance 0 value 89
    #              or a single value per line that applies to any unit
    #
    # Parameters : name - name in fit file ; /var/run/fit/<name>_data
    #              unit - the instance number to match in the fit file
    #
    # Returns    : False = a FIT value was loaded into self.usage
    #              True  = no FIT value was applied
    #
    # Updates    : self.usage with FIT value if FIT conditions are present
    #              and apply
    #
    ###########################################################################
    def check_for_fit(self, name, unit):
        """Load FIT data into usage if it exists.

        Each line of /var/run/fit/<name>_data is first parsed as
        '<instance> <value>' ; the value is used when <instance> equals
        unit. A line that does not parse that way is retried as a single
        float value. The first line that yields a value wins.

        :param name: plugin / resource name used to build the fit file name
        :param unit: instance number to match ; compared as int
        :returns: False if self.usage was loaded from the fit file,
                  True if no FIT value was applied (including no file)
        """

        fit_file = '/var/run/fit/' + name + '_data'

        if os.path.exists(fit_file):
            valid = False
            with open(fit_file, 'r') as infile:
                for line in infile:
                    try:
                        inst, val = line.split(' ')
                        if int(unit) == int(inst):
                            self.usage = float(val)
                            valid = True

                    except (ValueError, TypeError):
                        # not in '<instance> <value>' form or unit is
                        # not an integer ; retry as a bare value
                        try:
                            val = float(line)
                            self.usage = float(val)
                            valid = True

                        except ValueError:
                            collectd.error("%s bad FIT data; ignoring" %
                                           self.plugin)

                    if valid is True:
                        # usage goes with %.2f and unit after it ; unit
                        # is logged with %s because it may not be an
                        # integer when the bare value form matched
                        collectd.info("%s %.2f usage (unit %s) (FIT)" %
                                      (self.plugin, self.usage, unit))
                        return False

        return True

    ###########################################################################
    #
    # Name       : make_http_request
    #
    # Description: Issue an http request to the specified URL.
    #              Load and return the response
    #              Handling execution errors
    #
    # Parameters : self as current context.
    #
    #    Optional:
    #
    #             url  - override the default self url with http address to
    #                    issue the get request to.
    #             to   - timeout override
    #             hdrs - override use of the default header list
    #
    # Updates    : self.resp with the response body and
    #              self.jresp with the json decoded response body.
    #
    # Returns    : Error indication (True/False)
    #              True on error
    #              False on success
    #
    ###########################################################################
    def make_http_request(self, url=None, to=None, hdrs=None):
        """Make a blocking HTTP request and store the decoded response.

        :param url: target url ; defaults to self.url
        :param to: timeout passed to httplib2.Http ; defaults to
                   PLUGIN_TIMEOUT
        :param hdrs: request headers ; defaults to PLUGIN_HTTP_HEADERS
        :returns: True if the request or the json decode failed,
                  False on success
        """

        try:

            # handle timeout override
            if to is None:
                to = PLUGIN_TIMEOUT

            # handle url override
            if url is None:
                url = self.url

            # handle header override
            if hdrs is None:
                hdrs = PLUGIN_HTTP_HEADERS

            http = httplib2.Http(timeout=to)
            resp = http.request(url, headers=hdrs)

        except Exception as ex:
            collectd.info("%s http request failure (%s)" %
                          (self.plugin, str(ex)))
            return True

        try:
            # resp[1] is the second element of the http.request result ;
            # it is stored raw and then decoded as json
            collectd.debug("%s Resp: %s" %
                           (self.plugin, resp[1]))

            self.resp = resp[1]
            self.jresp = json.loads(resp[1])

        except Exception as ex:
            collectd.info("%s http request parse failure (%s) (%s)" %
                          (self.plugin, str(ex), resp))
            return True
        return False
|
||||
|
||||
|
||||
def is_uuid_like(val):
    """Report whether val is a UUID in canonical string form.

    The canonical form is the lower case, hyphenated string

        aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa

    :param val: the value to validate
    :returns: True if val equals its own canonical UUID rendering,
              False if it differs or cannot be parsed as a UUID
    """
    try:
        canonical = str(uuid.UUID(val))
    except (TypeError, ValueError, AttributeError):
        # not parseable as a UUID at all
        return False
    return canonical == val
|
||||
|
||||
|
||||
def get_severity_str(severity):
    """Map an FM alarm severity constant to its short lower case name.

    :param severity: value compared against the
                     fm_constants.FM_ALARM_SEVERITY_* constants
    :returns: "clear", "critical", "major" or "minor" for the matching
              constant, "unknown" for anything else
    """

    # checked in order with == so the first matching constant wins
    severity_names = (
        (fm_constants.FM_ALARM_SEVERITY_CLEAR, "clear"),
        (fm_constants.FM_ALARM_SEVERITY_CRITICAL, "critical"),
        (fm_constants.FM_ALARM_SEVERITY_MAJOR, "major"),
        (fm_constants.FM_ALARM_SEVERITY_MINOR, "minor"),
    )

    for known_severity, label in severity_names:
        if severity == known_severity:
            return label

    return "unknown"
|
@ -10,6 +10,10 @@ LoadPlugin python
|
||||
Path "/proc/meminfo"
|
||||
</Module>
|
||||
Import "ntpq"
|
||||
Import "interface"
|
||||
<Module "interface">
|
||||
Port 2122
|
||||
</Module>
|
||||
LogTraces = true
|
||||
Encoding "utf-8"
|
||||
</Plugin>
|
||||
|
@ -1,4 +1,4 @@
|
||||
SRC_DIR="platform-util"
|
||||
COPY_LIST_TO_TAR="scripts"
|
||||
|
||||
TIS_PATCH_VER=15
|
||||
TIS_PATCH_VER=16
|
||||
|
@ -131,6 +131,9 @@ do
|
||||
"mtcalarmd")
|
||||
pmon_managed_processes=(${pmon_managed_processes[@]} "mtcalarmd:0")
|
||||
;;
|
||||
"lmond")
|
||||
pmon_managed_processes=(${pmon_managed_processes[@]} "lmond:0")
|
||||
;;
|
||||
|
||||
*)
|
||||
loginfo "Unknown process:${process}"
|
||||
|
Loading…
Reference in New Issue
Block a user