439079693d
This updates the Nagios image tag to include the updated plugin for querying Elasticsearch for alerting on logged events. Change-Id: Idd61d82463b79baab0e94c20b32da1dc6a8b3634
1190 lines
59 KiB
YAML
1190 lines
59 KiB
YAML
# Copyright 2017 The Openstack-Helm Authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Default values for nagios.
|
|
# This is a YAML-formatted file.
|
|
# Declare variables to be passed into your templates.
|
|
|
|
# Container image references, keyed by component name.
images:
  tags:
    apache_proxy: docker.io/httpd:2.4
    nagios: quay.io/attcomdev/nagios:410fcb08d2586e98e18ced317dab4157eb27456e
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
  # Local registry use is off by default. The images listed under `exclude`
  # are presumably never redirected to it -- confirm against helm-toolkit.
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync
|
|
|
|
# Node selector key/value pairs for the nagios workload and for jobs.
labels:
  nagios:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
  job:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
|
|
|
|
# Jobs and service endpoints that components depend on.
# NOTE(review): nesting is reconstructed from key order; confirm that
# `jobs`/`services` sit directly under `dynamic.common` in this chart version.
dependencies:
  dynamic:
    common:
      jobs:
        - nagios-image-repo-sync
      services:
        - service: local_image_registry
          endpoint: node
  static:
    image_repo_sync:
      services:
        - service: local_image_registry
          endpoint: internal
    nagios:
      services: null
|
|
|
|
# Names of the secrets holding the Nagios admin credentials and the
# TLS material for the public Nagios endpoint.
secrets:
  nagios:
    admin: nagios-admin-creds
  tls:
    nagios:
      nagios:
        public: nagios-tls-public
|
|
|
|
# Endpoint definitions (hosts, ports, schemes, paths, credentials) for Nagios
# and for the services referenced elsewhere in this file.
# NOTE(review): the username/password values below are placeholder defaults
# committed in plain text; override them per deployment.
endpoints:
  cluster_domain_suffix: cluster.local
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000
  monitoring:
    name: prometheus
    auth:
      admin:
        username: admin
        password: changeme
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: http
    port:
      http:
        default: 80
  nagios:
    name: nagios
    namespace: null
    auth:
      admin:
        username: nagiosadmin
        password: password
    hosts:
      default: nagios-metrics
      public: nagios
    host_fqdn_override:
      default: null
      # NOTE(srwilkers): this chart supports TLS for fqdn over-ridden public
      # endpoints using the following format:
      # public:
      #   host: null
      #   tls:
      #     crt: null
      #     key: null
    path:
      default: null
    scheme:
      default: http
    port:
      nagios:
        default: 8000
      http:
        default: 80
  ldap:
    hosts:
      default: ldap
    auth:
      admin:
        bind: "cn=admin,dc=cluster,dc=local"
        password: password
    host_fqdn_override:
      default: null
    path:
      default: "/ou=People,dc=cluster,dc=local"
    scheme:
      default: ldap
    port:
      ldap:
        default: 389
  elasticsearch:
    name: elasticsearch
    namespace: null
    auth:
      admin:
        username: admin
        password: changeme
    hosts:
      default: elasticsearch-logging
    host_fqdn_override:
      default: null
    path:
      default: /
    scheme:
      default: http
    port:
      http:
        default: 80
  ceph_mgr:
    namespace: null
    hosts:
      default: ceph-mgr
    host_fqdn_override:
      default: null
    port:
      mgr:
        default: 7000
      metrics:
        default: 9283
    scheme:
      default: http
|
|
|
|
# Ingress and NodePort exposure settings for Nagios.
network:
  nagios:
    ingress:
      public: true
      classes:
        namespace: "nginx"
        cluster: "nginx-cluster"
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /
        nginx.ingress.kubernetes.io/affinity: cookie
        nginx.ingress.kubernetes.io/session-cookie-name: kube-ingress-session-nagios
        nginx.ingress.kubernetes.io/session-cookie-hash: sha1
    # NodePort exposure is disabled by default.
    node_port:
      enabled: false
      port: 30925
|
|
|
|
# Pod-level settings: upgrade strategy, termination grace period, replica
# count and per-container resource requests/limits.
pod:
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      nagios:
        timeout: 30
  replicas:
    nagios: 1
  # `enabled: false` -- presumably the requests/limits below are only applied
  # when this flag is set to true; confirm in the templates.
  resources:
    enabled: false
    nagios:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "100m"
    apache_proxy:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "100m"
    jobs:
      image_repo_sync:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"
|
|
|
|
# Per-manifest on/off switches; only network_policy is off by default.
manifests:
  configmap_bin: true
  configmap_etc: true
  deployment: true
  ingress: true
  job_image_repo_sync: true
  network_policy: false
  secret_nagios: true
  secret_ingress_tls: true
  service: true
  service_ingress: true
|
|
|
|
conf:
|
|
# Apache httpd configuration for the proxy in front of Nagios, kept as a
# literal block. It contains Go-template expressions ({{ ... }}), so it is
# presumably rendered through `tpl` before use -- confirm in the configmap
# template. The proxy listens on port 80, forwards to Nagios on localhost and
# requires basic auth backed by an htpasswd file and LDAP.
httpd: |
  ServerRoot "/usr/local/apache2"

  Listen 80

  LoadModule mpm_event_module modules/mod_mpm_event.so
  LoadModule authn_file_module modules/mod_authn_file.so
  LoadModule authn_core_module modules/mod_authn_core.so
  LoadModule authz_host_module modules/mod_authz_host.so
  LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
  LoadModule authz_user_module modules/mod_authz_user.so
  LoadModule authz_core_module modules/mod_authz_core.so
  LoadModule access_compat_module modules/mod_access_compat.so
  LoadModule auth_basic_module modules/mod_auth_basic.so
  LoadModule ldap_module modules/mod_ldap.so
  LoadModule authnz_ldap_module modules/mod_authnz_ldap.so
  LoadModule reqtimeout_module modules/mod_reqtimeout.so
  LoadModule filter_module modules/mod_filter.so
  LoadModule proxy_html_module modules/mod_proxy_html.so
  LoadModule log_config_module modules/mod_log_config.so
  LoadModule env_module modules/mod_env.so
  LoadModule headers_module modules/mod_headers.so
  LoadModule setenvif_module modules/mod_setenvif.so
  LoadModule version_module modules/mod_version.so
  LoadModule proxy_module modules/mod_proxy.so
  LoadModule proxy_connect_module modules/mod_proxy_connect.so
  LoadModule proxy_http_module modules/mod_proxy_http.so
  LoadModule proxy_balancer_module modules/mod_proxy_balancer.so
  LoadModule slotmem_shm_module modules/mod_slotmem_shm.so
  LoadModule slotmem_plain_module modules/mod_slotmem_plain.so
  LoadModule unixd_module modules/mod_unixd.so
  LoadModule status_module modules/mod_status.so
  LoadModule autoindex_module modules/mod_autoindex.so

  <IfModule unixd_module>
    User daemon
    Group daemon
  </IfModule>

  <Directory />
    AllowOverride none
    Require all denied
  </Directory>

  <Files ".ht*">
    Require all denied
  </Files>

  ErrorLog /dev/stderr

  LogLevel warn

  <IfModule log_config_module>
    LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
    LogFormat "%h %l %u %t \"%r\" %>s %b" common

    <IfModule logio_module>
      LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio
    </IfModule>

    CustomLog /dev/stdout common

    CustomLog /dev/stdout combined
  </IfModule>

  <Directory "/usr/local/apache2/cgi-bin">
    AllowOverride None
    Options None
    Require all granted
  </Directory>

  <IfModule headers_module>
    RequestHeader unset Proxy early
  </IfModule>

  <IfModule proxy_html_module>
    Include conf/extra/proxy-html.conf
  </IfModule>

  <VirtualHost *:80>
    <Location />
      ProxyPass http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
      ProxyPassReverse http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
    </Location>
    <Proxy *>
      AuthName "Nagios"
      AuthType Basic
      AuthBasicProvider file ldap
      AuthUserFile /usr/local/apache2/conf/.htpasswd
      AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
      AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
      AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
      Require valid-user
    </Proxy>
  </VirtualHost>
|
|
nagios:
|
|
# Nagios contact definitions. `notifying_contact` is a template
# (register: 0) that the SNMP and HTTP contacts inherit from via `use`.
contacts:
  - notifying_contact:
      name: notifying_contact
      contact_name: notifying_contact
      alias: notifying contact
      service_notification_period: 24x7
      host_notification_period: 24x7
      service_notification_options: w,u,c,r,f,s
      host_notification_options: d,u,r,f,s
      register: 0
  - snmp_notifying_contact:
      use: notifying_contact
      name: snmp_notifying_contact
      contact_name: snmp_notifying_contact
      alias: snmp contact
      service_notification_commands: send_service_snmp_trap
      host_notification_commands: send_host_snmp_trap
  - http_notifying_contact:
      use: notifying_contact
      name: http_notifying_contact
      contact_name: http_notifying_contact
      alias: HTTP contact
      service_notification_commands: send_service_http_post
      host_notification_commands: send_host_http_post
|
|
# Contact group combining the SNMP and HTTP notifying contacts.
contactgroups:
  - snmp_and_http_notifying_contact_group:
      contactgroup_name: snmp_and_http_notifying_contact_group
      alias: SNMP and HTTP notifying group
      members: snmp_notifying_contact,http_notifying_contact
|
|
# Statically defined Nagios hosts. The single `prometheus` host is addressed
# at 127.0.0.1 and checked with check-prometheus-host-alive.
hosts:
  - prometheus:
      use: linux-server
      host_name: prometheus
      alias: "Prometheus Monitoring"
      address: 127.0.0.1
      hostgroups: prometheus-hosts
      check_command: check-prometheus-host-alive
|
|
# Nagios host groups referenced by the host and service definitions.
host_groups:
  - prometheus-hosts:
      hostgroup_name: prometheus-hosts
      alias: "Prometheus Virtual Host"
  - all:
      hostgroup_name: all
      alias: "all"
  - base-os:
      hostgroup_name: base-os
      alias: "base-os"
|
|
# Nagios command definitions. Each list item is a single-key mapping whose
# value carries `command_name` and `command_line`.
# The $USERn$ macros are not defined in this section; they are presumably set
# in the Nagios resource file -- confirm there.
commands:
  # Notification commands (SNMP trap and HTTP POST).
  - send_service_snmp_trap:
      command_name: send_service_snmp_trap
      command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'"
  - send_host_snmp_trap:
      command_name: send_host_snmp_trap
      command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
  - send_service_http_post:
      command_name: send_service_http_post
      command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
  - send_host_http_post:
      command_name: send_host_http_post
      command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
  # Generic check commands; $ARGn$ values are supplied by the service's
  # check_command as `!`-separated arguments.
  - check_prometheus_host_alive:
      command_name: check-prometheus-host-alive
      command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
  # Arguments: alert name, label matchers (CSV), message format, OK message.
  - check_prom_alert_with_labels:
      command_name: check_prom_alert_with_labels
      command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
  # Arguments: alert name, message format, OK message.
  - check_prom_alert:
      command_name: check_prom_alert
      command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
  - check_es_alert:
      command_name: check_es_alert
      command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
  # Per-host checks: each queries one Prometheus alert, filtered to the
  # instance whose label starts with the host's address.
  - check_filespace_mounts-usage-rate-fullin4hrs:
      command_name: check_filespace_mounts-usage-rate-fullin4hrs
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
  - check_filespace_mounts-usage:
      command_name: check_filespace_mounts-usage
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 percent full' --ok_message 'OK- All mountpoints usage is normal'
  - check_node_loadavg:
      command_name: check_node_loadavg
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the past hour' --ok_message 'OK- Node load average is normal'
  - check_node_cpu_util:
      command_name: check_node_cpu_util
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the past hour' --ok_message 'OK- Node cpu utilization is normal'
  # The message threshold matches the 80 percent named in the alert.
  - check_network_connections:
      command_name: check_network_connections
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 80% in use' --ok_message 'OK- Network connection utilization is normal'
  - check_memory_usage:
      command_name: check_memory_usage
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
  - check_disk_write_latency:
      command_name: check_disk_write_latency
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
  - check_disk_read_latency:
      command_name: check_disk_read_latency
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
  - check_entropy_availability:
      command_name: check_entropy_availability
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
  - check_filedescriptor_usage_rate:
      command_name: check_filedescriptor_usage_rate
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
  - check_hwmon_high_cpu_temp:
      command_name: check_hwmon_high_cpu_temp
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
  - check_network_receive_drop_high:
      command_name: check_network_receive_drop_high
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
  - check_network_transmit_drop_high:
      command_name: check_network_transmit_drop_high
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network transmission.' --ok_message 'OK- network packet transmit drops not high.'
  # NOTE(review): the two error-rate checks query dedicated error alerts so
  # they do not merely repeat the transmit-drop check. Confirm that
  # `high_network_errs_rcv` / `high_network_errs_send` match the alert names
  # defined in the Prometheus rules.
  - check_network_receive_errors_high:
      command_name: check_network_receive_errors_high
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
  - check_network_transmit_errors_high:
      command_name: check_network_transmit_errors_high
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
  - check_vmstat_paging_rate:
      command_name: check_vmstat_paging_rate
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
  - check_xfs_block_allocation:
      command_name: check_xfs_block_allocation
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
  - check_network_bond_status:
      command_name: check_network_bond_status
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
  - check_numa_memory_usage:
      command_name: check_numa_memory_usage
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
  - check_ntp_sync:
      command_name: check_ntp_sync
      command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
  # Reads ceph_health_status from the exporter at $USER10$:
  # 1 maps to warning, 2 to critical.
  - check_ceph_health:
      command_name: check_ceph_health
      command_line: $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1
  - check_prometheus_hosts:
      command_name: check_prometheus_hosts
      command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
|
|
# Nagios service definitions. Each list item is a single-key mapping.
# `check_command` is `<command>!<arg1>!<arg2>...`; for check_prom_alert the
# arguments are alert name, message format and OK message.
services:
  # Template (register: 0) inherited by the checks below via `use`.
  - notifying_service:
      name: notifying_service
      use: generic-service
      flap_detection_enabled: 0
      process_perf_data: 0
      contact_groups: snmp_and_http_notifying_contact_group
      check_interval: 60
      notification_interval: 120
      retry_interval: 30
      register: 0
  - check_ceph_health:
      use: notifying_service
      hostgroup_name: base-os
      service_description: "CEPH_health"
      check_command: check_ceph_health
      check_interval: 300
  # NOTE(review): this check uses generic-service rather than
  # notifying_service and passes no OK message (third argument) to
  # check_prom_alert -- confirm both are intended.
  - check_hosts_health:
      use: generic-service
      hostgroup_name: prometheus-hosts
      service_description: "Nodes_health"
      check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.
      check_interval: 60
  # NOTE(review): the two checks below pass three arguments to
  # check_prom_alert_with_labels, whose command line consumes four
  # ($ARG4$ is the OK message) -- confirm the empty OK message is intended.
  - check_prometheus_replicas:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Prometheus_replica-count"
      check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
      check_interval: 60
  - check_alertmanager_replicas:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "PrometheusAlertmanager_replica-count"
      check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
      check_interval: 60
  - check_statefulset_replicas:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Statefulset_replica-count"
      check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
      check_interval: 60
  - check_daemonset_misscheduled:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Daemonset_misscheduled"
      check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheduled!OK- No daemonset misscheduling detected
      check_interval: 60
  - check_daemonset_not-scheduled:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Daemonset_not-scheduled"
      check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
      check_interval: 60
  - check_deployment_replicas_unavailable:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Deployment_replicas-unavailable"
      check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
      check_interval: 60
  - check_volume_claim_high_utilization:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Volume_claim_high_utilization"
      check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization
      check_interval: 60
  - check_deployment_rollingupdate_replicas_unavailable:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "RollingUpdate_Deployment-replicas-unavailable"
      check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
      check_interval: 60
  - check_job_status_failed:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Job_status-failed"
      check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
      check_interval: 60
  - check_pod_status_pending:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Pod_status-pending"
      check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
      check_interval: 60
  - check_pod_status_error_image_pull:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Pod_status-error-image-pull"
      check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
      check_interval: 60
  - check_pod_error_crash_loop_back_off:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Pod_status-crashLoopBackOff"
      check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
      check_interval: 60
  - check_replicaset_missing_replicas:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Replicaset_missing-replicas"
      check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
      check_interval: 60
  - check_pod_container_terminated:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Pod_status-container-terminated"
      check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
      check_interval: 60
  # OpenStack API and agent availability checks.
  - check_glance_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_glance"
      check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
      check_interval: 60
  - check_nova_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_nova"
      check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
      check_interval: 60
  - check_keystone_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_keystone"
      check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
      check_interval: 60
  - check_neutron_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_neutron"
      check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
      check_interval: 60
  - check_neutron_metadata_agent:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_neutron-metadata-agent"
      check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
      check_interval: 60
  - check_neutron_openvswitch_agent:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_neutron-openvswitch-agent"
      check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
      check_interval: 60
  - check_neutron_dhcp_agent:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_neutron-dhcp-agent"
      check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
      check_interval: 60
  - check_neutron_l3_agent:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_neutron-l3-agent"
      check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
      check_interval: 60
  - check_swift_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_swift"
      check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
      check_interval: 60
  - check_heat_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_heat"
      check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
      check_interval: 60
  - check_cinder_api:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "API_cinder"
      check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
      check_interval: 60
  - check_service_cinder_scheduler:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_cinder-scheduler"
      check_command: check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
      check_interval: 60
  - check_service_nova_compute:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_nova-compute"
      check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
      check_interval: 60
  - check_service_nova_conductor:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_nova-conductor"
      check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
      check_interval: 60
  - check_service_nova_consoleauth:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_nova-consoleauth"
      check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
      check_interval: 60
  - check_service_nova_scheduler:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "Service_nova-scheduler"
      check_command: check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
      check_interval: 60
  # OpenStack VM quota usage checks.
  - check_os_vm_vcpu_usage:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "OS-Total-Quota_VCPU-usage"
      check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
      check_interval: 60
  - check_os_vm_ram_usage:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "OS-Total-Quota_RAM-usage"
      check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
      check_interval: 60
  - check_os_vm_disk_usage:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "OS-Total-Quota_Disk-usage"
      check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
      check_interval: 60
  # Ceph cluster checks driven by Prometheus alerts.
  - check_ceph_monitor_quorum:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "CEPH_quorum"
      check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
      check_interval: 60
  - check_ceph_storage_usage:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "CEPH_storage-usage"
      check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
      check_interval: 60
  - check_ceph_pgs_degradation:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "CEPH_PGs-degradation"
      check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
      check_interval: 60
  - check_ceph_osds_down:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "CEPH_OSDs-down"
      check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
      check_interval: 60
  - check_ceph_monitor_clock_skew:
      use: notifying_service
      hostgroup_name: prometheus-hosts
      service_description: "CEPH_Clock-skew"
      check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
      check_interval: 60
|
|
- check_fluentd_up:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Fluentd_status"
|
|
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes
|
|
check_interval: 60
|
|
- check_etcd_high_http_deletes_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-delete-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
|
|
check_interval: 60
|
|
- check_etcd_high_http_get_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-get-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
|
|
check_interval: 60
|
|
- check_etcd_high_http_updates_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-update-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
|
|
check_interval: 60
|
|
- check_felix_iptables_save_errors:
|
|
use: notifying_service
|
|
service_description: Calico_iptables-save-errors
|
|
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_ipset_errors:
|
|
use: notifying_service
|
|
service_description: Calico_ipset-errors
|
|
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_iface_msg_batch_size:
|
|
use: notifying_service
|
|
service_description: Calico_interface-message-batch-size
|
|
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size is low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_addr_msg_batch_size:
|
|
use: notifying_service
|
|
service_description: Calico_address-message-batch-size
|
|
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size is low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_failures:
|
|
use: notifying_service
|
|
service_description: Calico_datapane_failures_high
|
|
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- dataplane failures are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_network_partitions_detected:
|
|
use: generic-service
|
|
service_description: Rabbitmq_network-partitions-exist
|
|
check_command: check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_available:
|
|
use: generic-service
|
|
service_description: Rabbitmq_up
|
|
check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_fd_usage:
|
|
use: generic-service
|
|
service_description: Rabbitmq_file-descriptor-usage
|
|
check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_node_disk_alarm:
|
|
use: generic-service
|
|
service_description: Rabbitmq_node-disk-alarm
|
|
check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_node_memory_alarm:
|
|
use: generic-service
|
|
service_description: Rabbitmq_node-memory-alarm
|
|
check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_availability:
|
|
use: generic-service
|
|
service_description: Rabbitmq_high-availability
|
|
check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_message_return_percent:
|
|
use: generic-service
|
|
service_description: Rabbitmq_message-return-percent
|
|
check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_consumer_util:
|
|
use: generic-service
|
|
service_description: Rabbitmq_consumer-utilization
|
|
check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_load:
|
|
use: generic-service
|
|
service_description: Rabbitmq_rabbitmq-queue-health
|
|
check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_high_process_open_file_count:
|
|
use: generic-service
|
|
service_description: ES_high-process-open-file-count
|
|
check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_high_process_cpu_percent:
|
|
use: generic-service
|
|
service_description: ES_high-process-cpu-percent
|
|
check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_fs_usage:
|
|
use: generic-service
|
|
service_description: ES_high-filesystem-usage
|
|
check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_unassigned_shards:
|
|
use: generic-service
|
|
service_description: ES_unassigned-shards
|
|
check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_health_timedout:
|
|
use: generic-service
|
|
service_description: ES_cluster-health-timedout
|
|
check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timed out!OK- Elasticsearch cluster health is retrievable.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_health_status:
|
|
use: generic-service
|
|
service_description: ES_cluster-health-status
|
|
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_number_nodes_running:
|
|
use: generic-service
|
|
service_description: ES_cluster-running-node-count
|
|
check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_number_data_nodes_running:
|
|
use: generic-service
|
|
service_description: ES_cluster-running-data-node-count
|
|
check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_table_lock_waits:
|
|
use: generic-service
|
|
service_description: Mariadb_table-lock-waits-high
|
|
check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_node_ready:
|
|
use: generic-service
|
|
service_description: Mariadb_node-ready
|
|
check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_node_out_of_sync:
|
|
use: generic-service
|
|
service_description: Mariadb_node-synchronized
|
|
check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_innodb_replication_lag:
|
|
use: generic-service
|
|
service_description: Mariadb_innodb-replication-lag
|
|
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prometheus_hosts:
|
|
use: notifying_service
|
|
service_description: Prometheus_hosts-update
|
|
check_command: check_prometheus_hosts
|
|
hostgroup_name: prometheus-hosts
|
|
check_interval: 900
|
|
- check_postgresql_replication_lag:
|
|
use: generic-service
|
|
service_description: Postgresql_replication-lag
|
|
check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_postgresql_connections:
|
|
use: generic-service
|
|
service_description: Postgresql_connections
|
|
check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_postgresql_deadlocks:
|
|
use: generic-service
|
|
service_description: Postgresql_deadlocks
|
|
check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_ceph:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_CEPH
|
|
check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_openstack:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Openstack
|
|
check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_mariadb:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_MariaDB
|
|
check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_kube_state_metrics:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Kube-state-metrics
|
|
check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_postgresql:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Postgresql
|
|
check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_node:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Node
|
|
check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_calico:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Calico
|
|
check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_elasticsearch:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Elasticsearch
|
|
check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_fluentd:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Fluentd
|
|
check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
|
use: notifying_service
|
|
hostgroup_name: base-os
|
|
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
|
|
check_command: check_filespace_mounts-usage-rate-fullin4hrs
|
|
check_interval: 60
|
|
- check_filespace_mounts-usage:
|
|
use: notifying_service
|
|
hostgroup_name: base-os
|
|
service_description: "Filespace_mounts-usage"
|
|
check_command: check_filespace_mounts-usage
|
|
check_interval: 60
|
|
- check_node_loadavg:
|
|
use: notifying_service
|
|
service_description: CPU_Load-average
|
|
check_command: check_node_loadavg
|
|
hostgroup_name: base-os
|
|
- check_node_cpu_util:
|
|
use: notifying_service
|
|
service_description: CPU_utilization
|
|
check_command: check_node_cpu_util
|
|
hostgroup_name: base-os
|
|
- check_network_connections:
|
|
use: notifying_service
|
|
service_description: Network_connections
|
|
check_command: check_network_connections
|
|
hostgroup_name: base-os
|
|
- check_memory_usage:
|
|
use: notifying_service
|
|
service_description: Memory_usage
|
|
check_command: check_memory_usage
|
|
hostgroup_name: base-os
|
|
- check_disk_write_latency:
|
|
use: notifying_service
|
|
service_description: Disk_write-latency
|
|
check_command: check_disk_write_latency
|
|
hostgroup_name: base-os
|
|
- check_disk_read_latency:
|
|
use: notifying_service
|
|
service_description: Disk_read-latency
|
|
check_command: check_disk_read_latency
|
|
hostgroup_name: base-os
|
|
- check_entropy_availability:
|
|
use: notifying_service
|
|
service_description: Entropy_availability
|
|
check_command: check_entropy_availability
|
|
hostgroup_name: base-os
|
|
- check_filedescriptor_usage_rate:
|
|
use: notifying_service
|
|
service_description: FileDescriptors_usage-rate-high
|
|
check_command: check_filedescriptor_usage_rate
|
|
hostgroup_name: base-os
|
|
- check_hwmon_high_cpu_temp:
|
|
use: notifying_service
|
|
service_description: HW_cpu-temp-high
|
|
check_command: check_hwmon_high_cpu_temp
|
|
hostgroup_name: base-os
|
|
- check_network_receive_drop_high:
|
|
use: notifying_service
|
|
service_description: Network_receive-drop-high
|
|
check_command: check_network_receive_drop_high
|
|
hostgroup_name: base-os
|
|
- check_network_transmit_drop_high:
|
|
use: notifying_service
|
|
service_description: Network_transmit-drop-high
|
|
check_command: check_network_transmit_drop_high
|
|
hostgroup_name: base-os
|
|
- check_network_receive_errors_high:
|
|
use: notifying_service
|
|
service_description: Network_receive-errors-high
|
|
check_command: check_network_receive_errors_high
|
|
hostgroup_name: base-os
|
|
- check_network_transmit_errors_high:
|
|
use: notifying_service
|
|
service_description: Network_transmit-errors-high
|
|
check_command: check_network_transmit_errors_high
|
|
hostgroup_name: base-os
|
|
- check_vmstat_paging_rate:
|
|
use: notifying_service
|
|
service_description: Memory_vmstat-paging-rate
|
|
check_command: check_vmstat_paging_rate
|
|
hostgroup_name: base-os
|
|
- check_xfs_block_allocation:
|
|
use: notifying_service
|
|
service_description: XFS_block-allocation
|
|
check_command: check_xfs_block_allocation
|
|
hostgroup_name: base-os
|
|
- check_network_bond_status:
|
|
use: notifying_service
|
|
service_description: Network_bondstatus
|
|
check_command: check_network_bond_status
|
|
hostgroup_name: base-os
|
|
- check_numa_memory_usage:
|
|
use: notifying_service
|
|
service_description: Memory_NUMA-usage
|
|
check_command: check_numa_memory_usage
|
|
hostgroup_name: base-os
|
|
- check_ntp_sync:
|
|
use: notifying_service
|
|
service_description: NTP_sync
|
|
check_command: check_ntp_sync
|
|
hostgroup_name: base-os
|
|
nagios:
|
|
log_file: /opt/nagios/var/log/nagios.log
|
|
cfg_file:
|
|
- /opt/nagios/etc/nagios_objects.cfg
|
|
- /opt/nagios/etc/objects/commands.cfg
|
|
- /opt/nagios/etc/objects/contacts.cfg
|
|
- /opt/nagios/etc/objects/timeperiods.cfg
|
|
- /opt/nagios/etc/objects/templates.cfg
|
|
- /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
|
|
object_cache_file: /opt/nagios/var/objects.cache
|
|
precached_object_file: /opt/nagios/var/objects.precache
|
|
resource_file: /opt/nagios/etc/resource.cfg
|
|
status_file: /opt/nagios/var/status.dat
|
|
status_update_interval: 10
|
|
nagios_user: nagios
|
|
nagios_group: nagios
|
|
check_external_commands: 1
|
|
command_file: /opt/nagios/var/rw/nagios.cmd
|
|
lock_file: /var/run/nagios.lock
|
|
temp_file: /opt/nagios/var/nagios.tmp
|
|
temp_path: /tmp
|
|
event_broker_options: -1
|
|
log_rotation_method: d
|
|
log_archive_path: /opt/nagios/var/log/archives
|
|
use_syslog: 1
|
|
log_service_retries: 1
|
|
log_host_retries: 1
|
|
log_event_handlers: 1
|
|
log_initial_states: 0
|
|
log_current_states: 1
|
|
log_external_commands: 1
|
|
log_passive_checks: 1
|
|
service_inter_check_delay_method: s
|
|
max_service_check_spread: 30
|
|
service_interleave_factor: s
|
|
host_inter_check_delay_method: s
|
|
max_host_check_spread: 30
|
|
max_concurrent_checks: 60
|
|
check_result_reaper_frequency: 10
|
|
max_check_result_reaper_time: 30
|
|
check_result_path: /opt/nagios/var/spool/checkresults
|
|
max_check_result_file_age: 3600
|
|
cached_host_check_horizon: 15
|
|
cached_service_check_horizon: 15
|
|
enable_predictive_host_dependency_checks: 1
|
|
enable_predictive_service_dependency_checks: 1
|
|
soft_state_dependencies: 0
|
|
auto_reschedule_checks: 0
|
|
auto_rescheduling_interval: 30
|
|
auto_rescheduling_window: 180
|
|
service_check_timeout: 60
|
|
host_check_timeout: 60
|
|
event_handler_timeout: 60
|
|
notification_timeout: 60
|
|
ocsp_timeout: 5
|
|
perfdata_timeout: 5
|
|
retain_state_information: 1
|
|
state_retention_file: /opt/nagios/var/retention.dat
|
|
retention_update_interval: 60
|
|
use_retained_program_state: 1
|
|
use_retained_scheduling_info: 1
|
|
retained_host_attribute_mask: 0
|
|
retained_service_attribute_mask: 0
|
|
retained_process_host_attribute_mask: 0
|
|
retained_process_service_attribute_mask: 0
|
|
retained_contact_host_attribute_mask: 0
|
|
retained_contact_service_attribute_mask: 0
|
|
interval_length: 1
|
|
check_workers: 4
|
|
check_for_updates: 1
|
|
bare_update_check: 0
|
|
use_aggressive_host_checking: 0
|
|
execute_service_checks: 1
|
|
accept_passive_service_checks: 1
|
|
execute_host_checks: 1
|
|
accept_passive_host_checks: 1
|
|
enable_notifications: 1
|
|
enable_event_handlers: 1
|
|
process_performance_data: 0
|
|
obsess_over_services: 0
|
|
obsess_over_hosts: 0
|
|
translate_passive_host_checks: 0
|
|
passive_host_checks_are_soft: 0
|
|
check_for_orphaned_services: 1
|
|
check_for_orphaned_hosts: 1
|
|
check_service_freshness: 1
|
|
service_freshness_check_interval: 60
|
|
check_host_freshness: 0
|
|
host_freshness_check_interval: 60
|
|
additional_freshness_latency: 15
|
|
enable_flap_detection: 1
|
|
low_service_flap_threshold: 5.0
|
|
high_service_flap_threshold: 20.0
|
|
low_host_flap_threshold: 5.0
|
|
high_host_flap_threshold: 20.0
|
|
date_format: us
|
|
use_regexp_matching: 1
|
|
use_true_regexp_matching: 0
|
|
daemon_dumps_core: 0
|
|
use_large_installation_tweaks: 0
|
|
enable_environment_macros: 0
|
|
debug_level: 0
|
|
debug_verbosity: 1
|
|
debug_file: /opt/nagios/var/nagios.debug
|
|
max_debug_file_size: 1000000
|
|
allow_empty_hostgroup_assignment: 1
|
|
illegal_macro_output_chars: "`~$&|'<>\""
|
|
cgi:
|
|
main_config_file: /opt/nagios/etc/nagios.cfg
|
|
physical_html_path: /opt/nagios/share
|
|
url_html_path: /nagios
|
|
show_context_help: 0
|
|
use_pending_states: 1
|
|
use_authentication: 0
|
|
use_ssl_authentication: 0
|
|
authorized_for_system_information: "*"
|
|
authorized_for_configuration_information: "*"
|
|
authorized_for_system_commands: nagiosadmin
|
|
authorized_for_all_services: "*"
|
|
authorized_for_all_hosts: "*"
|
|
authorized_for_all_service_commands: "*"
|
|
authorized_for_all_host_commands: "*"
|
|
default_statuswrl_layout: 4
|
|
ping_syntax: /bin/ping -n -U -c 5 $HOSTADDRESS$
|
|
refresh_rate: 90
|
|
result_limit: 100
|
|
escape_html_tags: 1
|
|
action_url_target: _blank
|
|
notes_url_target: _blank
|
|
lock_author_names: 1
|
|
navbar_search_for_addresses: 1
|
|
navbar_search_for_aliases: 1
|
|
notification:
|
|
snmp:
|
|
primary_target: 127.0.0.1:15162
|
|
secondary_target: 127.0.0.1:15162
|
|
http:
|
|
primary_target: 127.0.0.1:3904/events
|
|
secondary_target: 127.0.0.1:3904/events
|