openstack-helm-infra/nagios/values.yaml
Steve Wilkerson c7d0317768 Add nagios cgi.cfg file control to values.yaml
This adds the ability to drive the CGI configuration for
nagios via values, similar to the other nagios configuration
entities

Change-Id: I8e9de21d141e0a87cdda11c4a778abec210277f3
2018-05-24 11:24:37 -07:00

895 lines
45 KiB
YAML

# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for nagios.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
images:
tags:
apache_proxy: docker.io/httpd:2.4
nagios: quay.io/attcomdev/nagios:8ed23ede915ccf23aacd370953291090007ed16d
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
nagios:
node_selector_key: openstack-control-plane
node_selector_value: enabled
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
dependencies:
dynamic:
common:
jobs:
- nagios-image-repo-sync
services:
- service: local_image_registry
endpoint: node
static:
image_repo_sync:
services:
- service: local_image_registry
endpoint: internal
nagios:
services: null
secrets:
nagios:
admin: nagios-admin-creds
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
monitoring:
name: prometheus
hosts:
default: prom-metrics
public: prometheus
host_fqdn_override:
default: null
path:
default: null
scheme:
default: http
port:
api:
default: 9090
public: 80
nagios:
name: nagios
namespace: null
auth:
admin:
username: nagiosadmin
password: password
hosts:
default: nagios-metrics
public: nagios
host_fqdn_override:
default: null
path:
default: null
scheme:
default: http
port:
nagios:
default: 8000
http:
default: 80
ldap:
hosts:
default: ldap
auth:
admin:
bind: "cn=admin,dc=cluster,dc=local"
password: password
host_fqdn_override:
default: null
path:
default: "/ou=People,dc=cluster,dc=local"
scheme:
default: ldap
port:
ldap:
default: 389
network:
nagios:
ingress:
public: true
classes:
namespace: "nginx"
cluster: "nginx-cluster"
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
node_port:
enabled: false
port: 30925
pod:
lifecycle:
upgrades:
revision_history: 3
pod_replacement_strategy: RollingUpdate
rolling_update:
max_unavailable: 1
max_surge: 3
termination_grace_period:
nagios:
timeout: 30
replicas:
nagios: 1
resources:
enabled: false
nagios:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
apache_proxy:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
jobs:
image_repo_sync:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
manifests:
configmap_bin: true
configmap_etc: true
deployment: true
ingress: true
job_image_repo_sync: true
secret_nagios: true
service: true
service_ingress: true
conf:
apache:
httpd: null
elasticsearch_host: null
nagios:
contacts:
- notifying_contact:
name: notifying_contact
contact_name: notifying_contact
alias: notifying contact
service_notification_period: 24x7
host_notification_period: 24x7
service_notification_options: w,u,c,r,f,s
host_notification_options: d,u,r,f,s
register: 0
- snmp_notifying_contact:
use: notifying_contact
name: snmp_notifying_contact
contact_name: snmp_notifying_contact
alias: snmp contact
service_notification_commands: send_service_snmp_trap
host_notification_commands: send_host_snmp_trap
- http_notifying_contact:
use: notifying_contact
name: http_notifying_contact
contact_name: http_notifying_contact
alias: HTTP contact
service_notification_commands: send_service_http_post
host_notification_commands: send_host_http_post
contactgroups:
- snmp_and_http_notifying_contact_group:
contactgroup_name: snmp_and_http_notifying_contact_group
alias: SNMP and HTTP notifying group
members: snmp_notifying_contact,http_notifying_contact
hosts:
- prometheus:
use: linux-server
host_name: prometheus
alias: "Prometheus Monitoring"
address: 127.0.0.1
hostgroups: prometheus-hosts
check_command: check-prometheus-host-alive
host_groups:
- prometheus-hosts:
hostgroup_name: prometheus-hosts
alias: "Prometheus Virtual Host"
- all:
hostgroup_name: all
alias: "all"
- base-os:
hostgroup_name: base-os
alias: "base-os"
- ceph_mgr_placeholder:
hostgroup_name: ceph_mgr_placeholder
alias: "ceph_mgr_placeholder"
commands:
- send_service_snmp_trap:
command_name: send_service_snmp_trap
command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'"
- send_host_snmp_trap:
command_name: send_host_snmp_trap
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
- send_service_http_post:
command_name: send_service_http_post
command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
- send_host_http_post:
command_name: send_host_http_post
command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
- check_prometheus_host_alive:
command_name: check-prometheus-host-alive
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
- check_prom_alert_with_labels:
command_name: check_prom_alert_with_labels
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
- check_prom_alert:
command_name: check_prom_alert
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
- check_filespace_mounts-usage-rate-fullin4hrs:
command_name: check_filespace_mounts-usage-rate-fullin4hrs
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
- check_filespace_mounts-usage:
command_name: check_filespace_mounts-usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal'
- check_node_loadavg:
command_name: check_node_loadavg
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal'
- check_node_cpu_util:
command_name: check_node_cpu_util
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal'
- check_network_connections:
command_name: check_network_connections
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
- check_memory_usage:
command_name: check_memory_usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
- check_disk_write_latency:
command_name: check_disk_write_latency
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
- check_disk_read_latency:
command_name: check_disk_read_latency
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
- check_entropy_availability:
command_name: check_entropy_availability
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
- check_filedescriptor_usage_rate:
command_name: check_filedescriptor_usage_rate
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
- check_hwmon_high_cpu_temp:
command_name: check_hwmon_high_cpu_temp
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
- check_network_receive_drop_high:
command_name: check_network_receive_drop_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
- check_network_transmit_drop_high:
command_name: check_network_transmit_drop_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
- check_network_receive_errors_high:
command_name: check_network_receive_errors_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
- check_network_transmit_errors_high:
command_name: check_network_transmit_errors_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
- check_vmstat_paging_rate:
command_name: check_vmstat_paging_rate
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
- check_xfs_block_allocation:
command_name: check_xfs_block_allocation
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
- check_network_bond_status:
command_name: check_network_bond_status
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
- check_numa_memory_usage:
command_name: check_numa_memory_usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
- check_ntp_sync:
command_name: check_ntp_sync
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
- check_ceph_health:
command_name: check_ceph_health
command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
services:
- notifying_service:
name: notifying_service
use: generic-service
flap_detection_enabled: 0
process_perf_data: 0
contact_groups: snmp_and_http_notifying_contact_group
check_interval: 60
notification_interval: 120
retry_interval: 15
register: 0
- check_ceph_health:
use: notifying_service
hostgroup_name: ^ceph_mgr.*$
service_description: "CEPH_health"
check_command: check_ceph_health
check_interval: 60
- check_prometheus_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Prometheus_replica-count"
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
check_interval: 60
- check_alertmanager_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "PrometheusAlertmanager_replica-count"
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
check_interval: 60
- check_statefulset_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Statefulset_replica-count"
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
check_interval: 60
- check_daemonset_misscheduled:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Daemonset_misscheduled"
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected
check_interval: 60
- check_daemonset_not-scheduled:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Daemonset_not-scheduled"
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
check_interval: 60
- check_deployment_replicas_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Deployment_replicas-unavailable"
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
check_interval: 60
- check_deployment_rollingupdate_replicas_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "RollingUpdate_Deployment-replicas-unavailable"
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
check_interval: 60
- check_job_status_failed:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Job_status-failed"
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
check_interval: 60
- check_pod_status_pending:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-pending"
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
check_interval: 60
- check_pod_status_error_image_pull:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_replicaset_missing_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Replicaset_missing-replicas"
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
check_interval: 60
- check_pod_container_terminated:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-container-terminated"
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
check_interval: 60
- check_glance_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_glance"
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_interval: 60
- check_nova_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_nova"
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_interval: 60
- check_keystone_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_keystone"
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_interval: 60
- check_neutron_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_neutron"
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_interval: 60
- check_swift_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_swift"
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_interval: 60
- check_service_nova_compute:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-compute"
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
check_interval: 60
- check_service_nova_conductor:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-conductor"
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
check_interval: 60
- check_service_nova_consoleauth:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-consoleauth"
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
check_interval: 60
- check_service_nova_scheduler:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-scheduler"
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
check_interval: 60
- check_ceph_monitor_quorum:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "CEPH_quorum"
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
check_interval: 60
- check_ceph_storage_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "CEPH_storage-usage"
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
check_interval: 60
- check_ceph_pgs_degradation:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "CEPH_PGs-degradation"
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
check_interval: 60
- check_ceph_osds_down:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "CEPH_OSDs-down"
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
check_interval: 60
- check_ceph_monitor_clock_skew:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "CEPH_Clock-skew"
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
check_interval: 60
- check_fluentd_up:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Fluentd_status"
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
check_interval: 60
- check_etcd_high_http_deletes_failed:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-delete-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
check_interval: 60
- check_etcd_high_http_get_failed:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-get-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
check_interval: 60
- check_etcd_high_http_updates_failed:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-update-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
check_interval: 60
- check_felix_iptables_save_errors:
use: notifying_service
service_description: Calico_iptables-save-errors
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
hostgroup_name: prometheus-hosts
- check_felix_ipset_errors:
use: notifying_service
service_description: Calico_ipset-errors
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_iface_msg_batch_size:
use: notifying_service
service_description: Calico_interface-message-batch-size
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_addr_msg_batch_size:
use: notifying_service
service_description: Calico_address-message-batch-size
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_failures:
use: notifying_service
service_description: Calico_datapane_failures_high
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
hostgroup_name: prometheus-hosts
- check_rabbitmq_network_partitions_detected:
use: generic-service
service_description: Rabbitmq_network-partitions-exist
check_command: check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
hostgroup_name: prometheus-hosts
- check_rabbitmq_available:
use: generic-service
service_description: Rabbitmq_up
check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
hostgroup_name: prometheus-hosts
- check_rabbitmq_fd_usage:
use: generic-service
service_description: Rabbitmq_file-descriptor-usage
check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file desciptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
hostgroup_name: prometheus-hosts
- check_rabbitmq_node_disk_alarm:
use: generic-service
service_description: Rabbitmq_node-disk-alarm
check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
hostgroup_name: prometheus-hosts
- check_rabbitmq_node_memory_alarm:
use: generic-service
service_description: Rabbitmq_node-memory-alarm
check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
hostgroup_name: prometheus-hosts
- check_rabbitmq_availability:
use: generic-service
service_description: Rabbitmq_high-availability
check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has atleast 3 nodes serving
hostgroup_name: prometheus-hosts
- check_queue_message_return_percent:
use: generic-service
service_description: Rabbitmq_message-return-percent
check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
hostgroup_name: prometheus-hosts
- check_queue_consumer_util:
use: generic-service
service_description: Rabbitmq_consumer-utilization
check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
hostgroup_name: prometheus-hosts
- check_queue_load:
use: generic-service
service_description: Rabbitmq_rabbitmq-queue-health
check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is high
hostgroup_name: prometheus-hosts
- check_es_high_process_open_file_count:
use: generic-service
service_description: ES_high-process-open-file-count
check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
hostgroup_name: prometheus-hosts
- check_es_high_process_cpu_percent:
use: generic-service
service_description: ES_high-process-cpu-percent
check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
hostgroup_name: prometheus-hosts
- check_es_fs_usage:
use: generic-service
service_description: ES_high-filesystem-usage
check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
hostgroup_name: prometheus-hosts
- check_es_unassigned_shards:
use: generic-service
service_description: ES_unassigned-shards
check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards.
hostgroup_name: prometheus-hosts
- check_es_cluster_health_timedout:
use: generic-service
service_description: ES_cluster-health-timedout
check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
hostgroup_name: prometheus-hosts
- check_es_cluster_health_status:
use: generic-service
service_description: ES_cluster-health-status
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
hostgroup_name: prometheus-hosts
- check_es_cluster_number_nodes_running:
use: generic-service
service_description: ES_cluster-running-node-count
check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
hostgroup_name: prometheus-hosts
- check_es_cluster_number_data_nodes_running:
use: generic-service
service_description: ES_cluster-running-data-node-count
check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
hostgroup_name: prometheus-hosts
- check_mariadb_table_lock_waits:
use: generic-service
service_description: Mariadb_table-lock-waits-high
check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
hostgroup_name: prometheus-hosts
- check_mariadb_node_ready:
use: generic-service
service_description: Mariadb_node-ready
check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
hostgroup_name: prometheus-hosts
- check_mariadb_node_out_of_sync:
use: generic-service
service_description: Mariadb_node-synchronized
check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
hostgroup_name: prometheus-hosts
- check_mariadb_innodb_replication_lag:
use: generic-service
service_description: Mariadb_innodb-replication-lag
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_filespace_mounts-usage-rate-fullin4hrs:
use: notifying_service
hostgroup_name: base-os
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
check_command: check_filespace_mounts-usage-rate-fullin4hrs
check_interval: 60
- check_filespace_mounts-usage:
use: notifying_service
hostgroup_name: base-os
service_description: "Filespace_mounts-usage"
check_command: check_filespace_mounts-usage
check_interval: 60
- check_node_loadavg:
use: notifying_service
service_description: CPU_Load-average
check_command: check_node_loadavg
hostgroup_name: base-os
- check_node_cpu_util:
use: notifying_service
service_description: CPU_utilization
check_command: check_node_cpu_util
hostgroup_name: base-os
- check_network_connections:
use: notifying_service
service_description: Network_connections
check_command: check_network_connections
hostgroup_name: base-os
- check_memory_usage:
use: notifying_service
service_description: Memory_usage
check_command: check_memory_usage
hostgroup_name: base-os
- check_disk_write_latency:
use: notifying_service
service_description: Disk_write-latency
check_command: check_disk_write_latency
hostgroup_name: base-os
- check_disk_read_latency:
use: notifying_service
service_description: Disk_read-latency
check_command: check_disk_read_latency
hostgroup_name: base-os
- check_entropy_availability:
use: notifying_service
service_description: Entropy_availability
check_command: check_entropy_availability
hostgroup_name: base-os
- check_filedescriptor_usage_rate:
use: notifying_service
service_description: FileDescriptors_usage-rate-high
check_command: check_filedescriptor_usage_rate
hostgroup_name: base-os
- check_hwmon_high_cpu_temp:
use: notifying_service
service_description: HW_cpu-temp-high
check_command: check_hwmon_high_cpu_temp
hostgroup_name: base-os
- check_network_receive_drop_high:
use: notifying_service
service_description: Network_receive-drop-high
check_command: check_network_receive_drop_high
hostgroup_name: base-os
- check_network_transmit_drop_high:
use: notifying_service
service_description: Network_transmit-drop-high
check_command: check_network_transmit_drop_high
hostgroup_name: base-os
- check_network_receive_errors_high:
use: notifying_service
service_description: Network_receive-errors-high
check_command: check_network_receive_errors_high
hostgroup_name: base-os
- check_network_transmit_errors_high:
use: notifying_service
service_description: Network_transmit-errors-high
check_command: check_network_transmit_errors_high
hostgroup_name: base-os
- check_vmstat_paging_rate:
use: notifying_service
service_description: Memory_vmstat-paging-rate
check_command: check_vmstat_paging_rate
hostgroup_name: base-os
- check_xfs_block_allocation:
use: notifying_service
service_description: XFS_block-allocation
check_command: check_xfs_block_allocation
hostgroup_name: base-os
- check_network_bond_status:
use: notifying_service
service_description: Network_bondstatus
check_command: check_network_bond_status
hostgroup_name: base-os
- check_numa_memory_usage:
use: notifying_service
service_description: Memory_NUMA-usage
check_command: check_numa_memory_usage
hostgroup_name: base-os
- check_ntp_sync:
use: notifying_service
service_description: NTP_sync
check_command: check_ntp_sync
hostgroup_name: base-os
nagios:
log_file: /opt/nagios/var/nagios.log
cfg_file:
- /opt/nagios/etc/nagios_objects.cfg
- /opt/nagios/etc/objects/commands.cfg
- /opt/nagios/etc/objects/contacts.cfg
- /opt/nagios/etc/objects/timeperiods.cfg
- /opt/nagios/etc/objects/templates.cfg
- /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
object_cache_file: /opt/nagios/var/objects.cache
precached_object_file: /opt/nagios/var/objects.precache
resource_file: /opt/nagios/etc/resource.cfg
status_file: /opt/nagios/var/status.dat
status_update_interval: 10
nagios_user: nagios
nagios_group: nagios
check_external_commands: 1
command_file: /opt/nagios/var/rw/nagios.cmd
lock_file: /var/run/nagios.lock
temp_file: /opt/nagios/var/nagios.tmp
temp_path: /tmp
event_broker_options: -1
log_rotation_method: d
log_archive_path: /opt/nagios/var/archives
use_syslog: 1
log_service_retries: 1
log_host_retries: 1
log_event_handlers: 1
log_initial_states: 0
log_current_states: 1
log_external_commands: 1
log_passive_checks: 1
service_inter_check_delay_method: s
max_service_check_spread: 30
service_interleave_factor: s
host_inter_check_delay_method: s
max_host_check_spread: 30
max_concurrent_checks: 0
check_result_reaper_frequency: 10
max_check_result_reaper_time: 30
check_result_path: /opt/nagios/var/spool/checkresults
max_check_result_file_age: 3600
cached_host_check_horizon: 15
cached_service_check_horizon: 15
enable_predictive_host_dependency_checks: 1
enable_predictive_service_dependency_checks: 1
soft_state_dependencies: 0
auto_reschedule_checks: 0
auto_rescheduling_interval: 30
auto_rescheduling_window: 180
service_check_timeout: 60
host_check_timeout: 60
event_handler_timeout: 60
notification_timeout: 60
ocsp_timeout: 5
perfdata_timeout: 5
retain_state_information: 1
state_retention_file: /opt/nagios/var/retention.dat
retention_update_interval: 60
use_retained_program_state: 1
use_retained_scheduling_info: 1
retained_host_attribute_mask: 0
retained_service_attribute_mask: 0
retained_process_host_attribute_mask: 0
retained_process_service_attribute_mask: 0
retained_contact_host_attribute_mask: 0
retained_contact_service_attribute_mask: 0
interval_length: 1
check_for_updates: 1
bare_update_check: 0
use_aggressive_host_checking: 0
execute_service_checks: 1
accept_passive_service_checks: 1
execute_host_checks: 1
accept_passive_host_checks: 1
enable_notifications: 1
enable_event_handlers: 1
process_performance_data: 0
obsess_over_services: 0
obsess_over_hosts: 0
translate_passive_host_checks: 0
passive_host_checks_are_soft: 0
check_for_orphaned_services: 1
check_for_orphaned_hosts: 1
check_service_freshness: 1
service_freshness_check_interval: 60
check_host_freshness: 0
host_freshness_check_interval: 60
additional_freshness_latency: 15
enable_flap_detection: 1
low_service_flap_threshold: 5.0
high_service_flap_threshold: 20.0
low_host_flap_threshold: 5.0
high_host_flap_threshold: 20.0
date_format: us
use_regexp_matching: 1
use_true_regexp_matching: 0
daemon_dumps_core: 0
use_large_installation_tweaks: 0
enable_environment_macros: 0
debug_level: 0
debug_verbosity: 1
debug_file: /opt/nagios/var/nagios.debug
max_debug_file_size: 1000000
allow_empty_hostgroup_assignment: 1
illegal_macro_output_chars: "`~$&|'<>\""
cgi:
main_config_file: /opt/nagios/etc/nagios.cfg
physical_html_path: /opt/nagios/share
url_html_path: /nagios
show_context_help: 0
use_pending_states: 1
use_authentication: 0
use_ssl_authentication: 0
authorized_for_system_information: "*"
authorized_for_configuration_information: "*"
authorized_for_system_commands: nagiosadmin
authorized_for_all_services: "*"
authorized_for_all_hosts: "*"
authorized_for_all_service_commands: "*"
authorized_for_all_host_commands: "*"
default_statuswrl_layout: 4
ping_syntax: /bin/ping -n -U -c 5 $HOSTADDRESS$
refresh_rate: 90
result_limit: 100
escape_html_tags: 1
action_url_target: _blank
notes_url_target: _blank
lock_author_names: 1
navbar_search_for_addresses: 1
navbar_search_for_aliases: 1
notification:
snmp:
primary_target: 127.0.0.1:15162
secondary_target: 127.0.0.1:15162
http:
primary_target: 127.0.0.1:3904/events
secondary_target: 127.0.0.1:3904/events