4f78e1f6fc
This defines the Apache proxy hosts entirely via values templates. While complicated on its face, this adds flexibility by letting the desired authentication mechanism be defined through values: the options range from HTTP basic auth for development purposes to more complex LDAP configurations, without any need to modify the chart directly.

Change-Id: Ief1b6890444ff90cc9c0ca872087af74836c0771
Signed-off-by: Pete Birley <pete@port.direct>
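For illustration only (not part of this change), an operator could point the proxy at a site-specific LDAP bind purely through a values override, for example:

    endpoints:
      ldap:
        auth:
          admin:
            bind: "cn=nagios,ou=Services,dc=example,dc=test"
            password: not-the-real-password

or replace conf.httpd wholesale with a basic-auth-only <Proxy> block. The DN and password above are placeholders, not defaults shipped with the chart.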
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for nagios.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

images:
  tags:
    apache_proxy: docker.io/httpd:2.4
    nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync

labels:
  nagios:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
  job:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled

dependencies:
  dynamic:
    common:
      jobs:
        - nagios-image-repo-sync
      services:
        - service: local_image_registry
          endpoint: node
  static:
    image_repo_sync:
      services:
        - service: local_image_registry
          endpoint: internal
    nagios:
      services: null

secrets:
  nagios:
    admin: nagios-admin-creds
  tls:
    nagios:
      nagios:
        public: nagios-tls-public

endpoints:
  cluster_domain_suffix: cluster.local
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000
  monitoring:
    name: prometheus
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: http
    port:
      api:
        default: 9090
        public: 80
  nagios:
    name: nagios
    namespace: null
    auth:
      admin:
        username: nagiosadmin
        password: password
    hosts:
      default: nagios-metrics
      public: nagios
    host_fqdn_override:
      default: null
      # NOTE(srwilkers): this chart supports TLS for fqdn over-ridden public
      # endpoints using the following format:
      # public:
      #   host: null
      #   tls:
      #     crt: null
      #     key: null
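      # For example, a hypothetical site override (values shown here are
      # placeholders, not chart defaults) might look like:
      # public:
      #   host: nagios.example.test
      #   tls:
      #     crt: <PEM-encoded certificate>
      #     key: <PEM-encoded private key>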
    path:
      default: null
    scheme:
      default: http
    port:
      nagios:
        default: 8000
      http:
        default: 80
  ldap:
    hosts:
      default: ldap
    auth:
      admin:
        bind: "cn=admin,dc=cluster,dc=local"
        password: password
    host_fqdn_override:
      default: null
    path:
      default: "/ou=People,dc=cluster,dc=local"
    scheme:
      default: ldap
    port:
      ldap:
        default: 389

network:
  nagios:
    ingress:
      public: true
      classes:
        namespace: "nginx"
        cluster: "nginx-cluster"
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /
    node_port:
      enabled: false
      port: 30925

pod:
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      nagios:
        timeout: 30
  replicas:
    nagios: 1
  resources:
    enabled: false
    nagios:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "100m"
    apache_proxy:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "100m"
    jobs:
      image_repo_sync:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"

manifests:
  configmap_bin: true
  configmap_etc: true
  deployment: true
  ingress: true
  job_image_repo_sync: true
  secret_nagios: true
  secret_ingress_tls: true
  service: true
  service_ingress: true

conf:
  httpd: |
    ServerRoot "/usr/local/apache2"

    Listen 80

    LoadModule mpm_event_module modules/mod_mpm_event.so
    LoadModule authn_file_module modules/mod_authn_file.so
    LoadModule authn_core_module modules/mod_authn_core.so
    LoadModule authz_host_module modules/mod_authz_host.so
    LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
    LoadModule authz_user_module modules/mod_authz_user.so
    LoadModule authz_core_module modules/mod_authz_core.so
    LoadModule access_compat_module modules/mod_access_compat.so
    LoadModule auth_basic_module modules/mod_auth_basic.so
    LoadModule ldap_module modules/mod_ldap.so
    LoadModule authnz_ldap_module modules/mod_authnz_ldap.so
    LoadModule reqtimeout_module modules/mod_reqtimeout.so
    LoadModule filter_module modules/mod_filter.so
    LoadModule proxy_html_module modules/mod_proxy_html.so
    LoadModule log_config_module modules/mod_log_config.so
    LoadModule env_module modules/mod_env.so
    LoadModule headers_module modules/mod_headers.so
    LoadModule setenvif_module modules/mod_setenvif.so
    LoadModule version_module modules/mod_version.so
    LoadModule proxy_module modules/mod_proxy.so
    LoadModule proxy_connect_module modules/mod_proxy_connect.so
    LoadModule proxy_http_module modules/mod_proxy_http.so
    LoadModule proxy_balancer_module modules/mod_proxy_balancer.so
    LoadModule slotmem_shm_module modules/mod_slotmem_shm.so
    LoadModule slotmem_plain_module modules/mod_slotmem_plain.so
    LoadModule unixd_module modules/mod_unixd.so
    LoadModule status_module modules/mod_status.so
    LoadModule autoindex_module modules/mod_autoindex.so

    <IfModule unixd_module>
      User daemon
      Group daemon
    </IfModule>

    <Directory />
      AllowOverride none
      Require all denied
    </Directory>

    <Files ".ht*">
      Require all denied
    </Files>

    ErrorLog /dev/stderr

    LogLevel warn

    <IfModule log_config_module>
      LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
      LogFormat "%h %l %u %t \"%r\" %>s %b" common

      <IfModule logio_module>
        LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio
      </IfModule>

      CustomLog /dev/stdout common

      CustomLog /dev/stdout combined
    </IfModule>

    <Directory "/usr/local/apache2/cgi-bin">
      AllowOverride None
      Options None
      Require all granted
    </Directory>

    <IfModule headers_module>
      RequestHeader unset Proxy early
    </IfModule>

    <IfModule proxy_html_module>
      Include conf/extra/proxy-html.conf
    </IfModule>

    <VirtualHost *:80>
      <Location />
        ProxyPass http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
        ProxyPassReverse http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
      </Location>
      <Proxy *>
        AuthName "Nagios"
        AuthType Basic
        AuthBasicProvider file ldap
        AuthUserFile /usr/local/apache2/conf/.htpasswd
        AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
        AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
        AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
        Require valid-user
      </Proxy>
    </VirtualHost>
  nagios:
    contacts:
      - notifying_contact:
          name: notifying_contact
          contact_name: notifying_contact
          alias: notifying contact
          service_notification_period: 24x7
          host_notification_period: 24x7
          service_notification_options: w,u,c,r,f,s
          host_notification_options: d,u,r,f,s
          register: 0
      - snmp_notifying_contact:
          use: notifying_contact
          name: snmp_notifying_contact
          contact_name: snmp_notifying_contact
          alias: snmp contact
          service_notification_commands: send_service_snmp_trap
          host_notification_commands: send_host_snmp_trap
      - http_notifying_contact:
          use: notifying_contact
          name: http_notifying_contact
          contact_name: http_notifying_contact
          alias: HTTP contact
          service_notification_commands: send_service_http_post
          host_notification_commands: send_host_http_post
    contactgroups:
      - snmp_and_http_notifying_contact_group:
          contactgroup_name: snmp_and_http_notifying_contact_group
          alias: SNMP and HTTP notifying group
          members: snmp_notifying_contact,http_notifying_contact
    hosts:
      - prometheus:
          use: linux-server
          host_name: prometheus
          alias: "Prometheus Monitoring"
          address: 127.0.0.1
          hostgroups: prometheus-hosts
          check_command: check-prometheus-host-alive
    host_groups:
      - prometheus-hosts:
          hostgroup_name: prometheus-hosts
          alias: "Prometheus Virtual Host"
      - all:
          hostgroup_name: all
          alias: "all"
      - base-os:
          hostgroup_name: base-os
          alias: "base-os"
      - ceph_mgr_placeholder:
          hostgroup_name: ceph_mgr_placeholder
          alias: "ceph_mgr_placeholder"
    commands:
      - send_service_snmp_trap:
          command_name: send_service_snmp_trap
          command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'"
      - send_host_snmp_trap:
          command_name: send_host_snmp_trap
          command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
      - send_service_http_post:
          command_name: send_service_http_post
          command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
      - send_host_http_post:
          command_name: send_host_http_post
          command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
      - check_prometheus_host_alive:
          command_name: check-prometheus-host-alive
          command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
      - check_prom_alert_with_labels:
          command_name: check_prom_alert_with_labels
          command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
      - check_prom_alert:
          command_name: check_prom_alert
          command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
      - check_filespace_mounts-usage-rate-fullin4hrs:
          command_name: check_filespace_mounts-usage-rate-fullin4hrs
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
      - check_filespace_mounts-usage:
          command_name: check_filespace_mounts-usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 percent full' --ok_message 'OK- All mountpoints usage is normal'
      - check_node_loadavg:
          command_name: check_node_loadavg
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the past hour' --ok_message 'OK- Node load average is normal'
      - check_node_cpu_util:
          command_name: check_node_cpu_util
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the past hour' --ok_message 'OK- Node cpu utilization is normal'
      - check_network_connections:
          command_name: check_network_connections
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
      - check_memory_usage:
          command_name: check_memory_usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
      - check_disk_write_latency:
          command_name: check_disk_write_latency
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
      - check_disk_read_latency:
          command_name: check_disk_read_latency
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
      - check_entropy_availability:
          command_name: check_entropy_availability
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
      - check_filedescriptor_usage_rate:
          command_name: check_filedescriptor_usage_rate
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
      - check_hwmon_high_cpu_temp:
          command_name: check_hwmon_high_cpu_temp
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
      - check_network_receive_drop_high:
          command_name: check_network_receive_drop_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
      - check_network_transmit_drop_high:
          command_name: check_network_transmit_drop_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network transmission.' --ok_message 'OK- network packet transmit drops not high.'
      - check_network_receive_errors_high:
          command_name: check_network_receive_errors_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
      - check_network_transmit_errors_high:
          command_name: check_network_transmit_errors_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
      - check_vmstat_paging_rate:
          command_name: check_vmstat_paging_rate
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
      - check_xfs_block_allocation:
          command_name: check_xfs_block_allocation
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
      - check_network_bond_status:
          command_name: check_network_bond_status
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
      - check_numa_memory_usage:
          command_name: check_numa_memory_usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
      - check_ntp_sync:
          command_name: check_ntp_sync
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
      - check_ceph_health:
          command_name: check_ceph_health
          command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
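    # For reference: the services below pass arguments to the commands above
    # using Nagios' '!'-separated syntax. For example, a check_command of
    # check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
    # fills in $ARG1$ (the alert name), $ARG2$ (--msg_format) and $ARG3$
    # (--ok_message) of the check_prom_alert command_line.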
    services:
      - notifying_service:
          name: notifying_service
          use: generic-service
          flap_detection_enabled: 0
          process_perf_data: 0
          contact_groups: snmp_and_http_notifying_contact_group
          check_interval: 60
          notification_interval: 120
          retry_interval: 15
          register: 0
      - check_ceph_health:
          use: notifying_service
          hostgroup_name: ^ceph_mgr.*$
          service_description: "CEPH_health"
          check_command: check_ceph_health
          check_interval: 60
      - check_prometheus_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Prometheus_replica-count"
          check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has fewer than the configured replicas
          check_interval: 60
      - check_alertmanager_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "PrometheusAlertmanager_replica-count"
          check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has fewer than the configured replicas
          check_interval: 60
      - check_statefulset_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Statefulset_replica-count"
          check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has fewer than the configured replicas!OK- All statefulsets have configured amount of replicas
          check_interval: 60
      - check_daemonset_misscheduled:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Daemonset_misscheduled"
          check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheduled!OK- No daemonset misscheduling detected
          check_interval: 60
      - check_daemonset_not-scheduled:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Daemonset_not-scheduled"
          check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is not scheduled on some nodes!OK- All daemonset scheduling is as desired
          check_interval: 60
      - check_deployment_replicas_unavailable:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Deployment_replicas-unavailable"
          check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
          check_interval: 60
      - check_volume_claim_high_utilization:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Volume_claim_high_utilization"
          check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization
          check_interval: 60
      - check_deployment_rollingupdate_replicas_unavailable:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "RollingUpdate_Deployment-replicas-unavailable"
          check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
          check_interval: 60
      - check_job_status_failed:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Job_status-failed"
          check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
          check_interval: 60
      - check_pod_status_pending:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-pending"
          check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
          check_interval: 60
      - check_pod_status_error_image_pull:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-error-image-pull"
          check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
          check_interval: 60
      - check_replicaset_missing_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Replicaset_missing-replicas"
          check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
          check_interval: 60
      - check_pod_container_terminated:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-container-terminated"
          check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
          check_interval: 60
      - check_glance_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_glance"
          check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
          check_interval: 60
      - check_nova_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_nova"
          check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
          check_interval: 60
      - check_keystone_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_keystone"
          check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
          check_interval: 60
      - check_neutron_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_neutron"
          check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
          check_interval: 60
      - check_swift_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_swift"
          check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
          check_interval: 60
      - check_service_nova_compute:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-compute"
          check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
          check_interval: 60
      - check_service_nova_conductor:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-conductor"
          check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
          check_interval: 60
      - check_service_nova_consoleauth:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-consoleauth"
          check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
          check_interval: 60
      - check_service_nova_scheduler:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-scheduler"
          check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
          check_interval: 60
      - check_ceph_monitor_quorum:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_quorum"
          check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
          check_interval: 60
      - check_ceph_storage_usage:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_storage-usage"
          check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
          check_interval: 60
      - check_ceph_pgs_degradation:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_PGs-degradation"
          check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
          check_interval: 60
      - check_ceph_osds_down:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_OSDs-down"
          check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
          check_interval: 60
      - check_ceph_monitor_clock_skew:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_Clock-skew"
          check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
          check_interval: 60
      - check_fluentd_up:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Fluentd_status"
          check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes
          check_interval: 60
      - check_etcd_high_http_deletes_failed:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-delete-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
          check_interval: 60
      - check_etcd_high_http_get_failed:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-get-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
          check_interval: 60
      - check_etcd_high_http_updates_failed:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-update-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
          check_interval: 60
      - check_felix_iptables_save_errors:
          use: notifying_service
          service_description: Calico_iptables-save-errors
          check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
          hostgroup_name: prometheus-hosts
      - check_felix_ipset_errors:
          use: notifying_service
          service_description: Calico_ipset-errors
          check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_iface_msg_batch_size:
          use: notifying_service
          service_description: Calico_interface-message-batch-size
          check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size is low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_addr_msg_batch_size:
          use: notifying_service
          service_description: Calico_address-message-batch-size
          check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size is low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_failures:
          use: notifying_service
          service_description: Calico_datapane_failures_high
          check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- dataplane failures are none or low
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_network_partitions_detected:
          use: generic-service
          service_description: Rabbitmq_network-partitions-exist
          check_command: check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_available:
          use: generic-service
          service_description: Rabbitmq_up
          check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_fd_usage:
          use: generic-service
          service_description: Rabbitmq_file-descriptor-usage
          check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_node_disk_alarm:
          use: generic-service
          service_description: Rabbitmq_node-disk-alarm
          check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_node_memory_alarm:
          use: generic-service
          service_description: Rabbitmq_node-memory-alarm
          check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
          hostgroup_name: prometheus-hosts
      - check_rabbitmq_availability:
          use: generic-service
          service_description: Rabbitmq_high-availability
          check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
          hostgroup_name: prometheus-hosts
      - check_queue_message_return_percent:
          use: generic-service
          service_description: Rabbitmq_message-return-percent
          check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
          hostgroup_name: prometheus-hosts
      - check_queue_consumer_util:
          use: generic-service
          service_description: Rabbitmq_consumer-utilization
          check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
          hostgroup_name: prometheus-hosts
      - check_queue_load:
          use: generic-service
          service_description: Rabbitmq_rabbitmq-queue-health
          check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
          hostgroup_name: prometheus-hosts
      - check_es_high_process_open_file_count:
          use: generic-service
          service_description: ES_high-process-open-file-count
          check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
          hostgroup_name: prometheus-hosts
      - check_es_high_process_cpu_percent:
          use: generic-service
          service_description: ES_high-process-cpu-percent
          check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
          hostgroup_name: prometheus-hosts
      - check_es_fs_usage:
          use: generic-service
          service_description: ES_high-filesystem-usage
          check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
          hostgroup_name: prometheus-hosts
      - check_es_unassigned_shards:
          use: generic-service
          service_description: ES_unassigned-shards
          check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
          hostgroup_name: prometheus-hosts
      - check_es_cluster_health_timedout:
          use: generic-service
          service_description: ES_cluster-health-timedout
          check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
          hostgroup_name: prometheus-hosts
      - check_es_cluster_health_status:
          use: generic-service
          service_description: ES_cluster-health-status
          check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
          hostgroup_name: prometheus-hosts
      - check_es_cluster_number_nodes_running:
          use: generic-service
          service_description: ES_cluster-running-node-count
          check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
          hostgroup_name: prometheus-hosts
      - check_es_cluster_number_data_nodes_running:
          use: generic-service
          service_description: ES_cluster-running-data-node-count
          check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
          hostgroup_name: prometheus-hosts
      - check_mariadb_table_lock_waits:
          use: generic-service
          service_description: Mariadb_table-lock-waits-high
          check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
          hostgroup_name: prometheus-hosts
      - check_mariadb_node_ready:
          use: generic-service
          service_description: Mariadb_node-ready
          check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
          hostgroup_name: prometheus-hosts
      - check_mariadb_node_out_of_sync:
          use: generic-service
          service_description: Mariadb_node-synchronized
          check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
          hostgroup_name: prometheus-hosts
      - check_mariadb_innodb_replication_lag:
          use: generic-service
          service_description: Mariadb_innodb-replication-lag
          check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
          hostgroup_name: prometheus-hosts
      - check_filespace_mounts-usage-rate-fullin4hrs:
          use: notifying_service
          hostgroup_name: base-os
          service_description: "Filespace_mounts-usage-rate-fullin4hrs"
          check_command: check_filespace_mounts-usage-rate-fullin4hrs
          check_interval: 60
      - check_filespace_mounts-usage:
          use: notifying_service
          hostgroup_name: base-os
          service_description: "Filespace_mounts-usage"
          check_command: check_filespace_mounts-usage
          check_interval: 60
      - check_node_loadavg:
          use: notifying_service
          service_description: CPU_Load-average
          check_command: check_node_loadavg
          hostgroup_name: base-os
      - check_node_cpu_util:
          use: notifying_service
          service_description: CPU_utilization
          check_command: check_node_cpu_util
          hostgroup_name: base-os
      - check_network_connections:
          use: notifying_service
          service_description: Network_connections
          check_command: check_network_connections
          hostgroup_name: base-os
      - check_memory_usage:
          use: notifying_service
          service_description: Memory_usage
          check_command: check_memory_usage
          hostgroup_name: base-os
      - check_disk_write_latency:
          use: notifying_service
          service_description: Disk_write-latency
          check_command: check_disk_write_latency
          hostgroup_name: base-os
      - check_disk_read_latency:
          use: notifying_service
          service_description: Disk_read-latency
          check_command: check_disk_read_latency
          hostgroup_name: base-os
      - check_entropy_availability:
          use: notifying_service
          service_description: Entropy_availability
          check_command: check_entropy_availability
          hostgroup_name: base-os
      - check_filedescriptor_usage_rate:
          use: notifying_service
          service_description: FileDescriptors_usage-rate-high
          check_command: check_filedescriptor_usage_rate
          hostgroup_name: base-os
      - check_hwmon_high_cpu_temp:
          use: notifying_service
          service_description: HW_cpu-temp-high
          check_command: check_hwmon_high_cpu_temp
          hostgroup_name: base-os
      - check_network_receive_drop_high:
          use: notifying_service
          service_description: Network_receive-drop-high
          check_command: check_network_receive_drop_high
          hostgroup_name: base-os
      - check_network_transmit_drop_high:
          use: notifying_service
          service_description: Network_transmit-drop-high
          check_command: check_network_transmit_drop_high
          hostgroup_name: base-os
      - check_network_receive_errors_high:
          use: notifying_service
          service_description: Network_receive-errors-high
          check_command: check_network_receive_errors_high
          hostgroup_name: base-os
      - check_network_transmit_errors_high:
          use: notifying_service
          service_description: Network_transmit-errors-high
          check_command: check_network_transmit_errors_high
          hostgroup_name: base-os
      - check_vmstat_paging_rate:
          use: notifying_service
          service_description: Memory_vmstat-paging-rate
          check_command: check_vmstat_paging_rate
          hostgroup_name: base-os
      - check_xfs_block_allocation:
          use: notifying_service
          service_description: XFS_block-allocation
          check_command: check_xfs_block_allocation
          hostgroup_name: base-os
      - check_network_bond_status:
          use: notifying_service
          service_description: Network_bondstatus
          check_command: check_network_bond_status
          hostgroup_name: base-os
      - check_numa_memory_usage:
          use: notifying_service
          service_description: Memory_NUMA-usage
          check_command: check_numa_memory_usage
          hostgroup_name: base-os
      - check_ntp_sync:
          use: notifying_service
          service_description: NTP_sync
          check_command: check_ntp_sync
          hostgroup_name: base-os
    nagios:
      log_file: /opt/nagios/var/nagios.log
      cfg_file:
        - /opt/nagios/etc/nagios_objects.cfg
        - /opt/nagios/etc/objects/commands.cfg
        - /opt/nagios/etc/objects/contacts.cfg
        - /opt/nagios/etc/objects/timeperiods.cfg
        - /opt/nagios/etc/objects/templates.cfg
        - /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
      object_cache_file: /opt/nagios/var/objects.cache
      precached_object_file: /opt/nagios/var/objects.precache
      resource_file: /opt/nagios/etc/resource.cfg
      status_file: /opt/nagios/var/status.dat
      status_update_interval: 10
      nagios_user: nagios
      nagios_group: nagios
      check_external_commands: 1
      command_file: /opt/nagios/var/rw/nagios.cmd
      lock_file: /var/run/nagios.lock
      temp_file: /opt/nagios/var/nagios.tmp
      temp_path: /tmp
      event_broker_options: -1
      log_rotation_method: d
      log_archive_path: /opt/nagios/var/archives
      use_syslog: 1
      log_service_retries: 1
      log_host_retries: 1
      log_event_handlers: 1
      log_initial_states: 0
      log_current_states: 1
      log_external_commands: 1
      log_passive_checks: 1
      service_inter_check_delay_method: s
      max_service_check_spread: 30
      service_interleave_factor: s
      host_inter_check_delay_method: s
      max_host_check_spread: 30
      max_concurrent_checks: 0
      check_result_reaper_frequency: 10
      max_check_result_reaper_time: 30
      check_result_path: /opt/nagios/var/spool/checkresults
      max_check_result_file_age: 3600
      cached_host_check_horizon: 15
      cached_service_check_horizon: 15
      enable_predictive_host_dependency_checks: 1
      enable_predictive_service_dependency_checks: 1
      soft_state_dependencies: 0
      auto_reschedule_checks: 0
      auto_rescheduling_interval: 30
      auto_rescheduling_window: 180
      service_check_timeout: 60
      host_check_timeout: 60
      event_handler_timeout: 60
      notification_timeout: 60
      ocsp_timeout: 5
      perfdata_timeout: 5
      retain_state_information: 1
      state_retention_file: /opt/nagios/var/retention.dat
      retention_update_interval: 60
      use_retained_program_state: 1
      use_retained_scheduling_info: 1
      retained_host_attribute_mask: 0
      retained_service_attribute_mask: 0
      retained_process_host_attribute_mask: 0
      retained_process_service_attribute_mask: 0
      retained_contact_host_attribute_mask: 0
      retained_contact_service_attribute_mask: 0
      interval_length: 1
      check_for_updates: 1
      bare_update_check: 0
      use_aggressive_host_checking: 0
      execute_service_checks: 1
      accept_passive_service_checks: 1
      execute_host_checks: 1
      accept_passive_host_checks: 1
      enable_notifications: 1
      enable_event_handlers: 1
      process_performance_data: 0
      obsess_over_services: 0
      obsess_over_hosts: 0
      translate_passive_host_checks: 0
      passive_host_checks_are_soft: 0
      check_for_orphaned_services: 1
      check_for_orphaned_hosts: 1
      check_service_freshness: 1
      service_freshness_check_interval: 60
      check_host_freshness: 0
      host_freshness_check_interval: 60
      additional_freshness_latency: 15
      enable_flap_detection: 1
      low_service_flap_threshold: 5.0
      high_service_flap_threshold: 20.0
      low_host_flap_threshold: 5.0
      high_host_flap_threshold: 20.0
      date_format: us
      use_regexp_matching: 1
      use_true_regexp_matching: 0
      daemon_dumps_core: 0
      use_large_installation_tweaks: 0
      enable_environment_macros: 0
      debug_level: 0
      debug_verbosity: 1
      debug_file: /opt/nagios/var/nagios.debug
      max_debug_file_size: 1000000
      allow_empty_hostgroup_assignment: 1
      illegal_macro_output_chars: "`~$&|'<>\""
    cgi:
      main_config_file: /opt/nagios/etc/nagios.cfg
      physical_html_path: /opt/nagios/share
      url_html_path: /nagios
      show_context_help: 0
      use_pending_states: 1
      use_authentication: 0
      use_ssl_authentication: 0
      authorized_for_system_information: "*"
      authorized_for_configuration_information: "*"
      authorized_for_system_commands: nagiosadmin
      authorized_for_all_services: "*"
      authorized_for_all_hosts: "*"
      authorized_for_all_service_commands: "*"
      authorized_for_all_host_commands: "*"
      default_statuswrl_layout: 4
      ping_syntax: /bin/ping -n -U -c 5 $HOSTADDRESS$
      refresh_rate: 90
      result_limit: 100
      escape_html_tags: 1
      action_url_target: _blank
      notes_url_target: _blank
      lock_author_names: 1
      navbar_search_for_addresses: 1
      navbar_search_for_aliases: 1
    notification:
      snmp:
        primary_target: 127.0.0.1:15162
        secondary_target: 127.0.0.1:15162
      http:
        primary_target: 127.0.0.1:3904/events
        secondary_target: 127.0.0.1:3904/events