Nagios notification on alerts and ceph monitoring
Change-Id: I782f54b5ad8159e7a4375d336a42524f380e65d2
This commit is contained in:
parent
d57d3344a5
commit
69cd66b7c9
@ -29,6 +29,8 @@ data:
|
|||||||
nagios.cfg: |+
|
nagios.cfg: |+
|
||||||
{{ include "nagios.to_nagios_conf" .Values.conf.nagios.config | indent 4 }}
|
{{ include "nagios.to_nagios_conf" .Values.conf.nagios.config | indent 4 }}
|
||||||
nagios_objects.cfg: |+
|
nagios_objects.cfg: |+
|
||||||
|
{{- tuple "contact" .Values.conf.nagios.contacts | include "nagios.object_definition" | indent 4 }}
|
||||||
|
{{- tuple "contactgroup" .Values.conf.nagios.contactgroups | include "nagios.object_definition" | indent 4 }}
|
||||||
{{- tuple "host" .Values.conf.nagios.hosts | include "nagios.object_definition" | indent 4 }}
|
{{- tuple "host" .Values.conf.nagios.hosts | include "nagios.object_definition" | indent 4 }}
|
||||||
{{- tuple "hostgroup" .Values.conf.nagios.host_groups | include "nagios.object_definition" | indent 4 }}
|
{{- tuple "hostgroup" .Values.conf.nagios.host_groups | include "nagios.object_definition" | indent 4 }}
|
||||||
{{- tuple "command" .Values.conf.nagios.commands | include "nagios.object_definition" | indent 4 }}
|
{{- tuple "command" .Values.conf.nagios.commands | include "nagios.object_definition" | indent 4 }}
|
||||||
|
@ -126,6 +126,14 @@ spec:
|
|||||||
env:
|
env:
|
||||||
- name: PROMETHEUS_SERVICE
|
- name: PROMETHEUS_SERVICE
|
||||||
value: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
|
value: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
|
||||||
|
- name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
|
||||||
|
value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
|
||||||
|
- name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
|
||||||
|
value: {{ $envAll.Values.conf.nagios.notification.snmp.secondary_target }}
|
||||||
|
- name: REST_NOTIF_PRIMARY_TARGET_URL
|
||||||
|
value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
|
||||||
|
- name: REST_NOTIF_SECONDARY_TARGET_URL
|
||||||
|
value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: nagios-etc
|
- name: nagios-etc
|
||||||
mountPath: /opt/nagios/etc/nagios.cfg
|
mountPath: /opt/nagios/etc/nagios.cfg
|
||||||
|
@ -186,6 +186,35 @@ conf:
|
|||||||
httpd: null
|
httpd: null
|
||||||
elasticsearch_host: null
|
elasticsearch_host: null
|
||||||
nagios:
|
nagios:
|
||||||
|
contacts:
|
||||||
|
- notifying_contact:
|
||||||
|
name: notifying_contact
|
||||||
|
contact_name: notifying_contact
|
||||||
|
alias: notifying contact
|
||||||
|
service_notification_period: 24x7
|
||||||
|
host_notification_period: 24x7
|
||||||
|
service_notification_options: w,u,c,r,f,s
|
||||||
|
host_notification_options: d,u,r,f,s
|
||||||
|
register: 0
|
||||||
|
- snmp_notifying_contact:
|
||||||
|
use: notifying_contact
|
||||||
|
name: snmp_notifying_contact
|
||||||
|
contact_name: snmp_notifying_contact
|
||||||
|
alias: snmp contact
|
||||||
|
service_notification_commands: send_service_snmp_trap
|
||||||
|
host_notification_commands: send_host_snmp_trap
|
||||||
|
- http_notifying_contact:
|
||||||
|
use: notifying_contact
|
||||||
|
name: http_notifying_contact
|
||||||
|
contact_name: http_notifying_contact
|
||||||
|
alias: HTTP contact
|
||||||
|
service_notification_commands: send_service_http_post
|
||||||
|
host_notification_commands: send_host_http_post
|
||||||
|
contactgroups:
|
||||||
|
- snmp_and_http_notifying_contact_group:
|
||||||
|
contactgroup_name: snmp_and_http_notifying_contact_group
|
||||||
|
alias: SNMP and HTTP notifying group
|
||||||
|
members: snmp_notifying_contact,http_notifying_contact
|
||||||
hosts:
|
hosts:
|
||||||
- prometheus:
|
- prometheus:
|
||||||
use: linux-server
|
use: linux-server
|
||||||
@ -204,7 +233,22 @@ conf:
|
|||||||
- base-os:
|
- base-os:
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
alias: "base-os"
|
alias: "base-os"
|
||||||
|
- ceph_mgr_placeholder:
|
||||||
|
hostgroup_name: ceph_mgr_placeholder
|
||||||
|
alias: "ceph_mgr_placeholder"
|
||||||
commands:
|
commands:
|
||||||
|
- send_service_snmp_trap:
|
||||||
|
command_name: send_service_snmp_trap
|
||||||
|
command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'"
|
||||||
|
- send_host_snmp_trap:
|
||||||
|
command_name: send_host_snmp_trap
|
||||||
|
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
|
||||||
|
- send_service_http_post:
|
||||||
|
command_name: send_service_http_post
|
||||||
|
command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
|
||||||
|
- send_host_http_post:
|
||||||
|
command_name: send_host_http_post
|
||||||
|
command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
|
||||||
- check_prometheus_host_alive:
|
- check_prometheus_host_alive:
|
||||||
command_name: check-prometheus-host-alive
|
command_name: check-prometheus-host-alive
|
||||||
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
|
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
|
||||||
@ -274,311 +318,330 @@ conf:
|
|||||||
- check_ntp_sync:
|
- check_ntp_sync:
|
||||||
command_name: check_ntp_sync
|
command_name: check_ntp_sync
|
||||||
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
|
||||||
|
- check_ceph_health:
|
||||||
|
command_name: check_ceph_health
|
||||||
|
command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
|
||||||
services:
|
services:
|
||||||
- check_prometheus_replicas:
|
- notifying_service:
|
||||||
|
name: notifying_service
|
||||||
use: generic-service
|
use: generic-service
|
||||||
|
flap_detection_enabled: 0
|
||||||
|
process_perf_data: 0
|
||||||
|
contact_groups: snmp_and_http_notifying_contact_group
|
||||||
|
check_interval: 60
|
||||||
|
notification_interval: 120
|
||||||
|
retry_interval: 15
|
||||||
|
register: 0
|
||||||
|
- check_ceph_health:
|
||||||
|
use: notifying_service
|
||||||
|
hostgroup_name: ^ceph_mgr.*$
|
||||||
|
service_description: "CEPH_health"
|
||||||
|
check_command: check_ceph_health
|
||||||
|
check_interval: 60
|
||||||
|
- check_prometheus_replicas:
|
||||||
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Prometheus_replica-count"
|
service_description: "Prometheus_replica-count"
|
||||||
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
|
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_alertmanager_replicas:
|
- check_alertmanager_replicas:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "PrometheusAlertmanager_replica-count"
|
service_description: "PrometheusAlertmanager_replica-count"
|
||||||
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
|
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_statefulset_replicas:
|
- check_statefulset_replicas:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Statefulset_replica-count"
|
service_description: "Statefulset_replica-count"
|
||||||
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
|
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_daemonset_misscheduled:
|
- check_daemonset_misscheduled:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Daemonset_misscheduled"
|
service_description: "Daemonset_misscheduled"
|
||||||
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected
|
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_daemonset_not-scheduled:
|
- check_daemonset_not-scheduled:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Daemonset_not-scheduled"
|
service_description: "Daemonset_not-scheduled"
|
||||||
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_deployment_replicas_unavailable:
|
- check_deployment_replicas_unavailable:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Deployment_replicas-unavailable"
|
service_description: "Deployment_replicas-unavailable"
|
||||||
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
|
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_deployment_rollingupdate_replicas_unavailable:
|
- check_deployment_rollingupdate_replicas_unavailable:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "RollingUpdate_Deployment-replicas-unavailable"
|
service_description: "RollingUpdate_Deployment-replicas-unavailable"
|
||||||
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
|
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_job_status_failed:
|
- check_job_status_failed:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Job_status-failed"
|
service_description: "Job_status-failed"
|
||||||
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
|
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_pod_status_pending:
|
- check_pod_status_pending:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Pod_status-pending"
|
service_description: "Pod_status-pending"
|
||||||
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
|
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_pod_status_error_image_pull:
|
- check_pod_status_error_image_pull:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Pod_status-error-image-pull"
|
service_description: "Pod_status-error-image-pull"
|
||||||
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_replicaset_missing_replicas:
|
- check_replicaset_missing_replicas:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Replicaset_missing-replicas"
|
service_description: "Replicaset_missing-replicas"
|
||||||
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
|
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_pod_container_terminated:
|
- check_pod_container_terminated:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Pod_status-container-terminated"
|
service_description: "Pod_status-container-terminated"
|
||||||
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
|
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_glance_api:
|
- check_glance_api:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "API_glance"
|
service_description: "API_glance"
|
||||||
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
|
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_nova_api:
|
- check_nova_api:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "API_nova"
|
service_description: "API_nova"
|
||||||
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
|
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_keystone_api:
|
- check_keystone_api:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "API_keystone"
|
service_description: "API_keystone"
|
||||||
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
|
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_neutron_api:
|
- check_neutron_api:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "API_neutron"
|
service_description: "API_neutron"
|
||||||
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
|
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_swift_api:
|
- check_swift_api:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "API_swift"
|
service_description: "API_swift"
|
||||||
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
|
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_service_nova_compute:
|
- check_service_nova_compute:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Service_nova-compute"
|
service_description: "Service_nova-compute"
|
||||||
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
|
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_service_nova_conductor:
|
- check_service_nova_conductor:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Service_nova-conductor"
|
service_description: "Service_nova-conductor"
|
||||||
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
|
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_service_nova_consoleauth:
|
- check_service_nova_consoleauth:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Service_nova-consoleauth"
|
service_description: "Service_nova-consoleauth"
|
||||||
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
|
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_service_nova_scheduler:
|
- check_service_nova_scheduler:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Service_nova-scheduler"
|
service_description: "Service_nova-scheduler"
|
||||||
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
|
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_ceph_monitor_quorum:
|
- check_ceph_monitor_quorum:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "CEPH_quorum"
|
service_description: "CEPH_quorum"
|
||||||
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_ceph_storage_usage:
|
- check_ceph_storage_usage:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "CEPH_storage-usage"
|
service_description: "CEPH_storage-usage"
|
||||||
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
|
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_ceph_pgs_degradation:
|
- check_ceph_pgs_degradation:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "CEPH_PGs-degradation"
|
service_description: "CEPH_PGs-degradation"
|
||||||
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
|
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_ceph_osds_down:
|
- check_ceph_osds_down:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "CEPH_OSDs-down"
|
service_description: "CEPH_OSDs-down"
|
||||||
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
|
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_ceph_monitor_clock_skew:
|
- check_ceph_monitor_clock_skew:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "CEPH_Clock-skew"
|
service_description: "CEPH_Clock-skew"
|
||||||
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
|
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_fluentd_up:
|
- check_fluentd_up:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Fluentd_status"
|
service_description: "Fluentd_status"
|
||||||
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
|
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_etcd_high_http_deletes_failed:
|
- check_etcd_high_http_deletes_failed:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: ETCD_high-http-delete-failures
|
service_description: ETCD_high-http-delete-failures
|
||||||
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_etcd_high_http_get_failed:
|
- check_etcd_high_http_get_failed:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: ETCD_high-http-get-failures
|
service_description: ETCD_high-http-get-failures
|
||||||
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_etcd_high_http_updates_failed:
|
- check_etcd_high_http_updates_failed:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: ETCD_high-http-update-failures
|
service_description: ETCD_high-http-update-failures
|
||||||
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_felix_iptables_save_errors:
|
- check_felix_iptables_save_errors:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Calico_iptables-save-errors
|
service_description: Calico_iptables-save-errors
|
||||||
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
|
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_felix_ipset_errors:
|
- check_felix_ipset_errors:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Calico_ipset-errors
|
service_description: Calico_ipset-errors
|
||||||
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
|
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_felix_int_dataplane_iface_msg_batch_size:
|
- check_felix_int_dataplane_iface_msg_batch_size:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Calico_interface-message-batch-size
|
service_description: Calico_interface-message-batch-size
|
||||||
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
|
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_felix_int_dataplane_addr_msg_batch_size:
|
- check_felix_int_dataplane_addr_msg_batch_size:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Calico_address-message-batch-size
|
service_description: Calico_address-message-batch-size
|
||||||
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
|
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_felix_int_dataplane_failures:
|
- check_felix_int_dataplane_failures:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Calico_datapane_failures_high
|
service_description: Calico_datapane_failures_high
|
||||||
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
|
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_filespace_mounts-usage-rate-fullin4hrs:
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
|
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
|
||||||
check_command: check_filespace_mounts-usage-rate-fullin4hrs
|
check_command: check_filespace_mounts-usage-rate-fullin4hrs
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_filespace_mounts-usage:
|
- check_filespace_mounts-usage:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
service_description: "Filespace_mounts-usage"
|
service_description: "Filespace_mounts-usage"
|
||||||
check_command: check_filespace_mounts-usage
|
check_command: check_filespace_mounts-usage
|
||||||
check_interval: 1
|
check_interval: 60
|
||||||
- check_node_loadavg:
|
- check_node_loadavg:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: CPU_Load-average
|
service_description: CPU_Load-average
|
||||||
check_command: check_node_loadavg
|
check_command: check_node_loadavg
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_node_cpu_util:
|
- check_node_cpu_util:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: CPU_utilization
|
service_description: CPU_utilization
|
||||||
check_command: check_node_cpu_util
|
check_command: check_node_cpu_util
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_connections:
|
- check_network_connections:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_connections
|
service_description: Network_connections
|
||||||
check_command: check_network_connections
|
check_command: check_network_connections
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_memory_usage:
|
- check_memory_usage:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Memory_usage
|
service_description: Memory_usage
|
||||||
check_command: check_memory_usage
|
check_command: check_memory_usage
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_disk_write_latency:
|
- check_disk_write_latency:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Disk_write-latency
|
service_description: Disk_write-latency
|
||||||
check_command: check_disk_write_latency
|
check_command: check_disk_write_latency
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_disk_read_latency:
|
- check_disk_read_latency:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Disk_read-latency
|
service_description: Disk_read-latency
|
||||||
check_command: check_disk_read_latency
|
check_command: check_disk_read_latency
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_entropy_availability:
|
- check_entropy_availability:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Entropy_availability
|
service_description: Entropy_availability
|
||||||
check_command: check_entropy_availability
|
check_command: check_entropy_availability
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_filedescriptor_usage_rate:
|
- check_filedescriptor_usage_rate:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: FileDescriptors_usage-rate-high
|
service_description: FileDescriptors_usage-rate-high
|
||||||
check_command: check_filedescriptor_usage_rate
|
check_command: check_filedescriptor_usage_rate
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_hwmon_high_cpu_temp:
|
- check_hwmon_high_cpu_temp:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: HW_cpu-temp-high
|
service_description: HW_cpu-temp-high
|
||||||
check_command: check_hwmon_high_cpu_temp
|
check_command: check_hwmon_high_cpu_temp
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_receive_drop_high:
|
- check_network_receive_drop_high:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_receive-drop-high
|
service_description: Network_receive-drop-high
|
||||||
check_command: check_network_receive_drop_high
|
check_command: check_network_receive_drop_high
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_transmit_drop_high:
|
- check_network_transmit_drop_high:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_transmit-drop-high
|
service_description: Network_transmit-drop-high
|
||||||
check_command: check_network_transmit_drop_high
|
check_command: check_network_transmit_drop_high
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_receive_errors_high:
|
- check_network_receive_errors_high:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_receive-errors-high
|
service_description: Network_receive-errors-high
|
||||||
check_command: check_network_receive_errors_high
|
check_command: check_network_receive_errors_high
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_transmit_errors_high:
|
- check_network_transmit_errors_high:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_transmit-errors-high
|
service_description: Network_transmit-errors-high
|
||||||
check_command: check_network_transmit_errors_high
|
check_command: check_network_transmit_errors_high
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_vmstat_paging_rate:
|
- check_vmstat_paging_rate:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Memory_vmstat-paging-rate
|
service_description: Memory_vmstat-paging-rate
|
||||||
check_command: check_vmstat_paging_rate
|
check_command: check_vmstat_paging_rate
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_xfs_block_allocation:
|
- check_xfs_block_allocation:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: XFS_block-allocation
|
service_description: XFS_block-allocation
|
||||||
check_command: check_xfs_block_allocation
|
check_command: check_xfs_block_allocation
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_network_bond_status:
|
- check_network_bond_status:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Network_bondstatus
|
service_description: Network_bondstatus
|
||||||
check_command: check_network_bond_status
|
check_command: check_network_bond_status
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_numa_memory_usage:
|
- check_numa_memory_usage:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: Memory_NUMA-usage
|
service_description: Memory_NUMA-usage
|
||||||
check_command: check_numa_memory_usage
|
check_command: check_numa_memory_usage
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
- check_ntp_sync:
|
- check_ntp_sync:
|
||||||
use: generic-service
|
use: notifying_service
|
||||||
service_description: NTP_sync
|
service_description: NTP_sync
|
||||||
check_command: check_ntp_sync
|
check_command: check_ntp_sync
|
||||||
hostgroup_name: base-os
|
hostgroup_name: base-os
|
||||||
@ -633,9 +696,9 @@ conf:
|
|||||||
auto_rescheduling_interval: 30
|
auto_rescheduling_interval: 30
|
||||||
auto_rescheduling_window: 180
|
auto_rescheduling_window: 180
|
||||||
service_check_timeout: 60
|
service_check_timeout: 60
|
||||||
host_check_timeout: 30
|
host_check_timeout: 60
|
||||||
event_handler_timeout: 30
|
event_handler_timeout: 60
|
||||||
notification_timeout: 30
|
notification_timeout: 60
|
||||||
ocsp_timeout: 5
|
ocsp_timeout: 5
|
||||||
perfdata_timeout: 5
|
perfdata_timeout: 5
|
||||||
retain_state_information: 1
|
retain_state_information: 1
|
||||||
@ -649,7 +712,7 @@ conf:
|
|||||||
retained_process_service_attribute_mask: 0
|
retained_process_service_attribute_mask: 0
|
||||||
retained_contact_host_attribute_mask: 0
|
retained_contact_host_attribute_mask: 0
|
||||||
retained_contact_service_attribute_mask: 0
|
retained_contact_service_attribute_mask: 0
|
||||||
interval_length: 60
|
interval_length: 1
|
||||||
check_for_updates: 1
|
check_for_updates: 1
|
||||||
bare_update_check: 0
|
bare_update_check: 0
|
||||||
use_aggressive_host_checking: 0
|
use_aggressive_host_checking: 0
|
||||||
@ -677,7 +740,7 @@ conf:
|
|||||||
low_host_flap_threshold: 5.0
|
low_host_flap_threshold: 5.0
|
||||||
high_host_flap_threshold: 20.0
|
high_host_flap_threshold: 20.0
|
||||||
date_format: us
|
date_format: us
|
||||||
use_regexp_matching: 0
|
use_regexp_matching: 1
|
||||||
use_true_regexp_matching: 0
|
use_true_regexp_matching: 0
|
||||||
daemon_dumps_core: 0
|
daemon_dumps_core: 0
|
||||||
use_large_installation_tweaks: 0
|
use_large_installation_tweaks: 0
|
||||||
@ -687,3 +750,11 @@ conf:
|
|||||||
debug_file: /opt/nagios/var/nagios.debug
|
debug_file: /opt/nagios/var/nagios.debug
|
||||||
max_debug_file_size: 1000000
|
max_debug_file_size: 1000000
|
||||||
allow_empty_hostgroup_assignment: 1
|
allow_empty_hostgroup_assignment: 1
|
||||||
|
illegal_macro_output_chars: "`~$&|'<>\""
|
||||||
|
notification:
|
||||||
|
snmp:
|
||||||
|
primary_target: 127.0.0.1:15162
|
||||||
|
secondary_target: 127.0.0.1:15162
|
||||||
|
http:
|
||||||
|
primary_target: 127.0.0.1:3904/events
|
||||||
|
secondary_target: 127.0.0.1:3904/events
|
||||||
|
Loading…
x
Reference in New Issue
Block a user