From 4fdcff593cdddb5edb7229f2319aa2714c0461cc Mon Sep 17 00:00:00 2001 From: Steven Fitzpatrick Date: Thu, 2 Jan 2020 15:54:01 -0600 Subject: [PATCH] Fix incorrect prometheus alert names in nagios I noticed some nagios service checks were checking prometheus alerts which did not exist in our default prometheus configuration. In one case a prometheus alert did not match the naming convention of similar alerts. One nagios service check, ceph_monitor_clock_skew_high, does not have a corresponding alert at all, so I've changed it to check the node_ntp_clock_skew_high alert, where a node has the label ceph-mon="enabled". Change-Id: I2ebf9a4954190b8e2caefc8a61270e28bf24d9fa --- nagios/values.yaml | 20 ++++++++++---------- prometheus/values_overrides/kubernetes.yaml | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nagios/values.yaml b/nagios/values.yaml index 1603db1c2..30cbe721b 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -524,7 +524,7 @@ conf: } define service { - check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas + check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas check_interval 60 hostgroup_name prometheus-hosts service_description Prometheus_replica-count @@ -532,7 +532,7 @@ conf: } define service { - check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas + check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas check_interval 60 hostgroup_name prometheus-hosts service_description PrometheusAlertmanager_replica-count @@ -540,7 +540,7 @@ conf: } define service { - check_command 
check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas + check_command check_prom_alert!kube_statefulset_replicas_unavailable!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas check_interval 60 hostgroup_name prometheus-hosts service_description Statefulset_replica-count @@ -752,7 +752,7 @@ conf: } define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%' + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%' command_name check_memory_usage } @@ -782,22 +782,22 @@ conf: } define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.' + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.' 
command_name check_network_receive_drop_high } define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.' + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.' command_name check_network_transmit_drop_high } define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.' + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.' command_name check_network_receive_errors_high } define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.' + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' 
--ok_message 'OK- network transmission errors not high.' command_name check_network_transmit_errors_high } @@ -990,7 +990,7 @@ conf: } define service { - check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists + check_command check_prom_alert!ceph_mon_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_quorum @@ -1022,7 +1022,7 @@ conf: } define service { - check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds + check_command check_prom_alert_with_labels!node_ntp_clock_skew_high!ceph-mon="enabled"!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_Clock-skew diff --git a/prometheus/values_overrides/kubernetes.yaml b/prometheus/values_overrides/kubernetes.yaml index dd15f1a3e..638722a82 100644 --- a/prometheus/values_overrides/kubernetes.yaml +++ b/prometheus/values_overrides/kubernetes.yaml @@ -321,7 +321,7 @@ conf: annotations: description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' - - alert: pod_error_image_pull + - alert: pod_status_error_image_pull expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 for: 10m labels: