Prometheus: Update pod container status alerts

This updates the Prometheus pod container status alerts. This ensures there are alerts defined for ImagePullBackOff, ErrImagePull, and CreateContainerConfigError errors. This also updates the Nagios service checks to include correct checks for those alerts. Change-Id: I91544e7dff8c6aac8c79cd8aa7d8f7bc03adaa9a
2019-01-18 09:54:18 -06:00 · 2019-01-18 09:54:18 -06:00 · 87ff958fb8
commit 87ff958fb8
parent 067a37f76f
2 changed files with 37 additions and 3 deletions
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@ -526,6 +526,12 @@ conf:
          service_description: "Daemonset_not-scheduled"
          check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
          check_interval: 60
+      - check_daemonset_unavailable:  # Nagios service check driven by the daemonset_pods_unavailable Prometheus alert
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Daemonset_pods-unavailable"
+          check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
+          check_interval: 60  # NOTE(review): unit depends on the Nagios interval_length setting, not shown here - confirm
      - check_deployment_replicas_unavailable:
          use: notifying_service
          hostgroup_name: prometheus-hosts
@ -562,6 +568,18 @@ conf:
          service_description: "Pod_status-error-image-pull"
          check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
          check_interval: 60
+      - check_pod_status_error_image_pull_backoff:  # Nagios service check driven by the pod_status_error_image_pull_backoff Prometheus alert
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-image-pull-backoff"  # must be unique per host; previously collided with the ErrImagePull check
+          check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
+      - check_pod_status_error_container_config_error:  # Nagios service check driven by the pod_error_config_error Prometheus alert
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-container-config-error"  # must be unique per host; previously collided with the ErrImagePull check
+          check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
      - check_pod_error_crash_loop_back_off:
          use: notifying_service
          hostgroup_name: prometheus-hosts
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@ -1300,6 +1300,14 @@ conf:
            annotations:
              description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
              summary: 'Less than desired number of daemonsets scheduled'
+          - alert: daemonset_pods_unavailable  # referenced by the Nagios check_daemonset_unavailable service check
+            expr: kube_daemonset_status_number_unavailable > 0
+            for: 10m  # expression must stay true for 10 minutes before the alert fires
+            labels:
+              severity: warning  # NOTE(review): the paired Nagios check reports CRITICAL and sibling pod alerts use "page" - confirm intended severity
+            annotations:
+              description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
+              summary: 'Daemonset pods unavailable, due to one of many reasons'
          - alert: deployment_replicas_unavailable
            expr: kube_deployment_status_replicas_unavailable > 0
            for: 10m
@ -1340,13 +1348,13 @@ conf:
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: pod_status_error_image_pull
-            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
+          - alert: pod_status_error_image_pull_backoff
+            expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
            for: 10m
            labels:
              severity: page
            annotations:
-              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
          - alert: pod_error_crash_loop_back_off
            expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
@ -1356,6 +1364,14 @@ conf:
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff  error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: pod_error_config_error  # referenced by the Nagios check_pod_status_error_container_config_error service check
+            expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
+            for: 10m  # expression must stay true for 10 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
+              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
          - alert: replicaset_missing_replicas
            expr:  kube_replicaset_spec_replicas -  kube_replicaset_status_ready_replicas > 0
            for: 10m