Prometheus: Update pod container status alerts

This updates the Prometheus pod container status alerts. This
ensures there are alerts defined for ImagePullBackOff,
ErrImagePull, and CreateContainerConfigError errors.

This also updates the Nagios service checks to include correct
checks for those alerts

Change-Id: I91544e7dff8c6aac8c79cd8aa7d8f7bc03adaa9a
This commit is contained in:
Steve Wilkerson 2019-01-18 09:54:18 -06:00
parent 067a37f76f
commit 87ff958fb8
2 changed files with 37 additions and 3 deletions

View File

@ -526,6 +526,12 @@ conf:
service_description: "Daemonset_not-scheduled"
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
check_interval: 60
- check_daemonset_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Daemonset_pods-unavailable"
check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
check_interval: 60
- check_deployment_replicas_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -562,6 +568,18 @@ conf:
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_status_error_image_pull_backoff:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert! pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_status_error_container_config_error:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert! pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_error_crash_loop_back_off:
use: notifying_service
hostgroup_name: prometheus-hosts

View File

@ -1300,6 +1300,14 @@ conf:
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
- alert: daemonset_pods_unavailable
expr: kube_daemonset_status_number_unavailable > 0
for: 10m
labels:
severity: warning
annotations:
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
summary: 'Daemonset pods unavailable, due to one of many reasons'
- alert: deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
@ -1340,13 +1348,13 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
- alert: pod_status_error_image_pull_backoff
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: pod_error_crash_loop_back_off
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
@ -1356,6 +1364,14 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: pod_error_config_error
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m