Prometheus: Update pod container status alerts
This updates the Prometheus pod container status alerts. This ensures there are alerts defined for ImagePullBackOff, ErrImagePull, and CreateContainerConfigError errors. This also updates the Nagios service checks to include correct checks for those alerts. Change-Id: I91544e7dff8c6aac8c79cd8aa7d8f7bc03adaa9a
This commit is contained in:
parent
067a37f76f
commit
87ff958fb8
@ -526,6 +526,12 @@ conf:
|
||||
service_description: "Daemonset_not-scheduled"
|
||||
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
||||
check_interval: 60
|
||||
# Nagios service check tied to the daemonset_pods_unavailable Prometheus alert.
# check_command passes, '!'-separated: the alert name, the CRITICAL message
# template ({daemonset} placeholder), and the OK message.
- check_daemonset_unavailable:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Daemonset_pods-unavailable"
|
||||
check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
|
||||
check_interval: 60
|
||||
- check_deployment_replicas_unavailable:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
@ -562,6 +568,18 @@ conf:
|
||||
service_description: "Pod_status-error-image-pull"
|
||||
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
# Nagios service check tied to the pod_status_error_image_pull_backoff
# Prometheus alert. service_description is kept distinct from the ErrImagePull
# check so the two services do not collide on the same hostgroup, and the alert
# name follows '!' directly (no leading blank), matching the sibling checks.
- check_pod_status_error_image_pull_backoff:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Pod_status-error-image-pull-backoff"
|
||||
check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
# Nagios service check tied to the pod_error_config_error Prometheus alert
# (CreateContainerConfigError). service_description names this condition rather
# than reusing the image-pull description, and the alert name follows '!'
# directly (no leading blank), matching the sibling checks.
- check_pod_status_error_container_config_error:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Pod_status-error-container-config"
|
||||
check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
- check_pod_error_crash_loop_back_off:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
|
@ -1300,6 +1300,14 @@ conf:
|
||||
annotations:
|
||||
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
||||
summary: 'Less than desired number of daemonsets scheduled'
|
||||
# Fires when kube_daemonset_status_number_unavailable has stayed above 0 for
# 10 minutes; labelled severity "warning".
- alert: daemonset_pods_unavailable
|
||||
expr: kube_daemonset_status_number_unavailable > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
|
||||
summary: 'Daemonset pods unavailable, due to one of many reasons'
|
||||
- alert: deployment_replicas_unavailable
|
||||
expr: kube_deployment_status_replicas_unavailable > 0
|
||||
for: 10m
|
||||
@ -1340,13 +1348,13 @@ conf:
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_status_error_image_pull
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
|
||||
- alert: pod_status_error_image_pull_backoff
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_error_crash_loop_back_off
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
||||
@ -1356,6 +1364,14 @@ conf:
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
# Fires when a container has reported waiting reason CreateContainerConfigError
# (kube_pod_container_status_waiting_reason == 1) for 10 minutes; labelled
# severity "page".
- alert: pod_error_config_error
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: replicaset_missing_replicas
|
||||
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
||||
for: 10m
|
||||
|
Loading…
x
Reference in New Issue
Block a user