Add Nagios service checks and matching Prometheus alert rules for:
  * daemonsets with unavailable pods   (alert: daemonset_pods_unavailable)
  * pods stuck in ImagePullBackOff     (alert: pod_status_error_image_pull_backoff)
  * pods stuck in CreateContainerConfigError (alert: pod_error_config_error)

Each Nagios service carries its own service_description, and the alert name
passed as the first "!"-separated argument of check_prom_alert is written
without surrounding whitespace so it matches the Prometheus alert name.

NOTE(review): leading indentation of the YAML inside the hunks was
reconstructed from a whitespace-collapsed copy of this patch - confirm it
against the target files (or apply with --ignore-whitespace).

diff --git a/nagios/values.yaml b/nagios/values.yaml
index e6daf7609..74e2da9fe 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -526,6 +526,12 @@ conf:
           service_description: "Daemonset_not-scheduled"
           check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
           check_interval: 60
+      - check_daemonset_unavailable:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Daemonset_pods-unavailable"
+          check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
+          check_interval: 60
       - check_deployment_replicas_unavailable:
           use: notifying_service
           hostgroup_name: prometheus-hosts
@@ -562,6 +568,18 @@ conf:
           service_description: "Pod_status-error-image-pull"
           check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
           check_interval: 60
+      - check_pod_status_error_image_pull_backoff:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-image-pull-backoff"
+          check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
+      - check_pod_status_error_container_config_error:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-container-config-error"
+          check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
       - check_pod_error_crash_loop_back_off:
           use: notifying_service
           hostgroup_name: prometheus-hosts
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 28ce99e46..e3675b507 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1300,6 +1300,14 @@ conf:
             annotations:
               description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
               summary: 'Less than desired number of daemonsets scheduled'
+          - alert: daemonset_pods_unavailable
+            expr: kube_daemonset_status_number_unavailable > 0
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
+              summary: 'Daemonset pods unavailable, due to one of many reasons'
           - alert: deployment_replicas_unavailable
             expr: kube_deployment_status_replicas_unavailable > 0
             for: 10m
@@ -1340,13 +1348,13 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: pod_status_error_image_pull
-            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
+          - alert: pod_status_error_image_pull_backoff
+            expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
             for: 10m
             labels:
               severity: page
             annotations:
-              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
           - alert: pod_error_crash_loop_back_off
             expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
@@ -1356,6 +1364,14 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: pod_error_config_error
+            expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
+              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
           - alert: replicaset_missing_replicas
             expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
             for: 10m