fb7fc87d23
This change allows us to substitute values into our rules files.

Example:

    - alert: my_region_is_down
      expr: up{region="{{ $my_region }}"} == 0

To support this change, rule annotations that used the expansion
{{ $labels.foo }} had to be surrounded with "{{` ... `}}" to render
correctly.

Change-Id: Ia7ac891de8261acca62105a3e2636bd747a5fbea
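As a minimal sketch of the behaviour the message describes, assuming the rules are rendered with Go/Helm templating: the `my_region` name comes from the example above, while the description annotation and the rendered value "region-one" are added here purely for illustration.

    # As written in the values file after this change:
    - alert: my_region_is_down
      # {{ $my_region }} is substituted when the chart renders the rules file;
      # how that value is supplied is outside the scope of this file.
      expr: up{region="{{ $my_region }}"} == 0
      annotations:
        # The outer {{` ... `}} is a Go-template raw string, so the renderer
        # emits the inner text verbatim and Prometheus expands
        # {{ $labels.instance }} only at alert evaluation time.
        description: "{{`Instance {{ $labels.instance }} is down`}}"

    # After rendering (assuming my_region resolves to "region-one"), the rule
    # Prometheus actually loads looks like:
    - alert: my_region_is_down
      expr: up{region="region-one"} == 0
      annotations:
        description: "Instance {{ $labels.instance }} is down"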
---
conf:
  prometheus:
    rules:
      kubernetes:
        groups:
          - name: calico.rules
            rules:
              - alert: prom_exporter_calico_unavailable
                expr: avg_over_time(up{job="kubernetes-pods",application="calico"}[5m]) == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Calico exporter is not collecting metrics or is not available for the past 10 minutes
                  title: Calico exporter is not collecting metrics or is not available
              - alert: calico_datapane_failures_high_1h
                expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour`}}"
                  summary: A high number of dataplane failures within Felix are happening
              - alert: calico_datapane_address_msg_batch_size_high_5m
                expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size`}}"
                  summary: Felix address message batch size is high
              - alert: calico_datapane_iface_msg_batch_size_high_5m
                expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size`}}"
                  summary: Felix interface message batch size is high
              - alert: calico_ipset_errors_high_1h
                expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour`}}"
                  summary: A high number of ipset errors within Felix are happening
              - alert: calico_iptable_save_errors_high_1h
                expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour`}}"
                  summary: A high number of iptable save errors within Felix are happening
              - alert: calico_iptable_restore_errors_high_1h
                expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour`}}"
                  summary: A high number of iptable restore errors within Felix are happening
          - name: etcd3.rules
            rules:
              - alert: etcd_InsufficientMembers
                expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
                for: 3m
                labels:
                  severity: critical
                annotations:
                  description: If one more etcd member goes down, the cluster will be unavailable
                  summary: etcd cluster insufficient members
              - alert: etcd_NoLeader
                expr: etcd_server_has_leader{job="etcd"} == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`etcd member {{ $labels.instance }} has no leader`}}"
                  summary: etcd member has no leader
              - alert: etcd_HighNumberOfLeaderChanges
                expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
                labels:
                  severity: warning
                annotations:
                  description: "{{`etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour`}}"
                  summary: a high number of leader changes within the etcd cluster are happening
              - alert: etcd_HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}`}}"
                  summary: a high number of gRPC requests are failing
              - alert: etcd_HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: "{{`{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}`}}"
                  summary: a high number of gRPC requests are failing
              - alert: etcd_GRPCRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: critical
                annotations:
                  description: "{{`on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow`}}"
                  summary: slow gRPC requests
              - alert: etcd_HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}`}}"
                  summary: a high number of HTTP requests are failing
              - alert: etcd_HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: "{{`{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}`}}"
                  summary: a high number of HTTP requests are failing
              - alert: etcd_HTTPRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow`}}"
                  summary: slow HTTP requests
              - alert: etcd_EtcdMemberCommunicationSlow
                expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow`}}"
                  summary: etcd member communication is slow
              - alert: etcd_HighNumberOfFailedProposals
                expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
                labels:
                  severity: warning
                annotations:
                  description: "{{`etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour`}}"
                  summary: a high number of proposals within the etcd cluster are failing
              - alert: etcd_HighFsyncDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`etcd instance {{ $labels.instance }} fsync durations are high`}}"
                  summary: high fsync durations
              - alert: etcd_HighCommitDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`etcd instance {{ $labels.instance }} commit durations are high`}}"
                  summary: high commit durations
          - name: kubelet.rules
            rules:
              - alert: K8SNodeNotReady
                expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute`}}"
                  summary: "{{`{{ $labels.node }} Node status is NotReady and {{ $labels.status }}`}}"
              - alert: K8SManyNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.`}}"
                  summary: Many Kubernetes nodes are Not Ready
              - alert: K8SManyNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.`}}"
                  summary: Many Kubernetes nodes are Not Ready
              - alert: K8SNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`{{ $value }} nodes are in the NotReady state.`}}"
                  summary: One or more Kubernetes nodes are Not Ready
              - alert: K8SKubeletDown
                expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`Prometheus failed to scrape {{ $value }}% of kubelets.`}}"
                  summary: Many Kubelets cannot be scraped
              - alert: K8SKubeletDown
                expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: "{{`Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.`}}"
                  summary: Many Kubelets cannot be scraped
              - alert: K8SKubeletTooManyPods
                expr: kubelet_running_pod_count > 100
                labels:
                  severity: warning
                annotations:
                  description: "{{`Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110`}}"
                  summary: Kubelet is close to pod limit
          - name: kube-apiserver.rules
            rules:
              - alert: K8SApiserverDown
                expr: absent(up{job="apiserver"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
                  summary: API server unreachable
              - alert: K8SApiServerLatency
                expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.`}}"
                  summary: Kubernetes apiserver latency is high
          - name: kube-controller-manager.rules
            rules:
              - alert: K8SControllerManagerDown
                expr: absent(up{job="kube-controller-manager-discovery"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
                  runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
                  summary: Controller manager is down
          - name: kubernetes-object.rules
            rules:
              - alert: prom_exporter_kube_state_metrics_unavailable
                expr: avg_over_time(up{job="kube-state-metrics"}[5m]) == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: kube-state-metrics exporter is not collecting metrics or is not available for the past 10 minutes
                  title: kube-state-metrics exporter is not collecting metrics or is not available
              - alert: kube_statefulset_replicas_unavailable
                expr: kube_statefulset_status_replicas < kube_statefulset_replicas
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: "{{`statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired`}}"
                  summary: "{{`{{$labels.statefulset}}: has insufficient replicas.`}}"
              - alert: daemonsets_misscheduled
                expr: kube_daemonset_status_number_misscheduled > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`Daemonset {{$labels.daemonset}} is running where it is not supposed to run`}}"
                  summary: Daemonsets not scheduled correctly
              - alert: daemonsets_not_scheduled
                expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`Daemonset {{$labels.daemonset}} has {{ $value }} fewer pods scheduled than desired`}}"
                  summary: Less than the desired number of daemonset pods are scheduled
              - alert: daemonset_pods_unavailable
                expr: kube_daemonset_status_number_unavailable > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "{{`Daemonset {{$labels.daemonset}} currently has pods unavailable`}}"
                  summary: Daemonset pods are unavailable, due to one of many reasons
              - alert: deployment_replicas_unavailable
                expr: kube_deployment_status_replicas_unavailable > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`deployment {{$labels.deployment}} has {{$value}} replicas unavailable`}}"
                  summary: "{{`{{$labels.deployment}}: has insufficient replicas.`}}"
              - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
                expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update`}}"
                  summary: "{{`{{$labels.deployment}}: has insufficient replicas during a rolling update.`}}"
              - alert: job_status_failed
                expr: kube_job_status_failed > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Job {{$labels.exported_job}} is in failed status`}}"
                  summary: "{{`{{$labels.exported_job}} has failed status`}}"
              - alert: pod_status_pending
                expr: kube_pod_status_phase{phase="Pending"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status`}}"
              - alert: pod_status_error_image_pull
                expr: kube_pod_container_status_waiting_reason{reason="ErrImagePull"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an image pull error for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
              - alert: pod_status_error_image_pull_backoff
                expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
              - alert: pod_error_crash_loop_back_off
                expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
              - alert: pod_error_config_error
                expr: kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
              - alert: replicaset_missing_replicas
                expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Replicaset {{$labels.replicaset}} is missing the desired number of replicas for more than 10 minutes`}}"
                  summary: "{{`Replicaset {{$labels.replicaset}} is missing replicas`}}"
              - alert: pod_container_terminated
                expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes`}}"
                  summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
              - alert: volume_claim_capacity_high_utilization
                expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: "{{`volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity`}}"
                  summary: "{{`{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.`}}"
...