fbd34421f2
This updates the Prometheus chart to support federation. This moves to defining the Prometheus configuration file via a template in the values.yaml file instead of through raw yaml. This allows for overriding the chart's default configuration wholesale, as this would be required for a hierarchical federated setup. This also strips out all of the default rules defined in the chart for the same reason. There are example rules defined for the various aspects of OSH's infrastructure in the prometheus/values_overrides directory that are executed as part of the normal CI jobs. This also adds a nonvoting federated-monitoring job that vets out the ability to federate prometheus in a hierarchical fashion with extremely basic overrides Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26 Signed-off-by: Steve Wilkerson <sw5822@att.com>
380 lines
20 KiB
YAML
380 lines
20 KiB
YAML
conf:
|
|
prometheus:
|
|
rules:
|
|
kubernetes:
|
|
groups:
|
|
- name: calico.rules
|
|
rules:
|
|
- alert: prom_exporter_calico_unavailable
|
|
expr: absent(felix_host)
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: Calico exporter is not collecting metrics or is not available for past 10 minutes
|
|
title: Calico exporter is not collecting metrics or is not available
|
|
- alert: calico_datapane_failures_high_1h
|
|
expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
|
|
summary: 'A high number of dataplane failures within Felix are happening'
|
|
- alert: calico_datapane_address_msg_batch_size_high_5m
|
|
expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
|
|
for: 5m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
|
|
summary: 'Felix address message batch size is higher'
|
|
- alert: calico_datapane_iface_msg_batch_size_high_5m
|
|
expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
|
|
for: 5m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
|
|
summary: 'Felix interface message batch size is higher'
|
|
- alert: calico_ipset_errors_high_1h
|
|
expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
|
|
summary: 'A high number of ipset errors within Felix are happening'
|
|
- alert: calico_iptable_save_errors_high_1h
|
|
expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
|
|
summary: 'A high number of iptable save errors within Felix are happening'
|
|
- alert: calico_iptable_restore_errors_high_1h
|
|
expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
|
|
summary: 'A high number of iptable restore errors within Felix are happening'
|
|
- name: etcd3.rules
|
|
rules:
|
|
- alert: etcd_InsufficientMembers
|
|
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
|
for: 3m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: If one more etcd member goes down the cluster will be unavailable
|
|
summary: etcd cluster insufficient members
|
|
- alert: etcd_NoLeader
|
|
expr: etcd_server_has_leader{job="etcd"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: etcd member {{ $labels.instance }} has no leader
|
|
summary: etcd member has no leader
|
|
- alert: etcd_HighNumberOfLeaderChanges
|
|
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
|
|
summary: a high number of leader changes within the etcd cluster are happening
|
|
- alert: etcd_HighNumberOfFailedGRPCRequests
|
|
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
|
|
summary: a high number of gRPC requests are failing
|
|
- alert: etcd_HighNumberOfFailedGRPCRequests
|
|
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
|
|
summary: a high number of gRPC requests are failing
|
|
- alert: etcd_GRPCRequestsSlow
|
|
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
|
|
summary: slow gRPC requests
|
|
- alert: etcd_HighNumberOfFailedHTTPRequests
|
|
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
|
summary: a high number of HTTP requests are failing
|
|
- alert: etcd_HighNumberOfFailedHTTPRequests
|
|
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
|
summary: a high number of HTTP requests are failing
|
|
- alert: etcd_HTTPRequestsSlow
|
|
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
|
|
summary: slow HTTP requests
|
|
- alert: etcd_EtcdMemberCommunicationSlow
|
|
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
|
|
summary: etcd member communication is slow
|
|
- alert: etcd_HighNumberOfFailedProposals
|
|
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
|
|
summary: a high number of proposals within the etcd cluster are failing
|
|
- alert: etcd_HighFsyncDurations
|
|
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: etcd instance {{ $labels.instance }} fync durations are high
|
|
summary: high fsync durations
|
|
- alert: etcd_HighCommitDurations
|
|
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: etcd instance {{ $labels.instance }} commit durations are high
|
|
summary: high commit durations
|
|
- name: kubelet.rules
|
|
rules:
|
|
- alert: K8SNodeNotReady
|
|
expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute
|
|
summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}'
|
|
- alert: K8SManyNodesNotReady
|
|
expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
|
|
summary: Many Kubernetes nodes are Not Ready
|
|
- alert: K8SManyNodesNotReady
|
|
expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
|
|
summary: Many Kubernetes nodes are Not Ready
|
|
- alert: K8SNodesNotReady
|
|
expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: '{{ $value }} nodes are notReady state.'
|
|
summary: One or more Kubernetes nodes are Not Ready
|
|
- alert: K8SKubeletDown
|
|
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: Prometheus failed to scrape {{ $value }}% of kubelets.
|
|
summary: Many Kubelets cannot be scraped
|
|
- alert: K8SKubeletDown
|
|
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
|
|
summary: Many Kubelets cannot be scraped
|
|
- alert: K8SKubeletTooManyPods
|
|
expr: kubelet_running_pod_count > 100
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
|
|
summary: Kubelet is close to pod limit
|
|
- name: kube-apiserver.rules
|
|
rules:
|
|
- alert: K8SApiserverDown
|
|
expr: absent(up{job="apiserver"} == 1)
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
|
|
summary: API server unreachable
|
|
- alert: K8SApiServerLatency
|
|
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
|
|
summary: Kubernetes apiserver latency is high
|
|
- name: kube-controller-manager.rules
|
|
rules:
|
|
- alert: K8SControllerManagerDown
|
|
expr: absent(up{job="kube-controller-manager-discovery"} == 1)
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
|
|
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
|
|
summary: Controller manager is down
|
|
- name: kubernetes-object.rules
|
|
rules:
|
|
- alert: prom_exporter_kube_state_metrics_unavailable
|
|
expr: absent(kube_node_info)
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
|
|
title: kube-state-metrics exporter is not collecting metrics or is not available
|
|
- alert: kube_statefulset_replicas_unavailable
|
|
expr: kube_statefulset_status_replicas < kube_statefulset_replicas
|
|
for: 5m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
|
|
summary: '{{$labels.statefulset}}: has inssuficient replicas.'
|
|
- alert: daemonsets_misscheduled
|
|
expr: kube_daemonset_status_number_misscheduled > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
|
|
summary: 'Daemonsets not scheduled correctly'
|
|
- alert: daemonsets_not_scheduled
|
|
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
|
summary: 'Less than desired number of daemonsets scheduled'
|
|
- alert: daemonset_pods_unavailable
|
|
expr: kube_daemonset_status_number_unavailable > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
|
|
summary: 'Daemonset pods unavailable, due to one of many reasons'
|
|
- alert: deployment_replicas_unavailable
|
|
expr: kube_deployment_status_replicas_unavailable > 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
|
|
summary: '{{$labels.deployment}}: has inssuficient replicas.'
|
|
- alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
|
|
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
|
|
summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
|
|
- alert: job_status_failed
|
|
expr: kube_job_status_failed > 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Job {{$labels.exported_job}} is in failed status'
|
|
summary: '{{$labels.exported_job}} has failed status'
|
|
- alert: pod_status_pending
|
|
expr: kube_pod_status_phase{phase="Pending"} == 1
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
|
|
- alert: pod_error_image_pull
|
|
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
|
- alert: pod_status_error_image_pull_backoff
|
|
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
|
- alert: pod_error_crash_loop_back_off
|
|
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
|
- alert: pod_error_config_error
|
|
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
|
- alert: replicaset_missing_replicas
|
|
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
|
|
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
|
|
- alert: pod_container_terminated
|
|
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
|
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
|
- alert: volume_claim_capacity_high_utilization
|
|
expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80
|
|
for: 5m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
|
|
summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
|