openstack-helm-infra/prometheus/values.yaml
Steve Wilkerson fae7f98c01 Update prometheus service discovery for openstack-exporter
This updates the Prometheus service discovery configuration to define
the openstack-exporter service discovery separately from the other
services. This allows the instance label to be relabeled for the
openstack-exporter service, removing the potential for multiple data
series being returned by the single-stat panels in the Grafana
dashboards for the OpenStack services. As the other services perform
as expected when exporter pods restart, they remain configured as
before.

Change-Id: Iad4c56d31fb553a9629f5a6fd1eac5464207add4
Signed-off-by: Steve Wilkerson <wilkers.steve@gmail.com>
2018-05-15 14:39:43 -05:00


# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for prometheus.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value
images:
tags:
prometheus: docker.io/prom/prometheus:v2.0.0
helm_tests: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
prometheus:
node_selector_key: openstack-control-plane
node_selector_value: enabled
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
pod:
affinity:
anti:
type:
default: preferredDuringSchedulingIgnoredDuringExecution
topologyKey:
default: kubernetes.io/hostname
mounts:
prometheus:
prometheus:
init_container: null
replicas:
prometheus: 1
lifecycle:
upgrades:
revision_history: 3
pod_replacement_strategy: RollingUpdate
rolling_update:
max_unavailable: 1
max_surge: 3
termination_grace_period:
prometheus:
timeout: 30
resources:
enabled: false
prometheus:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "500m"
jobs:
image_repo_sync:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
tests:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
monitoring:
name: prometheus
namespace: null
hosts:
default: prom-metrics
public: prometheus
host_fqdn_override:
default: null
path:
default: null
scheme:
default: 'http'
port:
api:
default: 9090
public: 80
alerts:
name: alertmanager
namespace: null
hosts:
default: alerts-engine
public: alertmanager
discovery: alertmanager-discovery
host_fqdn_override:
default: null
path:
default: null
scheme:
default: 'http'
port:
api:
default: 9093
public: 80
mesh:
default: 6783
dependencies:
dynamic:
common:
local_image_registry:
jobs:
- prometheus-image-repo-sync
services:
- endpoint: node
service: local_image_registry
static:
image_repo_sync:
services:
- endpoint: internal
service: local_image_registry
prometheus:
services: null
monitoring:
prometheus:
enabled: true
prometheus:
scrape: true
network:
prometheus:
ingress:
public: true
classes:
namespace: "nginx"
cluster: "nginx-cluster"
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
node_port:
enabled: false
port: 30900
storage:
enabled: true
pvc:
name: prometheus-pvc
access_mode: [ "ReadWriteOnce" ]
requests:
storage: 5Gi
storage_class: general
manifests:
configmap_bin: true
configmap_etc: true
ingress: true
helm_tests: true
job_image_repo_sync: true
service_ingress: true
service: true
statefulset_prometheus: true
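# The manifests map above toggles which Kubernetes objects the chart renders. A minimal
# usage sketch with the standard Helm CLI (release and namespace names are placeholders):
#   helm upgrade --install prometheus ./prometheus \
#     --namespace osh-infra \
#     --values overrides.yaml \
#     --set manifests.ingress=false \
#     --set manifests.service_ingress=false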
conf:
prometheus:
# Consumed by a prometheus helper function to generate the command line flags
# for configuring the prometheus service
command_line_flags:
log.level: info
query.max_concurrency: 20
query.timeout: 2m
storage.tsdb.path: /var/lib/prometheus/data
storage.tsdb.retention: 7d
storage.tsdb.min_block_duration: 2h
storage.tsdb.max_block_duration: 2h
web.enable_admin_api: false
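# A sketch of the flags the helper is expected to render from the map above, assuming a
# simple key/value to "--key=value" mapping onto Prometheus v2 flag names (the exact
# output is produced by the chart's helper and may differ in detail):
#   --log.level=info
#   --query.max-concurrency=20
#   --query.timeout=2m
#   --storage.tsdb.path=/var/lib/prometheus/data
#   --storage.tsdb.retention=7d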
scrape_configs:
global:
scrape_interval: 60s
evaluation_interval: 60s
scrape_configs:
- job_name: kubelet
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
scrape_interval: 45s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
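# Net effect of the relabeling above: kubelet scrapes are proxied through the API server.
# For a node named "node-1" (hypothetical), the target becomes
#   https://kubernetes.default.svc:443/api/v1/nodes/node-1/proxy/metrics
# and the node's Kubernetes labels are copied onto the series by the labelmap action.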
# Scrape config for Kubelet cAdvisor.
#
# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
# (those whose names begin with 'container_') have been removed from the
# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
# retrieve those metrics.
#
# In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
# HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
# in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
# the --cadvisor-port=0 Kubelet flag).
#
# This job is not necessary and should be removed in Kubernetes 1.6 and
# earlier versions, or it will cause the metrics to be scraped twice.
- job_name: 'kubernetes-cadvisor'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
scrape_interval: 45s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
metric_relabel_configs:
- action: replace
source_labels:
- id
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels:
- id
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
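# Example of the metric relabeling above, with a hypothetical cgroup id: a series with
# id="/system.slice/docker.service" gains systemd_service_name="docker", while the first
# rule rewrites rkt machine-slice ids into a rkt_container_name label.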
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'apiserver'
kubernetes_sd_configs:
- role: endpoints
scrape_interval: 45s
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
# insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port. This
# will add targets for each API server for which Kubernetes adds an endpoint to
# the default/kubernetes service.
relabel_configs:
- source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep
regex: default;kubernetes;https
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
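# A hypothetical Service exposing metrics to these jobs would carry annotations such as:
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/scheme: "http"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "9103"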
- job_name: 'openstack-exporter'
kubernetes_sd_configs:
- role: endpoints
scrape_interval: 60s
relabel_configs:
- source_labels:
- __meta_kubernetes_service_name
action: keep
regex: "openstack-metrics"
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
action: keep
regex: true
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
action: replace
target_label: __scheme__
regex: (https?)
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
action: replace
target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
action: replace
target_label: instance
- source_labels:
- __meta_kubernetes_service_name
action: replace
target_label: kubernetes_name
- source_labels:
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
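# Note: unlike the generic kubernetes-service-endpoints job below, this job also rewrites
# the instance label to the service name ("openstack-metrics"), so exporter pod restarts
# do not produce a second data series in the Grafana single-stat panels.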
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
scrape_interval: 60s
relabel_configs:
- source_labels:
- __meta_kubernetes_service_name
action: drop
regex: "openstack-metrics"
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
action: keep
regex: true
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
action: replace
target_label: __scheme__
regex: (https?)
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
action: replace
target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
action: replace
target_label: kubernetes_name
- source_labels:
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
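# Example of the __address__ rewrite used by both jobs above (addresses are hypothetical):
# a discovered target 10.0.0.5:8080 with annotation prometheus.io/port: "9102" is joined
# as "10.0.0.5:8080;9102" and rewritten by the regex to 10.0.0.5:9102.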
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
# pod's declared ports (default is a port-free target if none are declared).
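# A hypothetical pod opting into this job would set, in its metadata:
#   annotations:
#     prometheus.io/scrape: "true"
#     prometheus.io/port: "8080"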
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: calico-etcd
honor_labels: false
kubernetes_sd_configs:
- role: service
scrape_interval: 20s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: keep
source_labels:
- __meta_kubernetes_service_name
regex: "calico-etcd"
- action: keep
source_labels:
- __meta_kubernetes_namespace
regex: kube-system
target_label: namespace
- source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- source_labels:
- __meta_kubernetes_service_name
target_label: service
- source_labels:
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
- source_labels:
- __meta_kubernetes_service_label
target_label: job
regex: calico-etcd
replacement: ${1}
- target_label: endpoint
replacement: "calico-etcd"
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_application]
regex: alertmanager
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_name]
regex: alerts-api
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_name]
regex: peer-mesh
action: drop
rules:
alertmanager:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
summary: Alertmanager configurations are inconsistent
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
summary: Alertmanager down or not discovered
- alert: FailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
summary: Alertmanager configuration reload has failed
etcd3:
groups:
- name: etcd3.rules
rules:
- alert: etcd_InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down, the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: etcd_NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: etcd_HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: etcd_HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: etcd_HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: etcd_GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
summary: slow gRPC requests
- alert: etcd_HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: etcd_HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: etcd_HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
summary: slow HTTP requests
- alert: etcd_EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: etcd_HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: etcd_HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: etcd_HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
kube_apiserver:
groups:
- name: kube-apiserver.rules
rules:
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
summary: API server unreachable
- alert: K8SApiServerLatency
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
for: 10m
labels:
severity: warning
annotations:
description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
summary: Kubernetes apiserver latency is high
kube_controller_manager:
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager-discovery"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
kubelet:
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_ready{condition="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletDown
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
summary: Kubelet is close to pod limit
kubernetes:
groups:
- name: kubernetes.rules
rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.9"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: kube_statefulset_replicas_unavailable
expr: kube_statefulset_status_replicas < kube_statefulset_replicas
for: 5m
labels:
severity: page
annotations:
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
summary: '{{$labels.statefulset}}: has insufficient replicas.'
- alert: kube_daemonsets_misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
summary: 'Daemonsets not scheduled correctly'
- alert: kube_daemonsets_not_scheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: 'Daemonset {{$labels.daemonset}} has {{ $value }} fewer pods scheduled than desired'
summary: 'Fewer daemonset pods scheduled than desired'
- alert: kube_deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
labels:
severity: page
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
summary: '{{$labels.deployment}}: has insufficient replicas.'
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
for: 10m
labels:
severity: page
annotations:
description: 'deployment {{$labels.deployment}} has fewer replicas available than the max unavailable specified for its rolling update ({{$value}})'
summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
- alert: kube_job_status_failed
expr: kube_job_status_failed > 0
for: 10m
labels:
severity: page
annotations:
description: 'Job {{$labels.exported_job}} is in failed status'
summary: '{{$labels.exported_job}} has failed status'
- alert: kube_pod_status_pending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: kube_pod_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m
labels:
severity: page
annotations:
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
- alert: kube_pod_container_terminated
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
basic_linux:
groups:
- name: basic_linux.rules
rules:
- alert: node_filesystem_full_80percent
expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"}
* 0.2) / 1024 ^ 3
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} has less than 20% space left on its filesystem.'
summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
- alert: node_filesystem_full_in_4h
expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours'
summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
- alert: node_filedescriptors_full_in_3h
expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
for: 20m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running out of available file descriptors
in approx. 3 hours'
summary: '{{$labels.alias}} is running out of available file descriptors in
3 hours.'
- alert: node_load1_90percent
expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running with > 90% total load for at least
1h.'
summary: '{{$labels.alias}}: Running on high load.'
- alert: node_cpu_util_90percent
expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} has total CPU utilization over 90% for at least
1h.'
summary: '{{$labels.alias}}: High CPU utilization.'
- alert: node_ram_using_90percent
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
* 0.1
for: 30m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using at least 90% of its RAM for at least
30 minutes now.'
summary: '{{$labels.alias}}: Using lots of RAM.'
- alert: node_swap_using_80percent
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
> node_memory_SwapTotal * 0.8
for: 10m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using 80% of its swap space for at least
10 minutes now.'
summary: '{{$labels.alias}}: Running out of swap soon.'
- alert: node_high_cpu_load
expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1
for: 1m
labels:
severity: warning
annotations:
description: '{{$labels.alias}} is running with a load15 per core above 1 for at least 1 minute: {{$value}}'
summary: '{{$labels.alias}}: Running on high load: {{$value}}'
- alert: node_high_memory_load
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
+ node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
for: 1m
labels:
severity: warning
annotations:
description: Host memory usage is {{ humanize $value }}%. Reported by
instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server memory is almost full
- alert: node_high_storage_load
expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
/ node_filesystem_size{mountpoint="/"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
description: Host storage usage is {{ humanize $value }}%. Reported by
instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server storage is almost full
- alert: node_high_swap
expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal * 0.4)
for: 1m
labels:
severity: warning
annotations:
description: Host system has a high swap usage of {{ humanize $value }}. Reported
by instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server has a high swap usage
- alert: node_high_network_drop_rcv
expr: node_network_receive_drop{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high drop in network reception ({{
humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
$labels.job }}
summary: Server has a high receive drop
- alert: node_high_network_drop_send
expr: node_network_transmit_drop{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high drop in network transmission ({{
humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
$labels.job }}
summary: Server has a high transmit drop
- alert: node_high_network_errs_rcv
expr: node_network_receive_errs{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high error rate in network reception
({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
{{ $labels.job }}
summary: Server has unusually high reception errors
- alert: node_high_network_errs_send
expr: node_network_transmit_errs{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high error rate in network transmission
({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
{{ $labels.job }}
summary: Server has unusually high transmission errors
- alert: node_network_conntrack_usage_80percent
expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
summary: '{{$labels.instance}}: available network conntrack entries are low.'
- alert: node_entropy_available_low
expr: node_entropy_available_bits < 300
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.instance}} has {{ $value }} entropy bits available, which is less than the required 300'
summary: '{{$labels.instance}}: is low on entropy bits.'
- alert: node_hwmon_high_cpu_temp
expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
- alert: node_vmstat_paging_rate_high
expr: irate(node_vmstat_pgpgin[5m]) > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
- alert: node_xfs_block_allocation_high
expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
- alert: node_network_bond_slaves_down
expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
for: 5m
labels:
severity: page
annotations:
description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
- alert: node_numa_memory_used
expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
- alert: node_ntp_clock_skew_high
expr: abs(node_ntp_drift_seconds) > 2
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
summary: '{{$labels.alias}}: time is skewed by {{$value}} seconds'
- alert: node_disk_read_latency
expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.device}} has a high read latency of {{ $value }}'
summary: 'High read latency observed for device {{ $labels.device }}'
- alert: node_disk_write_latency
expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.device}} has a high write latency of {{ $value }}'
summary: 'High write latency observed for device {{ $labels.device }}'
openstack:
groups:
- name: openstack.rules
rules:
- alert: os_glance_api_availability
expr: check_glance_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: check_nova_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: check_keystone_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_swift_api_availability
expr: check_swift_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Swift API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
ceph:
groups:
- name: ceph.rules
rules:
- alert: ceph_monitor_quorum_low
expr: ceph_monitor_quorum_count < 3
for: 5m
labels:
severity: page
annotations:
description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
summary: 'ceph high availability is at risk'
- alert: ceph_cluster_usage_high
expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph cluster capacity usage more than 80 percent'
summary: 'ceph cluster usage is more than 80 percent'
- alert: ceph_placement_group_degrade_pct_high
expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph placement group degradation is more than 80 percent'
summary: 'ceph placement groups degraded'
- alert: ceph_osd_down_pct_high
expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph OSDs down percent is more than 80 percent'
summary: 'ceph OSDs down percent is high'
- alert: ceph_monitor_clock_skew_high
expr: ceph_monitor_clock_skew_seconds > 2
for: 5m
labels:
severity: page
annotations:
description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds'
summary: 'ceph monitor clock skew high'
fluentd:
groups:
- name: fluentd.rules
rules:
- alert: fluentd_not_running
expr: fluentd_up == 0
for: 5m
labels:
severity: page
annotations:
description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
summary: 'Fluentd is down'
calico:
groups:
- name: calico.rules
rules:
- alert: calico_dataplane_failures_high_1h
expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
summary: 'A high number of dataplane failures within Felix are happening'
- alert: calico_dataplane_address_msg_batch_size_high_5m
expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
for: 5m
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
summary: 'Felix address message batch size is unusually high'
- alert: calico_dataplane_iface_msg_batch_size_high_5m
expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
for: 5m
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
summary: 'Felix interface message batch size is unusually high'
- alert: calico_ipset_errors_high_1h
expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
summary: 'A high number of ipset errors within Felix are happening'
- alert: calico_iptable_save_errors_high_1h
expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
summary: 'A high number of iptable save errors within Felix are happening'
- alert: calico_iptable_restore_errors_high_1h
expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
summary: 'A high number of iptable restore errors within Felix are happening'