f402171e42
Move to v0.3.1 of kubernetes-entrypoint which has 2 breaking changes to pod dependencies, and also adds support for depending on jobs via labels. Change-Id: I2bafc2153ddd46b3833b253a2e7950bccbccf8ed
1263 lines
58 KiB
YAML
1263 lines
58 KiB
YAML
# Copyright 2017 The Openstack-Helm Authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Default values for prometheus.
|
|
# This is a YAML-formatted file.
|
|
# Declare name/value pairs to be passed into your templates.
|
|
# name: value
|
|
|
|
# Container image settings for the chart.
images:
  # Pull policy applied to the images below.
  pull_policy: IfNotPresent
  # Image references, keyed by the role each image plays in the chart.
  tags:
    prometheus: docker.io/prom/prometheus:v2.0.0
    helm_tests: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.1
    image_repo_sync: docker.io/docker:17.07.0
  # Local registry toggle and the image keys listed under `exclude`.
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync
|
|
|
|
# Node selector key/value pairs for the prometheus pods and for chart jobs.
labels:
  job:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
  prometheus:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
|
|
|
|
# Pod-level settings: scheduling, extra mounts, replica count, upgrade
# behaviour and (optional) resource requests/limits.
pod:
  affinity:
    anti:
      type:
        default: preferredDuringSchedulingIgnoredDuringExecution
      topologyKey:
        default: kubernetes.io/hostname
  mounts:
    prometheus:
      prometheus: null
      init_container: null
  replicas:
    prometheus: 1
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      prometheus:
        timeout: 30
  resources:
    # Resource settings below are switched off by default.
    enabled: false
    prometheus:
      requests:
        cpu: "500m"
        memory: "128Mi"
      limits:
        cpu: "2000m"
        memory: "1024Mi"
    jobs:
      image_repo_sync:
        requests:
          cpu: "100m"
          memory: "128Mi"
        limits:
          cpu: "2000m"
          memory: "1024Mi"
      tests:
        requests:
          cpu: "100m"
          memory: "128Mi"
        limits:
          cpu: "2000m"
          memory: "1024Mi"
|
|
|
|
# Service endpoint definitions: host names, schemes, paths and ports for the
# local image registry, prometheus itself ("monitoring") and alertmanager
# ("alerts").
endpoints:
  cluster_domain_suffix: cluster.local

  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000

  monitoring:
    name: prometheus
    namespace: null
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    scheme:
      default: 'http'
    path:
      default: null
    port:
      api:
        default: 9090
        public: 80

  alerts:
    name: alertmanager
    namespace: null
    hosts:
      default: alerts-engine
      public: alertmanager
      discovery: alertmanager-discovery
    host_fqdn_override:
      default: null
    scheme:
      default: 'http'
    path:
      default: null
    port:
      api:
        default: 9093
        public: 80
      mesh:
        default: 6783
|
|
|
|
# Jobs and services that chart components depend on. `static` entries are
# keyed by component; `dynamic.common.local_image_registry` lists the extra
# job and service tied to the local image registry.
dependencies:
  static:
    image_repo_sync:
      services:
        - service: local_image_registry
          endpoint: internal
    prometheus:
      services: null
  dynamic:
    common:
      local_image_registry:
        jobs:
          - prometheus-image-repo-sync
        services:
          - service: local_image_registry
            endpoint: node
|
|
|
|
# Monitoring of the prometheus service itself: an overall switch plus a
# per-component scrape flag.
monitoring:
  prometheus:
    enabled: true
    prometheus:
      scrape: true
|
|
|
|
# Network exposure for prometheus: ingress settings and an optional node port.
network:
  prometheus:
    node_port:
      enabled: false
      port: 30900
    ingress:
      public: true
      classes:
        namespace: "nginx"
        cluster: "nginx-cluster"
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /
|
|
|
|
# Persistent storage for prometheus: claim name, access mode, requested size
# and storage class.
storage:
  enabled: true
  storage_class: general
  pvc:
    name: prometheus-pvc
    access_mode:
      - "ReadWriteOnce"
  requests:
    storage: 5Gi
|
|
|
|
# One boolean switch per manifest in the chart; all are enabled by default.
manifests:
  # Configuration
  configmap_bin: true
  configmap_etc: true
  # Workloads and jobs
  statefulset_prometheus: true
  job_image_repo_sync: true
  helm_tests: true
  # Services and ingress
  service: true
  service_ingress: true
  ingress: true
|
|
|
|
conf:
|
|
prometheus:
|
|
# Flag name -> value pairs. A prometheus helper function consumes this map
# to build the command line flags that configure the prometheus service.
command_line_flags:
  # Logging
  log.level: info
  # Query engine
  query.max_concurrency: 20
  query.timeout: 2m
  # TSDB storage
  storage.tsdb.path: /var/lib/prometheus/data
  storage.tsdb.retention: 7d
  storage.tsdb.min_block_duration: 2h
  storage.tsdb.max_block_duration: 2h
  # Web
  web.enable_admin_api: false
|
|
scrape_configs:
|
|
# Global intervals; several jobs in this file set their own scrape_interval.
global:
  evaluation_interval: 60s
  scrape_interval: 60s
|
|
scrape_configs:
|
|
# Kubelet metrics for every discovered node. The relabel rules point each
# target at kubernetes.default.svc:443 with a per-node proxy metrics path.
- job_name: kubelet
  scheme: https
  scrape_interval: 45s
  kubernetes_sd_configs:
    - role: node
  # TLS and bearer token settings here apply to the scrape requests
  # themselves. Discovery authentication is a separate concern in Prometheus:
  # it is automatic when Prometheus runs inside the cluster, otherwise more
  # options have to be provided within the <kubernetes_sd_config>.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
    # Copy node labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    # Scrape through kubernetes.default.svc rather than the node address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Per-node proxy path for the metrics endpoint.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/${1}/proxy/metrics
    # Record the node name under kubernetes_io_hostname.
    - source_labels: [__meta_kubernetes_node_name]
      action: replace
      target_label: kubernetes_io_hostname
|
|
# Scrape config for Kubelet cAdvisor.
#
# Required for Kubernetes 1.7.3 and later, where cAdvisor metrics (those
# whose names begin with 'container_') have been removed from the Kubelet
# metrics endpoint. This job scrapes the cAdvisor endpoint to retrieve them.
#
# In Kubernetes 1.7.0-1.7.2 these metrics are only exposed on the cAdvisor
# HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
# in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
# the --cadvisor-port=0 Kubelet flag).
#
# This job is not necessary and should be removed in Kubernetes 1.6 and
# earlier versions, or it will cause the metrics to be scraped twice.
- job_name: 'kubernetes-cadvisor'
  # Scrapes over https by default. If required, disable this or change it
  # to `http`.
  scheme: https
  scrape_interval: 45s
  kubernetes_sd_configs:
    - role: node
  # TLS and bearer token settings here apply to the scrape requests
  # themselves. Discovery authentication is a separate concern in Prometheus:
  # it is automatic when Prometheus runs inside the cluster, otherwise more
  # options have to be provided within the <kubernetes_sd_config>.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
    # Copy node labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    # Scrape through kubernetes.default.svc rather than the node address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Per-node proxy path for the cAdvisor endpoint.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    # Record the node name under kubernetes_io_hostname.
    - source_labels: [__meta_kubernetes_node_name]
      action: replace
      target_label: kubernetes_io_hostname
  metric_relabel_configs:
    # Derive rkt_container_name from the `id` label of rkt machine slices.
    - action: replace
      source_labels: [id]
      regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
      target_label: rkt_container_name
      replacement: '${2}-${1}'
    # Derive systemd_service_name from the `id` label of system slices.
    - action: replace
      source_labels: [id]
      regex: '^/system\.slice/(.+)\.service$'
      target_label: systemd_service_name
      replacement: '${1}'
|
|
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints of the default/kubernetes
# service, so this job uses the `endpoints` role and relabelling to keep only
# the endpoints of that service on its default named port `https`. This works
# for single API server deployments as well as HA API server deployments.
- job_name: 'apiserver'
  # Scrapes over https by default. If required, disable this or change it
  # to `http`.
  scheme: https
  scrape_interval: 45s
  kubernetes_sd_configs:
    - role: endpoints
  # TLS and bearer token settings here apply to the scrape requests
  # themselves. Discovery authentication is a separate concern in Prometheus:
  # it is automatic when Prometheus runs inside the cluster, otherwise more
  # options have to be provided within the <kubernetes_sd_config>.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # If your node certificates are self-signed or use a different CA to the
    # master CA, certificate verification can be disabled by uncommenting the
    # line below. Certificate verification is an integral part of a secure
    # infrastructure, so only disable it in a controlled environment.
    #
    # insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  # Keep only the default/kubernetes service endpoints for the https port.
  # This adds a target for each API server that Kubernetes registers as an
  # endpoint of the default/kubernetes service.
  relabel_configs:
    - action: keep
      source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
      regex: default;kubernetes;https
|
|
# Scrape config for service endpoints.
#
# Relabeling lets each service configure its own scrape endpoint through
# these annotations:
#
# * `prometheus.io/scrape`: only services with a value of `true` are scraped
# * `prometheus.io/scheme`: set to `https` if the metrics endpoint is
#   secured; you will most likely also need to set the `tls_config` of the
#   scrape config.
# * `prometheus.io/path`: override if the metrics path is not `/metrics`.
# * `prometheus.io/port`: set if the metrics are exposed on a different port
#   to the service.
- job_name: 'kubernetes-service-endpoints'
  scrape_interval: 60s
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    # Only keep services annotated prometheus.io/scrape=true.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    # Optional scheme override (http or https).
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
      action: replace
      target_label: __scheme__
      regex: (https?)
    # Optional metrics path override.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    # Optional port override: rewrite the address with the annotated port.
    - source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
      action: replace
      target_label: __address__
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
    # Copy service labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_service_name]
      action: replace
      target_label: kubernetes_name
    # Use the service name as the job label.
    - source_labels: [__meta_kubernetes_service_name]
      target_label: job
      replacement: ${1}
|
|
# Example scrape config for pods.
#
# Relabeling lets each pod configure its own scrape endpoint through these
# annotations:
#
# * `prometheus.io/scrape`: only pods with a value of `true` are scraped
# * `prometheus.io/path`: override if the metrics path is not `/metrics`.
# * `prometheus.io/port`: scrape the pod on the indicated port instead of the
#   pod's declared ports (default is a port-free target if none are declared).
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Only keep pods annotated prometheus.io/scrape=true.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
      action: keep
      regex: true
    # Optional metrics path override.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_path
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    # Optional port override: rewrite the address with the annotated port.
    - source_labels:
        - __address__
        - __meta_kubernetes_pod_annotation_prometheus_io_port
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    # Copy pod labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels:
        - __meta_kubernetes_namespace
      action: replace
      target_label: kubernetes_namespace
    - source_labels:
        - __meta_kubernetes_pod_name
      action: replace
      target_label: kubernetes_pod_name
|
|
# Scrapes the calico-etcd service in the kube-system namespace, discovered
# through the `service` role.
- job_name: calico-etcd
  honor_labels: false
  scrape_interval: 20s
  kubernetes_sd_configs:
    - role: service
  relabel_configs:
    # Copy service labels onto the target.
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    # Keep only the service named calico-etcd ...
    - action: keep
      source_labels: [__meta_kubernetes_service_name]
      regex: "calico-etcd"
    # ... and only in the kube-system namespace.
    # NOTE(review): target_label on a `keep` rule looks unintended; confirm
    # whether a separate rule was meant to populate `namespace`.
    - action: keep
      source_labels: [__meta_kubernetes_namespace]
      regex: kube-system
      target_label: namespace
    # NOTE(review): presumably __meta_kubernetes_pod_name is not populated
    # for the `service` role; confirm this rule has any effect.
    - source_labels: [__meta_kubernetes_pod_name]
      target_label: pod
    - source_labels: [__meta_kubernetes_service_name]
      target_label: service
    # Use the service name as the job label.
    - source_labels: [__meta_kubernetes_service_name]
      target_label: job
      replacement: ${1}
    # NOTE(review): __meta_kubernetes_service_label has no label-name suffix
    # and the regex has no capture group for ${1}; confirm the intent.
    - source_labels: [__meta_kubernetes_service_label]
      target_label: job
      regex: calico-etcd
      replacement: ${1}
    # Static endpoint label.
    - target_label: endpoint
      replacement: "calico-etcd"
|
|
# Alertmanager discovery: pods are discovered through the `pod` role and then
# filtered by application label, container port name and namespace.
alerting:
  alertmanagers:
    - kubernetes_sd_configs:
        - role: pod
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
        # Pods labelled application=alertmanager only.
        - action: keep
          source_labels: [__meta_kubernetes_pod_label_application]
          regex: alertmanager
        # Keep the alerts-api container port ...
        - action: keep
          source_labels: [__meta_kubernetes_pod_container_port_name]
          regex: alerts-api
        # ... and drop the peer-mesh port.
        - action: drop
          source_labels: [__meta_kubernetes_pod_container_port_name]
          regex: peer-mesh
        # Pods in the openstack namespace only.
        - action: keep
          source_labels: [__meta_kubernetes_namespace]
          regex: openstack
|
|
rules:
|
|
# Alerting rules about Alertmanager itself: configuration consistency,
# instance availability and configuration reloads.
alertmanager:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: AlertmanagerConfigInconsistent
          for: 5m
          expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
          labels:
            severity: critical
          annotations:
            summary: Alertmanager configurations are inconsistent
            description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
        - alert: AlertmanagerDownOrMissing
          for: 5m
          expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
          labels:
            severity: warning
          annotations:
            summary: Alertmanager down or not discovered
            description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
        - alert: FailedReload
          for: 10m
          expr: alertmanager_config_last_reload_successful == 0
          labels:
            severity: warning
          annotations:
            summary: Alertmanager configuration reload has failed
            description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
|
|
# Alerting rules for etcd: membership, leadership, request failures and
# latency, peer communication, proposals and disk durations.
#
# NOTE(review): most selectors below match job="etcd"; confirm that a scrape
# job actually produces that job label.
etcd3:
  groups:
    - name: etcd3.rules
      rules:
        - alert: etcd_InsufficientMembers
          expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
          for: 3m
          labels:
            severity: critical
          annotations:
            description: If one more etcd member goes down the cluster will be unavailable
            summary: etcd cluster insufficient members
        - alert: etcd_NoLeader
          expr: etcd_server_has_leader{job="etcd"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            description: etcd member {{ $labels.instance }} has no leader
            summary: etcd member has no leader
        - alert: etcd_HighNumberOfLeaderChanges
          expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
          labels:
            severity: warning
          annotations:
            description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
            summary: a high number of leader changes within the etcd cluster are happening
        # The failed-request ratios below are scaled by 100 so that the
        # `{{ $value }}%` in each description really is a percentage; the
        # thresholds (1 and 5) are the scaled forms of 0.01 and 0.05.
        # NOTE(review): these expressions aggregate BY the method label only,
        # so `{{ $labels.instance }}` in the descriptions is presumably empty.
        - alert: etcd_HighNumberOfFailedGRPCRequests
          expr: 100 * sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
            summary: a high number of gRPC requests are failing
        - alert: etcd_HighNumberOfFailedGRPCRequests
          expr: 100 * sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 5
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
            summary: a high number of gRPC requests are failing
        - alert: etcd_GRPCRequestsSlow
          expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
          for: 10m
          labels:
            severity: critical
          annotations:
            description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
            summary: slow gRPC requests
        - alert: etcd_HighNumberOfFailedHTTPRequests
          expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
            summary: a high number of HTTP requests are failing
        - alert: etcd_HighNumberOfFailedHTTPRequests
          expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 5
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
            summary: a high number of HTTP requests are failing
        - alert: etcd_HTTPRequestsSlow
          expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
          for: 10m
          labels:
            severity: warning
          annotations:
            description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
            summary: slow HTTP requests
        - alert: etcd_EtcdMemberCommunicationSlow
          expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
          for: 10m
          labels:
            severity: warning
          annotations:
            description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
            summary: etcd member communication is slow
        - alert: etcd_HighNumberOfFailedProposals
          expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
          labels:
            severity: warning
          annotations:
            description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
            summary: a high number of proposals within the etcd cluster are failing
        - alert: etcd_HighFsyncDurations
          expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            description: etcd instance {{ $labels.instance }} fsync durations are high
            summary: high fsync durations
        - alert: etcd_HighCommitDurations
          expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            description: etcd instance {{ $labels.instance }} commit durations are high
            summary: high commit durations
|
|
# Alerting rules for the Kubernetes API server: reachability and request
# latency.
kube_apiserver:
  groups:
    - name: kube-apiserver.rules
      rules:
        - alert: K8SApiserverDown
          for: 5m
          expr: absent(up{job="apiserver"} == 1)
          labels:
            severity: critical
          annotations:
            summary: API server unreachable
            description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
        # The latency buckets are divided by 1e+06 before the comparison
        # with 1 (the description reads that threshold as 1s).
        - alert: K8SApiServerLatency
          for: 10m
          expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
          labels:
            severity: warning
          annotations:
            summary: Kubernetes apiserver latency is high
            description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
|
|
# Alerting rule for the Kubernetes controller manager: fires when no target
# of the kube-controller-manager-discovery job is up.
kube_controller_manager:
  groups:
    - name: kube-controller-manager.rules
      rules:
        - alert: K8SControllerManagerDown
          for: 5m
          expr: absent(up{job="kube-controller-manager-discovery"} == 1)
          labels:
            severity: critical
          annotations:
            summary: Controller manager is down
            description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
            runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
|
|
# Alerting rules for nodes and kubelets: node readiness, kubelet scrape
# failures and pod count per kubelet.
kubelet:
  groups:
    - name: kubelet.rules
      rules:
        - alert: K8SNodeNotReady
          expr: kube_node_status_ready{condition="true"} == 0
          for: 1h
          labels:
            severity: warning
          annotations:
            description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
            summary: Node status is NotReady
        # Fires when more than one node AND more than 20% of all nodes are
        # not ready; the description now states the 0.2 threshold used by
        # the expression.
        - alert: K8SManyNodesNotReady
          expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
          for: 1m
          labels:
            severity: critical
          annotations:
            description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
            summary: Many Kubernetes nodes are Not Ready
        # The failure ratios below are scaled by 100 so that the
        # `{{ $value }}%` in each description really is a percentage; the
        # thresholds (3 and 10) are the scaled forms of 0.03 and 0.1.
        - alert: K8SKubeletDown
          expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
          for: 1h
          labels:
            severity: warning
          annotations:
            description: Prometheus failed to scrape {{ $value }}% of kubelets.
            summary: Many Kubelets cannot be scraped
        - alert: K8SKubeletDown
          expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 10
          for: 1h
          labels:
            severity: critical
          annotations:
            description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
            summary: Many Kubelets cannot be scraped
        - alert: K8SKubeletTooManyPods
          expr: kubelet_running_pod_count > 100
          labels:
            severity: warning
          annotations:
            description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
            summary: Kubelet is close to pod limit
|
|
kubernetes:
|
|
groups:
|
|
- name: kubernetes.rules
|
|
rules:
|
|
# Per-container series. Each sums a cAdvisor container metric and adds a
# `controller` label taken from `pod_name` via label_replace.
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
  expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
  expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
  expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
  expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
  expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
  expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
  expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
  expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
  expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate
  expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
# Cluster-wide percentages (each expression is multiplied by 100).
- record: cluster:memory_allocation:percent
  expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent
  expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:cpu_allocation:percent
  expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent
  expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
# API server latency quantiles (0.99, 0.9, 0.5); bucket values are divided
# by 1e+06 and each rule carries a matching `quantile` label.
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
  labels:
    quantile: "0.99"
  expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
  labels:
    quantile: "0.9"
  expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
  labels:
    quantile: "0.5"
  expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
# Scheduler end-to-end scheduling latency quantiles.
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
  labels:
    quantile: "0.99"
  expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
  labels:
    quantile: "0.9"
  expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
  labels:
    quantile: "0.5"
  expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
# Scheduler scheduling-algorithm latency quantiles.
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
  labels:
    quantile: "0.99"
  expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
  labels:
    quantile: "0.9"
  expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
  labels:
    quantile: "0.5"
  expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
# Scheduler binding latency quantiles.
- record: cluster:scheduler_binding_latency:quantile_seconds
  labels:
    quantile: "0.99"
  expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_binding_latency:quantile_seconds
  labels:
    quantile: "0.9"
  expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
- record: cluster:scheduler_binding_latency:quantile_seconds
  labels:
    quantile: "0.5"
  expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
|
- alert: kube_statefulset_replicas_unavailable
|
|
expr: kube_statefulset_status_replicas < kube_statefulset_replicas
|
|
for: 5m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
|
|
summary: '{{$labels.statefulset}}: has inssuficient replicas.'
|
|
- alert: kube_daemonsets_misscheduled
|
|
expr: kube_daemonset_status_number_misscheduled > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
|
|
summary: 'Daemonsets not scheduled correctly'
|
|
- alert: kube_daemonsets_not_scheduled
|
|
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
|
summary: 'Less than desired number of daemonsets scheduled'
|
|
- alert: kube_deployment_replicas_unavailable
|
|
expr: kube_deployment_status_replicas_unavailable > 0
|
|
for: 10m
|
|
labels:
|
|
severity: page
|
|
annotations:
|
|
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
|
|
summary: '{{$labels.deployment}}: has inssuficient replicas.'
|
|
# Pages when available replicas have been lower than the rolling-update
# max-unavailable setting for 10 minutes.
# NOTE(review): {{$value}} is the (negative) difference of the two metrics,
# not the raw available-replica count the description implies - confirm the
# wording is acceptable.
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
  expr: >-
    kube_deployment_status_replicas_available
    - kube_deployment_spec_strategy_rollingupdate_max_unavailable
    < 0
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
    # Spelling fixed: "inssuficient" -> "insufficient".
    summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
# Pages when kube_job_status_failed has been above zero for 10 minutes.
# The job name is read from the `exported_job` label in the annotations.
- alert: kube_job_status_failed
  expr: kube_job_status_failed > 0
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Job {{$labels.exported_job}} is in failed status'
    summary: '{{$labels.exported_job}} has failed status'
# Pages when a pod's phase="Pending" series has equalled 1 for 10 minutes.
- alert: kube_pod_status_pending
  expr: kube_pod_status_phase{phase="Pending"} == 1
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
    summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
# Pages when a container has been waiting with reason="ErrImagePull" for
# 10 minutes.
- alert: kube_pod_error_image_pull
  expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
    summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# Pages when a container has been waiting with reason="ErrImagePull" for
# 10 minutes.
# NOTE(review): expression and annotations are identical to the
# kube_pod_error_image_pull alert in this group, so both fire together.
# Possibly a different waiting reason was intended here - confirm.
- alert: kube_pod_status_error_image_pull
  expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
    summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# Pages when spec replicas have exceeded ready replicas for 10 minutes.
- alert: kube_replicaset_missing_replicas
  expr: >-
    kube_replicaset_spec_replicas
    - kube_replicaset_status_ready_replicas
    > 0
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
    summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
# Pages when kube_pod_container_status_terminated has been above zero for
# 10 minutes.
- alert: kube_pod_container_terminated
  expr: kube_pod_container_status_terminated > 0
  for: 10m
  labels:
    severity: page
  annotations:
    description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
    summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# Host-level alerting rules (filesystem, file descriptors, load, CPU, memory,
# swap, network, hardware sensors, NTP and disk latency).
# Most rules identify the host through the `alias` label; a few use
# `instance` instead.
basic_linux:
  groups:
    - name: basic_linux.rules
      rules:
        # Free space below 20% of filesystem size (ramfs excluded). The
        # reported value is divided by 1024^3.
        - alert: node_filesystem_full_80percent
          expr: >-
            sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"}
            * 0.2) / 1024 ^ 3
          for: 5m
          labels:
            severity: page
          annotations:
            # Text corrected from "10%" to "20%" to agree with the `* 0.2`
            # threshold and the alert name.
            description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.'
            summary: '{{$labels.alias}}: Filesystem is running out of space soon.'

        # Linear extrapolation of the last hour predicts zero free space
        # within 4 hours.
        - alert: node_filesystem_full_in_4h
          expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
          for: 5m
          labels:
            severity: page
          annotations:
            # Wording fixed: "out of space of in" -> "out of space in".
            description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours'
            summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'

        # Linear extrapolation predicts allocated file descriptors reaching
        # the maximum within 3 hours.
        - alert: node_filedescriptors_full_in_3h
          expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
          for: 20m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} is running out of available file descriptors in approx. 3 hours'
            summary: '{{$labels.alias}} is running out of available file descriptors in 3 hours.'

        # 1-minute load divided by the count of node_cpu{mode="system"}
        # series per alias, at or above 0.9.
        - alert: node_load1_90percent
          expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
          for: 1h
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} is running with > 90% total load for at least 1h.'
            summary: '{{$labels.alias}}: Running on high load.'

        # 100 minus the average idle rate (as a percentage) per alias.
        - alert: node_cpu_util_90percent
          expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
          for: 1h
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} has total CPU utilization over 90% for at least 1h.'
            summary: '{{$labels.alias}}: High CPU utilization.'

        # Free + buffers + cached memory below 10% of total memory.
        - alert: node_ram_using_90percent
          expr: >-
            node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
            * 0.1
          for: 30m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.'
            summary: '{{$labels.alias}}: Using lots of RAM.'

        # Swap in use (total minus free and cached) above 80% of total swap.
        - alert: node_swap_using_80percent
          expr: >-
            node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
            > node_memory_SwapTotal * 0.8
          for: 10m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.'
            summary: '{{$labels.alias}}: Running on out of swap soon.'

        # 15-minute load divided by the count of node_cpu{mode="system"}
        # series per alias.
        # Threshold fixed: the original `>= 0` matched every host with a
        # non-negative load; `> 1` is what the description states.
        # NOTE(review): the description says "at least 5 minutes" while
        # `for` is 1m - confirm which one is intended.
        - alert: node_high_cpu_load
          expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}'
            summary: '{{$labels.alias}}: Running on high load: {{$value}}'

        # Used memory as a percentage of total, computed from un-grouped
        # sum() aggregates.
        # NOTE(review): sum() without a grouping clause drops all labels, so
        # {{ $labels.instance }} and {{ $labels.job }} will likely render
        # empty - confirm.
        - alert: node_high_memory_load
          expr: >-
            (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
            + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
          for: 1m
          labels:
            severity: warning
          annotations:
            description: >-
              Host memory usage is {{ humanize $value }}%. Reported by
              instance {{ $labels.instance }} of job {{ $labels.job }}.
            summary: Server memory is almost full

        # Used space on the "/" mountpoint above 85% of its size.
        - alert: node_high_storage_load
          expr: >-
            (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
            / node_filesystem_size{mountpoint="/"} * 100 > 85
          for: 30s
          labels:
            severity: warning
          annotations:
            description: >-
              Host storage usage is {{ humanize $value }}%. Reported by
              instance {{ $labels.instance }} of job {{ $labels.job }}.
            summary: Server storage is almost full

        # Used swap (total minus free) above 40% of total swap.
        # Comparison fixed: the original `<` fired when swap usage was
        # *below* 40%, the opposite of what the alert name and annotations
        # describe.
        - alert: node_high_swap
          expr: >-
            (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal
            * 0.4)
          for: 1m
          labels:
            severity: warning
          annotations:
            description: >-
              Host system has a high swap usage of {{ humanize $value }}. Reported
              by instance {{ $labels.instance }} of job {{ $labels.job }}.
            summary: Server has a high swap usage

        # The four network alerts below compare the raw metric value (not a
        # rate) against 3000 and exclude the loopback device.
        # Spelling fixed in their descriptions: "unusally" -> "unusually".
        - alert: node_high_network_drop_rcv
          expr: node_network_receive_drop{device!="lo"} > 3000
          for: 30s
          labels:
            severity: warning
          annotations:
            description: >-
              Host system has an unusually high drop in network reception ({{
              humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
              $labels.job }}
            summary: Server has a high receive drop

        - alert: node_high_network_drop_send
          expr: node_network_transmit_drop{device!="lo"} > 3000
          for: 30s
          labels:
            severity: warning
          annotations:
            description: >-
              Host system has an unusually high drop in network transmission ({{
              humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
              $labels.job }}
            summary: Server has a high transmit drop

        - alert: node_high_network_errs_rcv
          expr: node_network_receive_errs{device!="lo"} > 3000
          for: 30s
          labels:
            severity: warning
          annotations:
            description: >-
              Host system has an unusually high error rate in network reception
              ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
              {{ $labels.job }}
            summary: Server has unusual high reception errors

        - alert: node_high_network_errs_send
          expr: node_network_transmit_errs{device!="lo"} > 3000
          for: 30s
          labels:
            severity: warning
          annotations:
            description: >-
              Host system has an unusually high error rate in network transmission
              ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
              {{ $labels.job }}
            summary: Server has unusual high transmission errors

        # Conntrack entries above 80% of the configured limit
        # (job="node-exporter" series only).
        - alert: node_network_conntrack_usage_80percent
          expr: >-
            sort(node_nf_conntrack_entries{job="node-exporter"}
            > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
            summary: '{{$labels.instance}}: available network conntrack entries are low.'

        # Available entropy below 300 bits.
        - alert: node_entropy_available_low
          expr: node_entropy_available_bits < 300
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
            summary: '{{$labels.instance}}: is low on entropy bits.'

        # Sensor temperature above 90% of its critical value, or above 95%
        # of its max value.
        - alert: node_hwmon_high_cpu_temp
          expr: >-
            node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0
            OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
            summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'

        # Instantaneous rate of node_vmstat_pgpgin above 80.
        # NOTE(review): the description calls the threshold "80%" although
        # the expression compares an irate() value, not a percentage -
        # confirm the wording.
        - alert: node_vmstat_paging_rate_high
          expr: irate(node_vmstat_pgpgin[5m]) > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
            summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'

        # Allocated XFS blocks as a percentage of allocated + freed blocks.
        # NOTE(review): the selectors are restricted to instances matching
        # the hard-coded regex "172.17.0.1.*" - confirm this is intended for
        # every deployment.
        - alert: node_xfs_block_allocation_high
          expr: >-
            100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"}
            / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"}
            + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"}))
            > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
            summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'

        # Configured bonding slaves exceed active bonding slaves.
        - alert: node_network_bond_slaves_down
          expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
            summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'

        # NUMA memory used above 80% of NUMA memory total.
        - alert: node_numa_memory_used
          expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
            summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'

        # Absolute NTP drift above 2 seconds.
        - alert: node_ntp_clock_skew_high
          expr: abs(node_ntp_drift_seconds) > 2
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
            summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds'

        # Read time per completed read over 5 minutes, above 10.
        - alert: node_disk_read_latency
          expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.device}} has a high read latency of {{ $value }}'
            summary: 'High read latency observed for device {{ $labels.device }}'

        # Write time per completed write over 5 minutes, above 10.
        - alert: node_disk_write_latency
          expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10
          for: 5m
          labels:
            severity: page
          annotations:
            description: '{{$labels.device}} has a high write latency of {{ $value }}'
            summary: 'High write latency observed for device {{ $labels.device }}'
# OpenStack alerting rules. Every rule pages after its condition has held
# for 5 minutes.
openstack:
  groups:
    - name: openstack.rules
      rules:
        # API availability: each check_*_api metric is expected to equal 1.
        - alert: os_glance_api_availability
          expr: check_glance_api != 1
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
            summary: 'Glance API is not available at {{$labels.url}}'

        - alert: os_nova_api_availability
          expr: check_nova_api != 1
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
            summary: 'Nova API is not available at {{$labels.url}}'

        - alert: os_keystone_api_availability
          expr: check_keystone_api != 1
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
            summary: 'Keystone API is not available at {{$labels.url}}'

        - alert: os_neutron_api_availability
          expr: check_neutron_api != 1
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
            summary: 'Neutron API is not available at {{$labels.url}}'

        - alert: os_swift_api_availability
          expr: check_swift_api != 1
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
            summary: 'Swift API is not available at {{$labels.url}}'

        # Nova services: each services_nova_*_disabled_total metric is
        # expected to stay at zero.
        - alert: os_nova_compute_disabled
          expr: services_nova_compute_disabled_total > 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
            summary: 'Openstack compute service nova-compute is disabled on some hosts'

        - alert: os_nova_conductor_disabled
          expr: services_nova_conductor_disabled_total > 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
            summary: 'Openstack compute service nova-conductor is disabled on some hosts'

        - alert: os_nova_consoleauth_disabled
          expr: services_nova_consoleauth_disabled_total > 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
            summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'

        - alert: os_nova_scheduler_disabled
          expr: services_nova_scheduler_disabled_total > 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
            summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
# Ceph alerting rules. Every rule pages after its condition has held for
# 5 minutes.
ceph:
  groups:
    - name: ceph.rules
      rules:
        # Monitor quorum count below 3.
        - alert: ceph_monitor_quorum_low
          expr: ceph_monitor_quorum_count < 3
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
            summary: 'ceph high availability is at risk'

        # Used bytes above 80% of capacity bytes.
        - alert: ceph_cluster_usage_high
          expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'ceph cluster capacity usage more than 80 percent'
            summary: 'ceph cluster usage is more than 80 percent'

        # Degraded placement groups above 80% of all placement groups.
        - alert: ceph_placement_group_degrade_pct_high
          expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'ceph placement group degradation is more than 80 percent'
            summary: 'ceph placement groups degraded'

        # OSDs down above 80% of (down + up) OSDs.
        - alert: ceph_osd_down_pct_high
          expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'ceph OSDs down percent is more than 80 percent'
            summary: 'ceph OSDs down percent is high'

        # Monitor clock skew above 2 seconds.
        - alert: ceph_monitor_clock_skew_high
          expr: ceph_monitor_clock_skew_seconds > 2
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds'
            summary: 'ceph monitor clock skew high'
# Fluentd alerting rules.
fluentd:
  groups:
    - name: fluentd.rules
      rules:
        # Pages when fluentd_up has equalled 0 for 5 minutes.
        - alert: fluentd_not_running
          expr: fluentd_up == 0
          for: 5m
          labels:
            severity: page
          annotations:
            description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
            summary: 'Fluentd is down'
calico:
|
|
groups:
|
|
- name: calico.rules
|
|
rules:
|
|
# Pages when felix_int_dataplane_failures is absent, or has increased by more
# than 5 over the last hour. There is no `for` clause on this rule.
# NOTE(review): "datapane" in the alert name looks like a misspelling of
# "dataplane"; it is left unchanged because renaming alters the alert's
# identity.
- alert: calico_datapane_failures_high_1h
  expr: >-
    absent(felix_int_dataplane_failures)
    OR increase(felix_int_dataplane_failures[1h]) > 5
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
    summary: 'A high number of dataplane failures within Felix are happening'
# Pages when either the _sum or _count series of the address message batch
# size is absent, or their ratio (sum / count) has exceeded 5, for 5 minutes.
- alert: calico_datapane_address_msg_batch_size_high_5m
  expr: >-
    absent(felix_int_dataplane_addr_msg_batch_size_sum)
    OR absent(felix_int_dataplane_addr_msg_batch_size_count)
    OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count)
    > 5
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
    summary: 'Felix address message batch size is higher'
# Pages when either the _sum or _count series of the interface message batch
# size is absent, or their ratio (sum / count) has exceeded 5, for 5 minutes.
- alert: calico_datapane_iface_msg_batch_size_high_5m
  expr: >-
    absent(felix_int_dataplane_iface_msg_batch_size_sum)
    OR absent(felix_int_dataplane_iface_msg_batch_size_count)
    OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count)
    > 5
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
    summary: 'Felix interface message batch size is higher'
# Pages when felix_ipset_errors is absent, or has increased by more than 5
# over the last hour. There is no `for` clause on this rule.
- alert: calico_ipset_errors_high_1h
  expr: >-
    absent(felix_ipset_errors)
    OR increase(felix_ipset_errors[1h]) > 5
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
    summary: 'A high number of ipset errors within Felix are happening'
# Pages when felix_iptables_save_errors is absent, or has increased by more
# than 5 over the last hour. There is no `for` clause on this rule.
- alert: calico_iptable_save_errors_high_1h
  expr: >-
    absent(felix_iptables_save_errors)
    OR increase(felix_iptables_save_errors[1h]) > 5
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
    summary: 'A high number of iptable save errors within Felix are happening'
# Pages when felix_iptables_restore_errors is absent, or has increased by
# more than 5 over the last hour. There is no `for` clause on this rule.
- alert: calico_iptable_restore_errors_high_1h
  expr: >-
    absent(felix_iptables_restore_errors)
    OR increase(felix_iptables_restore_errors[1h]) > 5
  labels:
    severity: page
  annotations:
    description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
    summary: 'A high number of iptable restore errors within Felix are happening'