fb7fc87d23
This change allows us to substitute values into our rules files. Example: `- alert: my_region_is_down` with `expr: up{region="{{ $my_region }}"} == 0`. To support this change, rule annotations that used the expansion `{{ $labels.foo }}` had to be surrounded with "{{` ... `}}" to render correctly. Change-Id: Ia7ac891de8261acca62105a3e2636bd747a5fbea
34 lines
1.6 KiB
YAML
34 lines
1.6 KiB
YAML
---
# Alerting rules for Alertmanager, supplied under
# conf.prometheus.rules.alertmanager as a single rule group.
#
# Each description is wrapped in a backtick-quoted raw template string so
# that the label expansions inside it are left as-is when this file's values
# are rendered; they are meant to be expanded later, at alert time.
conf:
  prometheus:
    rules:
      alertmanager:
        groups:
          - name: alertmanager.rules
            rules:
              # Per service: count of instances sharing a config hash,
              # divided by the replica count taken from the
              # prometheus_operator_alertmanager_spec_replicas series
              # (relabelled to service "alertmanager-<name>").
              # Alerts when that ratio has differed from 1 for 5 minutes.
              - alert: AlertmanagerConfigInconsistent
                expr: >-
                  count_values("config_hash", alertmanager_config_hash)
                  BY (service) / ON(service) GROUP_LEFT()
                  label_replace(prometheus_operator_alertmanager_spec_replicas,
                  "service", "alertmanager-$1", "alertmanager", "(.*)")
                  != 1
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: >-
                    {{`The configuration of the instances of the Alertmanager
                    cluster {{$labels.service}} are out of sync.`}}
                  summary: Alertmanager configurations are inconsistent

              # Per job: replica count from the spec series (relabelled to
              # job "alertmanager-<name>") divided by sum(up) for that job.
              # Alerts when that ratio has differed from 1 for 5 minutes.
              - alert: AlertmanagerDownOrMissing
                expr: >-
                  label_replace(prometheus_operator_alertmanager_spec_replicas,
                  "job", "alertmanager-$1", "alertmanager", "(.*)")
                  / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: >-
                    {{`An unexpected number of Alertmanagers are scraped or
                    Alertmanagers disappeared from discovery.`}}
                  summary: Alertmanager down or not discovered

              # Alerts when alertmanager_config_last_reload_successful has
              # been 0 for 10 minutes.
              - alert: FailedReload
                expr: alertmanager_config_last_reload_successful == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: >-
                    {{`Reloading Alertmanager's configuration has failed for
                    {{ $labels.namespace }}/{{ $labels.pod }}.`}}
                  summary: Alertmanager configuration reload has failed
...