openstack-helm-infra/prometheus/values_overrides/ceph.yaml
Steve Wilkerson fbd34421f2 Prometheus: Update chart to support federation
This updates the Prometheus chart to support federation. The
Prometheus configuration file is now defined via a template in
the values.yaml file instead of through raw YAML, which allows
the chart's default configuration to be overridden wholesale,
as required for a hierarchical federated setup. For the same
reason, all of the default rules defined in the chart have been
stripped out. Example rules for the various aspects of OSH's
infrastructure live in the prometheus/values_overrides directory
and are exercised as part of the normal CI jobs. This change
also adds a nonvoting federated-monitoring job that vets the
ability to federate Prometheus hierarchically with extremely
basic overrides.

Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26
Signed-off-by: Steve Wilkerson <sw5822@att.com>
2019-11-21 12:39:56 +00:00
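
As context for the federation support described above, a hierarchical setup would override the chart's scrape configuration so that a parent Prometheus scrapes a child's /federate endpoint. The sketch below is a minimal, hypothetical override: the conf.prometheus.scrape_configs key and the child target address are assumptions about the chart's values layout, while the scrape_configs body itself is standard Prometheus configuration.

conf:
  prometheus:
    scrape_configs:  # assumed values key; confirm against the chart's values.yaml template
      - job_name: federate
        honor_labels: true        # keep the child's own labels instead of relabelling
        metrics_path: /federate
        params:
          'match[]':
            - '{job="ceph-mgr"}'  # federate only the series the parent needs
        static_configs:
          - targets:
              - prometheus-child.osh-infra.svc.cluster.local:9090  # hypothetical child endpoint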

# Prometheus alerting rules for Ceph, supplied as a values override to the prometheus chart.
conf:
  prometheus:
    rules:
      ceph:
        groups:
          - name: ceph.rules
            rules:
              - alert: prom_exporter_ceph_unavailable
                expr: absent(ceph_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Ceph exporter is not collecting metrics or is not available for the past 10 minutes
                  title: Ceph exporter is not collecting metrics or is not available
              - alert: no_active_ceph_mgr
                expr: count(up{job="ceph-mgr"} == 1) == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'no active ceph mgr is present or all ceph mgrs are down'
                  summary: 'no active ceph mgr is present'
              - alert: ceph_mon_quorum_low
                expr: ceph_mon_quorum_count < 3
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
                  summary: 'ceph high availability is at risk'
              - alert: ceph_cluster_usage_high
                expr: 100 * ceph_cluster_total_used_bytes / ceph_cluster_total_bytes > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph cluster capacity usage is more than 80 percent'
                  summary: 'ceph cluster usage is more than 80 percent'
              - alert: ceph_placement_group_degrade_pct_high
                expr: 100 * sum(ceph_pg_degraded) / sum(ceph_osd_numpg) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph placement group degradation is more than 80 percent'
                  summary: 'ceph placement groups degraded'
              - alert: ceph_osd_down_pct_high
                expr: 100 * count(ceph_osd_up == 0) / count(ceph_osd_metadata) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'more than 80 percent of ceph OSDs are down'
                  summary: 'ceph OSD down percentage is high'
              - alert: ceph_osd_down
                expr: ceph_osd_up == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
              - alert: ceph_osd_out
                expr: ceph_osd_in == 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
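
To apply this override, pass the file as an additional values file when installing the chart. A minimal sketch, assuming the chart is deployed from the repository root into an osh-infra namespace (both assumptions for illustration):

helm upgrade --install prometheus ./prometheus \
  --namespace osh-infra \
  --values ./prometheus/values_overrides/ceph.yaml

Because Helm deep-merges values files over the chart defaults, the override only needs to carry the conf.prometheus.rules tree it changes.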