fbd34421f2
This updates the Prometheus chart to support federation. The Prometheus configuration file is now defined via a template in values.yaml instead of through raw YAML, which allows the chart's default configuration to be overridden wholesale, as required for a hierarchical federated setup. All of the default rules defined in the chart are stripped out for the same reason; example rules for the various aspects of OSH's infrastructure are defined in the prometheus/values_overrides directory and are exercised as part of the normal CI jobs. This also adds a nonvoting federated-monitoring job that validates the ability to federate Prometheus in a hierarchical fashion with extremely basic overrides.

Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26
Signed-off-by: Steve Wilkerson <sw5822@att.com>
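For context, the federation this change enables follows the standard Prometheus model: a parent instance scrapes the /federate endpoint of one or more child instances with match[] selectors and honor_labels enabled. The sketch below is illustrative only; the values key layout (conf.prometheus.scrape_configs) and the child target address are assumptions made for this example, not taken from this change.

conf:
  prometheus:
    scrape_configs:
      # Hypothetical parent-level override: federate from a child Prometheus.
      # The target address is a placeholder; only the /federate job shape
      # follows the standard Prometheus federation configuration.
      - job_name: federate
        honor_labels: true
        metrics_path: /federate
        params:
          'match[]':
            - '{job=~".+"}'
        static_configs:
          - targets:
              - prometheus-child.monitoring.svc.cluster.local:9090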
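One of the example overrides referenced above defines Ceph alerting rules under conf.prometheus.rules: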
conf:
  prometheus:
    rules:
      ceph:
        groups:
          - name: ceph.rules
            rules:
              - alert: prom_exporter_ceph_unavailable
                expr: absent(ceph_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Ceph exporter is not collecting metrics or is not available for the past 10 minutes
                  title: Ceph exporter is not collecting metrics or is not available
              - alert: no_active_ceph_mgr
                expr: count(up{job="ceph-mgr"} == 1) == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'no ceph active mgr is present or all ceph mgr are down'
                  summary: 'no ceph active mgr is present'
              - alert: ceph_mon_quorum_low
                expr: ceph_mon_quorum_count < 3
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
                  summary: 'ceph high availability is at risk'
              - alert: ceph_cluster_usage_high
                expr: 100 * ceph_cluster_total_used_bytes / ceph_cluster_total_bytes > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph cluster capacity usage more than 80 percent'
                  summary: 'ceph cluster usage is more than 80 percent'
              - alert: ceph_placement_group_degrade_pct_high
                expr: 100 * sum(ceph_pg_degraded) / sum(ceph_osd_numpg) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph placement group degradation is more than 80 percent'
                  summary: 'ceph placement groups degraded'
              - alert: ceph_osd_down_pct_high
                expr: 100 * count(ceph_osd_up == 0) / count(ceph_osd_metadata) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSDs down percent is more than 80 percent'
                  summary: 'ceph OSDs down percent is high'
              - alert: ceph_osd_down
                expr: ceph_osd_up == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
              - alert: ceph_osd_out
                expr: ceph_osd_in == 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'