From f3d8bda9d6d33dcc88b526a0446ce741d1e419a5 Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Tue, 16 Oct 2018 09:50:54 -0500 Subject: [PATCH] Grafana: Support multiple Ceph clusters with dashboards This updates the Grafana Ceph dashboards to use templating to determine which ceph-mgr to use for displaying Ceph-related metrics. This required setting the appropriate labels on the ceph-mgr service to be able to distinguish between releases. Change-Id: Id2eceacadc5b6366d7bc6668bc16ccf5ba878e4a --- ceph-client/templates/service-mgr.yaml | 2 + grafana/values.yaml | 128 +++++++++++++++---------- 2 files changed, 81 insertions(+), 49 deletions(-) diff --git a/ceph-client/templates/service-mgr.yaml b/ceph-client/templates/service-mgr.yaml index 3198e83d4..76a825532 100644 --- a/ceph-client/templates/service-mgr.yaml +++ b/ceph-client/templates/service-mgr.yaml @@ -22,6 +22,8 @@ apiVersion: v1 kind: Service metadata: name: ceph-mgr + labels: +{{ tuple $envAll "ceph" "manager" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }} annotations: {{- if .Values.monitoring.prometheus.enabled }} {{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_service_annotations" | indent 4 }} diff --git a/grafana/values.yaml b/grafana/values.yaml index d3c5dc00b..47775ca7e 100644 --- a/grafana/values.yaml +++ b/grafana/values.yaml @@ -3289,7 +3289,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: count(ceph_health_status) + - expr: count(ceph_health_status{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 refId: A @@ -3355,7 +3355,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: ceph_mon_quorum_count + - expr: ceph_mon_quorum_count{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3416,7 +3416,7 @@ conf: lineColor: rgb(31, 120, 193) show: true targets: - - expr: count(ceph_pool_max_avail) + - expr: 
count(ceph_pool_max_avail{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3477,7 +3477,7 @@ conf: lineColor: rgb(31, 120, 193) show: true targets: - - expr: ceph_cluster_total_bytes + - expr: ceph_cluster_total_bytes{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3538,7 +3538,7 @@ conf: lineColor: rgb(31, 120, 193) show: true targets: - - expr: ceph_cluster_total_used_bytes + - expr: ceph_cluster_total_used_bytes{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3599,7 +3599,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: ceph_cluster_total_used_bytes/ceph_cluster_total_bytes + - expr: ceph_cluster_total_used_bytes{application="ceph",release_group="$ceph_cluster"}/ceph_cluster_total_bytes{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3665,7 +3665,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: count(ceph_osd_in) + - expr: count(ceph_osd_in{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3725,7 +3725,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: count(ceph_osd_metadata) - count(ceph_osd_in) + - expr: count(ceph_osd_metadata{application="ceph",release_group="$ceph_cluster"}) - count(ceph_osd_in{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3785,7 +3785,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: sum(ceph_osd_up) + - expr: sum(ceph_osd_up{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3845,7 +3845,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: count(ceph_osd_metadata) - count(ceph_osd_up) + - expr: 
count(ceph_osd_metadata{application="ceph",release_group="$ceph_cluster"}) - count(ceph_osd_up{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3905,7 +3905,7 @@ conf: lineColor: rgb(31, 120, 193) show: true targets: - - expr: avg(ceph_osd_numpg) + - expr: avg(ceph_osd_numpg{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -3973,7 +3973,7 @@ conf: stack: true steppedLine: false targets: - - expr: ceph_cluster_total_bytes - ceph_cluster_total_used_bytes + - expr: ceph_cluster_total_bytes{application="ceph",release_group="$ceph_cluster"} - ceph_cluster_total_used_bytes{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Available @@ -4060,13 +4060,13 @@ conf: stack: true steppedLine: false targets: - - expr: sum(ceph_osd_op_w) + - expr: sum(ceph_osd_op_w{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Write refId: A step: 60 - - expr: sum(ceph_osd_op_r) + - expr: sum(ceph_osd_op_r{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Read @@ -4133,13 +4133,13 @@ conf: stack: true steppedLine: false targets: - - expr: sum(ceph_osd_op_in_bytes) + - expr: sum(ceph_osd_op_in_bytes{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Write refId: A step: 60 - - expr: sum(ceph_osd_op_out_bytes) + - expr: sum(ceph_osd_op_out_bytes{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Read @@ -4214,7 +4214,7 @@ conf: stack: true steppedLine: false targets: - - expr: ceph_cluster_total_objects + - expr: ceph_cluster_total_objects{application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Total @@ -4282,37 +4282,37 @@ conf: stack: true steppedLine: 
false targets: - - expr: sum(ceph_osd_numpg) + - expr: sum(ceph_osd_numpg{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Total refId: A step: 60 - - expr: sum(ceph_pg_active) + - expr: sum(ceph_pg_active{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Active refId: B step: 60 - - expr: sum(ceph_pg_inconsistent) + - expr: sum(ceph_pg_inconsistent{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Inconsistent refId: C step: 60 - - expr: sum(ceph_pg_creating) + - expr: sum(ceph_pg_creating{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Creating refId: D step: 60 - - expr: sum(ceph_pg_recovering) + - expr: sum(ceph_pg_recovering{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Recovering refId: E step: 60 - - expr: sum(ceph_pg_down) + - expr: sum(ceph_pg_down{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Down @@ -4380,19 +4380,19 @@ conf: stack: true steppedLine: false targets: - - expr: sum(ceph_pg_degraded) + - expr: sum(ceph_pg_degraded{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Degraded refId: A step: 60 - - expr: sum(ceph_pg_stale) + - expr: sum(ceph_pg_stale{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Stale refId: B step: 60 - - expr: sum(ceph_pg_undersized) + - expr: sum(ceph_pg_undersized{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Undersized @@ -4450,6 +4450,16 @@ conf: - 30d templating: list: + - current: {} + hide: 0 + label: Cluster + name: ceph_cluster + options: [] + type: query + query: label_values(ceph_health_status, release_group) + 
refresh: 1 + sort: 1 + datasource: prometheus - auto: true auto_count: 10 auto_min: 1m @@ -4599,7 +4609,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: ceph_osd_up{ceph_daemon="osd.$osd"} + - expr: ceph_osd_up{ceph_daemon="osd.$osd",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 refId: A @@ -4672,7 +4682,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: ceph_osd_in{ceph_daemon="osd.$osd"} + - expr: ceph_osd_in{ceph_daemon="osd.$osd",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 refId: A @@ -4739,7 +4749,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: count(ceph_osd_metadata) + - expr: count(ceph_osd_metadata{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 refId: A @@ -4807,13 +4817,13 @@ conf: stack: true steppedLine: false targets: - - expr: ceph_osd_numpg{ceph_daemon=~"osd.$osd"} + - expr: ceph_osd_numpg{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Number of PGs - {{ osd.$osd }} refId: A step: 60 - - expr: avg(ceph_osd_numpg) + - expr: avg(ceph_osd_numpg{application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Average Number of PGs in the Cluster @@ -4888,7 +4898,7 @@ conf: lineColor: rgb(31, 120, 193) show: true targets: - - expr: (ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd"}/ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd"})*100 + - expr: (ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"}/ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"})*100 interval: "$interval" intervalFactor: 1 legendFormat: '' @@ -4948,14 +4958,14 @@ conf: stack: true steppedLine: false targets: - - expr: ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd"} + - expr: 
ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Used - {{ osd.$osd }} metric: ceph_osd_used_bytes refId: A step: 60 - - expr: ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd"} - ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd"} + - expr: ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"} - ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"} hide: false interval: "$interval" intervalFactor: 1 @@ -5024,7 +5034,7 @@ conf: stack: false steppedLine: false targets: - - expr: (ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd"}/ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd"}) + - expr: (ceph_osd_stat_bytes_used{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"}/ceph_osd_stat_bytes{ceph_daemon=~"osd.$osd",application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 legendFormat: Available - {{ osd.$osd }} @@ -5082,6 +5092,16 @@ conf: - 30d templating: list: + - current: {} + hide: 0 + label: Cluster + name: ceph_cluster + options: [] + type: query + query: label_values(ceph_health_status, release_group) + refresh: 1 + sort: 1 + datasource: prometheus - auto: true auto_count: 10 auto_min: 1m @@ -5140,7 +5160,7 @@ conf: multi: false name: osd options: [] - query: label_values(ceph_osd_metadata, id) + query: label_values(ceph_osd_metadata{release_group="$ceph_cluster"}, id) refresh: 1 regex: '' type: query @@ -5239,25 +5259,25 @@ conf: stack: true steppedLine: false targets: - - expr: ceph_pool_max_avail{pool_id=~"$pool"} + - expr: ceph_pool_max_avail{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Total - {{ $pool }} refId: A step: 60 - - expr: ceph_pool_bytes_used{pool_id=~"$pool"} + - expr: 
ceph_pool_bytes_used{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Used - {{ $pool }} refId: B step: 60 - - expr: ceph_pool_max_avail{pool_id=~"$pool"} - ceph_pool_bytes_used{pool_id=~"$pool"} + - expr: ceph_pool_max_avail{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} - ceph_pool_bytes_used{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Available - {{ $pool }} refId: C step: 60 - - expr: ceph_pool_raw_bytes_used{pool_id=~"$pool"} + - expr: ceph_pool_raw_bytes_used{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Raw - {{ $pool }} @@ -5333,7 +5353,7 @@ conf: lineColor: rgb(31, 120, 193) show: false targets: - - expr: (ceph_pool_bytes_used{pool_id=~"$pool"} / ceph_pool_max_avail{pool_id=~"$pool"}) + - expr: (ceph_pool_bytes_used{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} / ceph_pool_max_avail{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"}) interval: "$interval" intervalFactor: 1 refId: A @@ -5388,13 +5408,13 @@ conf: stack: false steppedLine: false targets: - - expr: ceph_pool_objects{pool_id=~"$pool"} + - expr: ceph_pool_objects{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Objects - {{ $pool_name }} refId: A step: 60 - - expr: ceph_pool_dirty{pool_id=~"$pool"} + - expr: ceph_pool_dirty{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"} interval: "$interval" intervalFactor: 1 legendFormat: Dirty Objects - {{ $pool_name }} @@ -5462,13 +5482,13 @@ conf: stack: true steppedLine: false targets: - - expr: irate(ceph_pool_rd{pool_id=~"$pool"}[3m]) + - expr: irate(ceph_pool_rd{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"}[3m]) interval: "$interval" intervalFactor: 1 legendFormat: 
Read - {{ $pool_name}} refId: B step: 60 - - expr: irate(ceph_pool_wr{pool_id=~"$pool"}[3m]) + - expr: irate(ceph_pool_wr{pool_id=~"$pool",application="ceph",release_group="$ceph_cluster"}[3m]) interval: "$interval" intervalFactor: 1 legendFormat: Write - {{ $pool_name }} @@ -5535,13 +5555,13 @@ conf: stack: true steppedLine: false targets: - - expr: irate(ceph_pool_rd_bytes{pool_id="$pool"}[3m]) + - expr: irate(ceph_pool_rd_bytes{pool_id="$pool",application="ceph",release_group="$ceph_cluster"}[3m]) interval: "$interval" intervalFactor: 1 legendFormat: Read Bytes - {{ $pool_name }} refId: A step: 60 - - expr: irate(ceph_pool_wr_bytes{pool_id="$pool"}[3m]) + - expr: irate(ceph_pool_wr_bytes{pool_id="$pool",application="ceph",release_group="$ceph_cluster"}[3m]) interval: "$interval" intervalFactor: 1 legendFormat: Written Bytes - {{ $pool_name }} @@ -5599,6 +5619,16 @@ conf: - 30d templating: list: + - current: {} + hide: 0 + label: Cluster + name: ceph_cluster + options: [] + type: query + query: label_values(ceph_health_status, release_group) + refresh: 1 + sort: 1 + datasource: prometheus - auto: true auto_count: 10 auto_min: 1m @@ -5657,7 +5687,7 @@ conf: multi: false name: pool options: [] - query: label_values(ceph_pool_objects, pool_id) + query: label_values(ceph_pool_objects{release_group="$ceph_cluster"}, pool_id) refresh: 1 regex: '' type: query @@ -5669,7 +5699,7 @@ conf: multi: false name: pool_name options: [] - query: label_values(ceph_pool_metadata{pool_id="[[pool]]" }, name) + query: label_values(ceph_pool_metadata{release_group="$ceph_cluster",pool_id="[[pool]]" }, name) refresh: 1 regex: '' type: query