Update Elasticsearch health status expressions
This updates the Elasticsearch health status expressions used in Prometheus, Nagios and Grafana. The previous Prometheus rule defined for Elasticsearch health checked for a status > 0, which triggers the alarm on a green health status. The correct returned values are: 1 for green, 0 for both red and yellow. This changes the expression to use arithmetic operators so that the result maps to: 2 for green, 1 for yellow, 0 for red. This also updates the Elasticsearch dashboard in Grafana to add a new value mapping for the updated scale (2 = green, 1 = yellow, 0 = red). Finally, this also updates the Nagios service check to be a bit more verbose in its output. For reference, see: https://github.com/justwatchcom/elasticsearch_exporter/issues/120 Change-Id: I6ef2a7c308c6ebfdb693b46127a285bceb6ba872
This commit is contained in:
parent
6bd70a9fc6
commit
9e5a295465
@ -5805,14 +5805,14 @@ conf:
|
|||||||
show: true
|
show: true
|
||||||
tableColumn: ''
|
tableColumn: ''
|
||||||
targets:
|
targets:
|
||||||
- expr: sum(elasticsearch_cluster_health_status{cluster=~"$cluster"})
|
- expr: (sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="green"})*2)+sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="yellow"})
|
||||||
format: time_series
|
format: time_series
|
||||||
intervalFactor: 2
|
intervalFactor: 3
|
||||||
legendFormat: ''
|
legendFormat: ''
|
||||||
metric: ''
|
metric: ''
|
||||||
refId: A
|
refId: A
|
||||||
step: 40
|
step: 40
|
||||||
thresholds: '0,1'
|
thresholds: '0,1,2'
|
||||||
title: Cluster health status
|
title: Cluster health status
|
||||||
transparent: false
|
transparent: false
|
||||||
type: singlestat
|
type: singlestat
|
||||||
@ -5820,6 +5820,9 @@ conf:
|
|||||||
valueMaps:
|
valueMaps:
|
||||||
- op: "="
|
- op: "="
|
||||||
text: GREEN
|
text: GREEN
|
||||||
|
value: '2'
|
||||||
|
- op: "="
|
||||||
|
text: YELLOW
|
||||||
value: '1'
|
value: '1'
|
||||||
- op: "="
|
- op: "="
|
||||||
text: RED
|
text: RED
|
||||||
|
@ -851,7 +851,7 @@ conf:
|
|||||||
- check_es_cluster_health_status:
|
- check_es_cluster_health_status:
|
||||||
use: generic-service
|
use: generic-service
|
||||||
service_description: ES_cluster-health-status
|
service_description: ES_cluster-health-status
|
||||||
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
- check_es_cluster_number_nodes_running:
|
- check_es_cluster_number_nodes_running:
|
||||||
use: generic-service
|
use: generic-service
|
||||||
|
@ -2038,12 +2038,12 @@ conf:
|
|||||||
description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
|
description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
|
||||||
summary: 'Elasticsearch cluster health status calls are timing out.'
|
summary: 'Elasticsearch cluster health status calls are timing out.'
|
||||||
- alert: es_cluster_health_status_alert
|
- alert: es_cluster_health_status_alert
|
||||||
expr: elasticsearch_cluster_health_status > 0
|
expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
|
description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
|
||||||
summary: 'Elasticsearch cluster health status is not green.'
|
summary: 'Elasticsearch cluster health status is not green.'
|
||||||
- alert: es_cluster_health_too_few_nodes_running
|
- alert: es_cluster_health_too_few_nodes_running
|
||||||
expr: elasticsearch_cluster_health_number_of_nodes < 3
|
expr: elasticsearch_cluster_health_number_of_nodes < 3
|
||||||
@ -2061,6 +2061,14 @@ conf:
|
|||||||
annotations:
|
annotations:
|
||||||
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
|
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
|
||||||
summary: 'ElasticSearch running on less than 3 data nodes'
|
summary: 'ElasticSearch running on less than 3 data nodes'
|
||||||
|
- alert: es_cluster_health_too_few_data_nodes_running
|
||||||
|
expr: elasticsearch_cluster_health_number_of_data_nodes < 3
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
|
||||||
|
summary: 'ElasticSearch running on less than 3 data nodes'
|
||||||
mariadb:
|
mariadb:
|
||||||
groups:
|
groups:
|
||||||
- name: mariadb.rules
|
- name: mariadb.rules
|
||||||
|
Loading…
Reference in New Issue
Block a user