Update Elasticsearch health status expressions

This updates the Elasticsearch health status expressions used in
Prometheus, Nagios and Grafana. The previous Prometheus rule
defined for Elasticsearch health alerted whenever the status was
> 0, which triggered an alarm even when the health status was
green. The correct returned values are: 1 for green, 0 for both
red and yellow. This changes the expression to use arithmetic
operators to give us a result that maps to: 2 for green, 1 for
yellow, 0 for red.

This also updates the Elasticsearch dashboard in Grafana to add a
new value mapping for the updated scale (2 = green, 1 = yellow,
0 = red).

Finally, this also updates the Nagios service check to be a bit
more verbose in its output.

For reference, see:
https://github.com/justwatchcom/elasticsearch_exporter/issues/120

Change-Id: I6ef2a7c308c6ebfdb693b46127a285bceb6ba872
This commit is contained in:
Steve Wilkerson 2019-01-16 11:00:25 -06:00
parent 6bd70a9fc6
commit 9e5a295465
3 changed files with 17 additions and 6 deletions

View File

@ -5805,14 +5805,14 @@ conf:
show: true show: true
tableColumn: '' tableColumn: ''
targets: targets:
- expr: sum(elasticsearch_cluster_health_status{cluster=~"$cluster"}) - expr: (sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="green"})*2)+sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="yellow"})
format: time_series format: time_series
intervalFactor: 2 intervalFactor: 3
legendFormat: '' legendFormat: ''
metric: '' metric: ''
refId: A refId: A
step: 40 step: 40
thresholds: '0,1' thresholds: '0,1,2'
title: Cluster health status title: Cluster health status
transparent: false transparent: false
type: singlestat type: singlestat
@ -5820,6 +5820,9 @@ conf:
valueMaps: valueMaps:
- op: "=" - op: "="
text: GREEN text: GREEN
value: '2'
- op: "="
text: YELLOW
value: '1' value: '1'
- op: "=" - op: "="
text: RED text: RED

View File

@ -851,7 +851,7 @@ conf:
- check_es_cluster_health_status: - check_es_cluster_health_status:
use: generic-service use: generic-service
service_description: ES_cluster-health-status service_description: ES_cluster-health-status
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green. check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
hostgroup_name: prometheus-hosts hostgroup_name: prometheus-hosts
- check_es_cluster_number_nodes_running: - check_es_cluster_number_nodes_running:
use: generic-service use: generic-service

View File

@ -2038,12 +2038,12 @@ conf:
description: 'Elasticsearch cluster health status call timedout {{ $value }} times.' description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
summary: 'Elasticsearch cluster health status calls are timing out.' summary: 'Elasticsearch cluster health status calls are timing out.'
- alert: es_cluster_health_status_alert - alert: es_cluster_health_status_alert
expr: elasticsearch_cluster_health_status > 0 expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.' description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
summary: 'Elasticsearch cluster health status is not green.' summary: 'Elasticsearch cluster health status is not green.'
- alert: es_cluster_health_too_few_nodes_running - alert: es_cluster_health_too_few_nodes_running
expr: elasticsearch_cluster_health_number_of_nodes < 3 expr: elasticsearch_cluster_health_number_of_nodes < 3
@ -2061,6 +2061,14 @@ conf:
annotations: annotations:
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
summary: 'ElasticSearch running on less than 3 data nodes' summary: 'ElasticSearch running on less than 3 data nodes'
- alert: es_cluster_health_too_few_data_nodes_running
expr: elasticsearch_cluster_health_number_of_data_nodes < 3
for: 10m
labels:
severity: warning
annotations:
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
summary: 'ElasticSearch running on less than 3 data nodes'
mariadb: mariadb:
groups: groups:
- name: mariadb.rules - name: mariadb.rules