From 9e5a29546562773d7c244f961cba254302f6c785 Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Wed, 16 Jan 2019 11:00:25 -0600 Subject: [PATCH] Update Elasticsearch health status expressions This updates the Elasticsearch health status expressions used in Prometheus, Nagios and Grafana. The previous Prometheus rule defined for Elasticsearch health checked for a status that was > 0 to trigger an alarm for a green health status. The correct returned values are: 1 for green, 0 for both red and yellow. This changes the expression to use arithmetic operators to give us a result that maps to: 2 for green, 1 for yellow, 0 for red. This also updates the Elasticsearch dashboard in Grafana to add a new mapping for the updated 2 (green), 1 (yellow), 0 (red) scale. This also updates the Nagios service check to be a bit more verbose in its output. Finally, this raises the intervalFactor of the Grafana health status query from 2 to 3 and adds an es_cluster_health_too_few_data_nodes_running alert rule to Prometheus. For reference, see: https://github.com/justwatchcom/elasticsearch_exporter/issues/120 Change-Id: I6ef2a7c308c6ebfdb693b46127a285bceb6ba872 --- grafana/values.yaml | 9 ++++++--- nagios/values.yaml | 2 +- prometheus/values.yaml | 12 ++++++++++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/grafana/values.yaml b/grafana/values.yaml index f1cce35b2..c1272c61a 100644 --- a/grafana/values.yaml +++ b/grafana/values.yaml @@ -5805,14 +5805,14 @@ conf: show: true tableColumn: '' targets: - - expr: sum(elasticsearch_cluster_health_status{cluster=~"$cluster"}) + - expr: (sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="green"})*2)+sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="yellow"}) format: time_series - intervalFactor: 2 + intervalFactor: 3 legendFormat: '' metric: '' refId: A step: 40 - thresholds: '0,1' + thresholds: '0,1,2' title: Cluster health status transparent: false type: singlestat @@ -5820,6 +5820,9 @@ conf: valueMaps: - op: "=" text: GREEN + value: '2' + - op: "=" + text: YELLOW value: '1' - op: "=" text: RED diff --git a/nagios/values.yaml b/nagios/values.yaml index 
a11df1d58..4bd5b5f59 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -851,7 +851,7 @@ conf: - check_es_cluster_health_status: use: generic-service service_description: ES_cluster-health-status - check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green. + check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green. hostgroup_name: prometheus-hosts - check_es_cluster_number_nodes_running: use: generic-service diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 5b00260dd..28ce99e46 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -2038,12 +2038,12 @@ conf: description: 'Elasticsearch cluster health status call timedout {{ $value }} times.' summary: 'Elasticsearch cluster health status calls are timing out.' - alert: es_cluster_health_status_alert - expr: elasticsearch_cluster_health_status > 0 + expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2 for: 10m labels: severity: warning annotations: - description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.' + description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.' summary: 'Elasticsearch cluster health status is not green.' 
- alert: es_cluster_health_too_few_nodes_running expr: elasticsearch_cluster_health_number_of_nodes < 3 @@ -2061,6 +2061,14 @@ conf: annotations: description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' summary: 'ElasticSearch running on less than 3 data nodes' + - alert: es_cluster_health_too_few_data_nodes_running + expr: elasticsearch_cluster_health_number_of_data_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' + summary: 'ElasticSearch running on less than 3 data nodes' mariadb: groups: - name: mariadb.rules