Update Elasticsearch health status expressions

This updates the Elasticsearch health status expressions used in Prometheus, Nagios and Grafana. The previous Prometheus rule for Elasticsearch health fired whenever the status was > 0, which raised an alarm for a green health status. The values actually returned are: 1 for green, 0 for both red and yellow. This changes the expression to use arithmetic operators so that the result maps to: 2 for green, 1 for yellow, 0 for red. This also updates the Elasticsearch dashboard in Grafana to add a new value mapping for the updated scale (2 = green, 1 = yellow, 0 = red). Finally, this also updates the Nagios service check to be a bit more verbose in its output. For reference, see: https://github.com/justwatchcom/elasticsearch_exporter/issues/120 Change-Id: I6ef2a7c308c6ebfdb693b46127a285bceb6ba872
2019-01-16 11:00:25 -06:00 · 2019-01-16 11:00:25 -06:00 · 9e5a295465
commit 9e5a295465
parent 6bd70a9fc6
3 changed files with 17 additions and 6 deletions
--- a/grafana/values.yaml
+++ b/grafana/values.yaml
@@ -5805,14 +5805,14 @@ conf:
            show: true
          tableColumn: ''
          targets:
-          - expr: sum(elasticsearch_cluster_health_status{cluster=~"$cluster"})
+          - expr: (sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="green"})*2)+sum(elasticsearch_cluster_health_status{cluster=~"$cluster",color="yellow"})
            format: time_series
-            intervalFactor: 2
+            intervalFactor: 3
            legendFormat: ''
            metric: ''
            refId: A
            step: 40
-          thresholds: '0,1'
+          thresholds: '0,1,2'
          title: Cluster health status
          transparent: false
          type: singlestat
@@ -5820,6 +5820,9 @@ conf:
          valueMaps:
          - op: "="
            text: GREEN
+            value: '2'
+          - op: "="
+            text: YELLOW
            value: '1'
          - op: "="
            text: RED
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -851,7 +851,7 @@ conf:
      - check_es_cluster_health_status:
          use: generic-service
          service_description: ES_cluster-health-status
-          check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
+          check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
          hostgroup_name: prometheus-hosts
      - check_es_cluster_number_nodes_running:
          use: generic-service
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -2038,12 +2038,12 @@ conf:
              description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
              summary: 'Elasticsearch cluster health status calls are timing out.'
          - alert: es_cluster_health_status_alert
-            expr: elasticsearch_cluster_health_status > 0
+            expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
+              description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
              summary: 'Elasticsearch cluster health status is not green.'
          - alert: es_cluster_health_too_few_nodes_running
            expr: elasticsearch_cluster_health_number_of_nodes < 3
@@ -2061,6 +2061,14 @@ conf:
            annotations:
              description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
              summary: 'ElasticSearch running on less than 3 data nodes'
+          - alert: es_cluster_health_too_few_data_nodes_running
+            expr: elasticsearch_cluster_health_number_of_data_nodes < 3
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
+              summary: 'ElasticSearch running on less than 3 data nodes'
      mariadb:
        groups:
        - name: mariadb.rules