From 52c980b10cdc14c49883576965a79376c0f21253 Mon Sep 17 00:00:00 2001
From: Rakesh Patnaik
Date: Tue, 24 Apr 2018 21:16:42 +0000
Subject: [PATCH] Prometheus alerts, nagios defn - rabbitmq,mariadb,ES

Change-Id: I71bc9f42aebc268ad2383a5a36a3405fc47c6c9e
---
 nagios/values.yaml     | 105 ++++++++++++++++++++++++
 prometheus/values.yaml | 180 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 285 insertions(+)

diff --git a/nagios/values.yaml b/nagios/values.yaml
index f1a820ca6..c5fea267c 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -543,6 +543,111 @@ conf:
           service_description: Calico_datapane_failures_high
           check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
           hostgroup_name: prometheus-hosts
+      - check_rabbitmq_network_partitions_detected:
+          use: generic-service
+          service_description: Rabbitmq_network-partitions-exist
+          check_command: check_prom_alert!rabbitmq_network_partitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_available:
+          use: generic-service
+          service_description: Rabbitmq_up
+          check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_fd_usage:
+          use: generic-service
+          service_description: Rabbitmq_file-descriptor-usage
+          check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage of more than 80 percent!OK- rabbitmq file descriptor usage is normal
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_node_disk_alarm:
+          use: generic-service
+          service_description: Rabbitmq_node-disk-alarm
+          check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_node_memory_alarm:
+          use: generic-service
+          service_description: Rabbitmq_node-memory-alarm
+          check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_availability:
+          use: generic-service
+          service_description: Rabbitmq_high-availability
+          check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
+          hostgroup_name: prometheus-hosts
+      - check_queue_message_return_percent:
+          use: generic-service
+          service_description: Rabbitmq_message-return-percent
+          check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has a high percentage of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
+          hostgroup_name: prometheus-hosts
+      - check_queue_consumer_util:
+          use: generic-service
+          service_description: Rabbitmq_consumer-utilization
+          check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
+          hostgroup_name: prometheus-hosts
+      - check_queue_load:
+          use: generic-service
+          service_description: Rabbitmq_rabbitmq-queue-health
+          check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
+          hostgroup_name: prometheus-hosts
+      - check_es_high_process_open_file_count:
+          use: generic-service
+          service_description: ES_high-process-open-file-count
+          check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has a high process open file count!OK- Elasticsearch process open file count is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_high_process_cpu_percent:
+          use: generic-service
+          service_description: ES_high-process-cpu-percent
+          check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU usage!OK- Elasticsearch process CPU usage is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_fs_usage:
+          use: generic-service
+          service_description: ES_high-filesystem-usage
+          check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_unassigned_shards:
+          use: generic-service
+          service_description: ES_unassigned-shards
+          check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_health_timedout:
+          use: generic-service
+          service_description: ES_cluster-health-timedout
+          check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch cluster health status call timed out!OK- Elasticsearch cluster health is retrievable.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_health_status:
+          use: generic-service
+          service_description: ES_cluster-health-status
+          check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_number_nodes_running:
+          use: generic-service
+          service_description: ES_cluster-running-node-count
+          check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_number_data_nodes_running:
+          use: generic-service
+          service_description: ES_cluster-running-data-node-count
+          check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_table_lock_waits:
+          use: generic-service
+          service_description: Mariadb_table-lock-waits-high
+          check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has a high number of table lock waits!OK- No issues found with table lock waits.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_node_ready:
+          use: generic-service
+          service_description: Mariadb_node-ready
+          check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_node_out_of_sync:
+          use: generic-service
+          service_description: Mariadb_node-synchronized
+          check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_innodb_replication_lag:
+          use: generic-service
+          service_description: Mariadb_innodb-replication-lag
+          check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and is not recovering!OK- innodb replication lag is nominal.
+          hostgroup_name: prometheus-hosts
       - check_filespace_mounts-usage-rate-fullin4hrs:
           use: notifying_service
           hostgroup_name: base-os
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 24b6cebd7..0c1ae2909 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1309,3 +1309,183 @@ conf:
         annotations:
           description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
           summary: 'A high number of iptable restore errors within Felix are happening'
+  rabbitmq:
+    groups:
+    - name: rabbitmq.rules
+      rules:
+      - alert: rabbitmq_network_partitions_detected
+        expr: min(partitions) by(instance) > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
+          summary: 'RabbitMQ network partitions detected'
+      - alert: rabbitmq_down
+        expr: min(rabbitmq_up) by(instance) != 1
+        for: 10m
+        labels:
+          severity: page
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
+          summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 mins'
+      - alert: rabbitmq_file_descriptor_usage_high
+        expr: fd_used * 100 / fd_total > 80
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
+          summary: 'RabbitMQ file descriptor usage has been high for the last 10 mins'
+      - alert: rabbitmq_node_disk_free_alarm
+        expr: node_disk_free_alarm > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has low free disk space available.'
+          summary: 'RabbitMQ disk space usage is high'
+      - alert: rabbitmq_node_memory_alarm
+        expr: node_mem_alarm > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
+          summary: 'RabbitMQ memory usage is high'
+      - alert: rabbitmq_less_than_3_nodes
+        expr: running < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server has less than 3 nodes running.'
+          summary: 'RabbitMQ server is at risk of losing data'
+      - alert: rabbitmq_queue_messages_returned_high
+        expr: queue_messages_returned_total / queue_messages_published_total * 100 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
+          summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
+      - alert: rabbitmq_consumers_low_utilization
+        expr: queue_consumer_utilisation < 0.4
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ consumer message consumption speed is low'
+          summary: 'RabbitMQ consumer message consumption speed is low'
+      - alert: rabbitmq_high_message_load
+        expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ has a high message load. Total queue depth > 17000 or queue depth has grown by more than 4000 messages in the last 5 minutes.'
+          summary: 'RabbitMQ has a high message load'
+  elasticsearch:
+    groups:
+    - name: elasticsearch.rules
+      rules:
+      - alert: es_high_process_open_files_count
+        expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.host }} has a process open file count of more than 64000.'
+          summary: 'Elasticsearch has a very high process open file count.'
+      - alert: es_high_process_cpu_percent
+        expr: elasticsearch_process_cpu_percent > 95
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.instance }} has high process CPU usage of {{ $value }} percent.'
+          summary: 'Elasticsearch process CPU usage is more than 95 percent.'
+      - alert: es_fs_usage_high
+        expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }} percent.'
+          summary: 'Elasticsearch filesystem usage is high.'
+      - alert: es_unassigned_shards
+        expr: elasticsearch_cluster_health_unassigned_shards > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch has {{ $value }} unassigned shards.'
+          summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
+      - alert: es_cluster_health_timed_out
+        expr: elasticsearch_cluster_health_timed_out > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
+          summary: 'Elasticsearch cluster health status calls are timing out.'
+      - alert: es_cluster_health_status_alert
+        expr: elasticsearch_cluster_health_status > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
+          summary: 'Elasticsearch cluster health status is not green.'
+      - alert: es_cluster_health_too_few_nodes_running
+        expr: elasticsearch_cluster_health_number_of_nodes < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'There are only {{ $value }} (< 3) Elasticsearch nodes running'
+          summary: 'Elasticsearch is running on fewer than 3 nodes'
+      - alert: es_cluster_health_too_few_data_nodes_running
+        expr: elasticsearch_cluster_health_number_of_data_nodes < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'There are only {{ $value }} (< 3) Elasticsearch data nodes running'
+          summary: 'Elasticsearch is running on fewer than 3 data nodes'
+  mariadb:
+    groups:
+    - name: mariadb.rules
+      rules:
+      - alert: mariadb_table_lock_wait_high
+        expr: 100 * mysql_global_status_table_locks_waited / (mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Mariadb has a high table lock wait rate of {{ $value }} percent'
+          summary: 'Mariadb table lock waits are high'
+      - alert: mariadb_node_not_ready
+        expr: mysql_global_status_wsrep_ready != 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $labels.job }} on {{ $labels.instance }} is not ready.'
+          summary: 'Galera cluster node not ready'
+      - alert: mariadb_galera_node_out_of_sync
+        expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $labels.job }} on {{ $labels.instance }} is not in sync ({{ $value }} != 4)'
+          summary: 'Galera cluster node out of sync'
+      - alert: mariadb_innodb_replication_fallen_behind
+        expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'MySQL InnoDB replication has fallen behind and is not recovering'
+          summary: 'MySQL InnoDB replication is lagging'
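
For context on how the Nagios side consumes these rules: every check_command above calls the chart's check_prom_alert plugin with an alert name, a CRITICAL message, and an OK message. The sketch below is only an illustration of that contract, not the chart's actual plugin. It assumes a reachable Prometheus HTTP API (the PROMETHEUS_URL value and the script name are hypothetical) and uses the built-in ALERTS series to decide the service state.

#!/usr/bin/env python3
# Illustrative sketch of a check_prom_alert-style plugin: alert name in, Nagios state out.
# The real plugin shipped with the chart may differ; the endpoint below is an assumed placeholder.
import sys
import requests  # third-party HTTP client, assumed available

PROMETHEUS_URL = "http://prometheus:9090"  # hypothetical address; configured per deployment

def check_prom_alert(alert_name, critical_msg, ok_msg):
    """Return a Nagios exit code based on whether the named alert is firing."""
    # Prometheus exposes pending/firing alerts as the synthetic ALERTS series.
    query = 'ALERTS{alertname="%s",alertstate="firing"}' % alert_name
    resp = requests.get("%s/api/v1/query" % PROMETHEUS_URL,
                        params={"query": query}, timeout=10)
    resp.raise_for_status()
    firing = resp.json()["data"]["result"]
    if firing:
        # Fill placeholders such as {instance} from the labels of the first firing series.
        print(critical_msg.format(**firing[0]["metric"]))
        return 2  # Nagios CRITICAL
    print(ok_msg)
    return 0  # Nagios OK

if __name__ == "__main__":
    # Usage mirrors the '!'-separated arguments in the check_command definitions, e.g.:
    #   check_prom_alert.py rabbitmq_down "CRITICAL- Rabbitmq instance {instance} is down" "OK- rabbitmq is available"
    sys.exit(check_prom_alert(sys.argv[1], sys.argv[2], sys.argv[3]))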