From 52c980b10cdc14c49883576965a79376c0f21253 Mon Sep 17 00:00:00 2001
From: Rakesh Patnaik
Date: Tue, 24 Apr 2018 21:16:42 +0000
Subject: [PATCH] Prometheus alerts, nagios defn - rabbitmq,mariadb,ES

Change-Id: I71bc9f42aebc268ad2383a5a36a3405fc47c6c9e
---
 nagios/values.yaml     | 105 ++++++++++++++++++++++++
 prometheus/values.yaml | 180 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 285 insertions(+)

diff --git a/nagios/values.yaml b/nagios/values.yaml
index f1a820ca6..c5fea267c 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -543,6 +543,111 @@ conf:
           service_description: Calico_datapane_failures_high
           check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
           hostgroup_name: prometheus-hosts
+      - check_rabbitmq_network_partitions_detected:
+          use: generic-service
+          service_description: Rabbitmq_network-partitions-exist
+          check_command: check_prom_alert!rabbitmq_network_partitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_available:
+          use: generic-service
+          service_description: Rabbitmq_up
+          check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_fd_usage:
+          use: generic-service
+          service_description: Rabbitmq_file-descriptor-usage
+          check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage of more than 80 percent!OK- rabbitmq file descriptor usage is normal
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_node_disk_alarm:
+          use: generic-service
+          service_description: Rabbitmq_node-disk-alarm
+          check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_node_memory_alarm:
+          use: generic-service
+          service_description: Rabbitmq_node-memory-alarm
+          check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
+          hostgroup_name: prometheus-hosts
+      - check_rabbitmq_availability:
+          use: generic-service
+          service_description: Rabbitmq_high-availability
+          check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
+          hostgroup_name: prometheus-hosts
+      - check_queue_message_return_percent:
+          use: generic-service
+          service_description: Rabbitmq_message-return-percent
+          check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has a high percentage of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
+          hostgroup_name: prometheus-hosts
+      - check_queue_consumer_util:
+          use: generic-service
+          service_description: Rabbitmq_consumer-utilization
+          check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
+          hostgroup_name: prometheus-hosts
+      - check_queue_load:
+          use: generic-service
+          service_description: Rabbitmq_rabbitmq-queue-health
+          check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
+          hostgroup_name: prometheus-hosts
+      - check_es_high_process_open_file_count:
+          use: generic-service
+          service_description: ES_high-process-open-file-count
+          check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has a high process open file count!OK- Elasticsearch process open file count is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_high_process_cpu_percent:
+          use: generic-service
+          service_description: ES_high-process-cpu-percent
+          check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU usage!OK- Elasticsearch process CPU usage is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_fs_usage:
+          use: generic-service
+          service_description: ES_high-filesystem-usage
+          check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
+          hostgroup_name: prometheus-hosts
+      - check_es_unassigned_shards:
+          use: generic-service
+          service_description: ES_unassigned-shards
+          check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_health_timedout:
+          use: generic-service
+          service_description: ES_cluster-health-timedout
+          check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch cluster health status call timed out!OK- Elasticsearch cluster health is retrievable.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_health_status:
+          use: generic-service
+          service_description: ES_cluster-health-status
+          check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_number_nodes_running:
+          use: generic-service
+          service_description: ES_cluster-running-node-count
+          check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
+          hostgroup_name: prometheus-hosts
+      - check_es_cluster_number_data_nodes_running:
+          use: generic-service
+          service_description: ES_cluster-running-data-node-count
+          check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_table_lock_waits:
+          use: generic-service
+          service_description: Mariadb_table-lock-waits-high
+          check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has a high number of table lock waits!OK- No issues found with table lock waits.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_node_ready:
+          use: generic-service
+          service_description: Mariadb_node-ready
+          check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_node_out_of_sync:
+          use: generic-service
+          service_description: Mariadb_node-synchronized
+          check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
+          hostgroup_name: prometheus-hosts
+      - check_mariadb_innodb_replication_lag:
+          use: generic-service
+          service_description: Mariadb_innodb-replication-lag
+          check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and is not recovering!OK- innodb replication lag is nominal.
+          hostgroup_name: prometheus-hosts
       - check_filespace_mounts-usage-rate-fullin4hrs:
           use: notifying_service
           hostgroup_name: base-os
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 24b6cebd7..0c1ae2909 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1309,3 +1309,183 @@ conf:
         annotations:
           description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
           summary: 'A high number of iptable restore errors within Felix are happening'
+  rabbitmq:
+    groups:
+    - name: rabbitmq.rules
+      rules:
+      - alert: rabbitmq_network_partitions_detected
+        expr: min(partitions) by(instance) > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
+          summary: 'RabbitMQ network partitions detected'
+      - alert: rabbitmq_down
+        expr: min(rabbitmq_up) by(instance) != 1
+        for: 10m
+        labels:
+          severity: page
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
+          summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 mins'
+      - alert: rabbitmq_file_descriptor_usage_high
+        expr: fd_used * 100 / fd_total > 80
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
+          summary: 'RabbitMQ file descriptor usage has been high for the last 10 mins'
+      - alert: rabbitmq_node_disk_free_alarm
+        expr: node_disk_free_alarm > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has low free disk space available.'
+          summary: 'RabbitMQ disk space usage is high'
+      - alert: rabbitmq_node_memory_alarm
+        expr: node_mem_alarm > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
+          summary: 'RabbitMQ memory usage is high'
+      - alert: rabbitmq_less_than_3_nodes
+        expr: running < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server has less than 3 nodes running.'
+          summary: 'RabbitMQ server is at risk of losing data'
+      - alert: rabbitmq_queue_messages_returned_high
+        expr: queue_messages_returned_total / queue_messages_published_total * 100 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
+          summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
+      - alert: rabbitmq_consumers_low_utilization
+        expr: queue_consumer_utilisation < 0.4
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ consumer message consumption speed is low'
+          summary: 'RabbitMQ consumer message consumption speed is low'
+      - alert: rabbitmq_high_message_load
+        expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: 'RabbitMQ has a high message load. Total queue depth > 17000 or queue depth has grown by more than 4000 messages in the last 5 minutes.'
+          summary: 'RabbitMQ has a high message load'
+  elasticsearch:
+    groups:
+    - name: elasticsearch.rules
+      rules:
+      - alert: es_high_process_open_files_count
+        expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.host }} has a process open file count of more than 64000.'
+          summary: 'Elasticsearch has a very high process open file count.'
+      - alert: es_high_process_cpu_percent
+        expr: elasticsearch_process_cpu_percent > 95
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.instance }} has high process CPU usage of {{ $value }} percent.'
+          summary: 'Elasticsearch process CPU usage is more than 95 percent.'
+      - alert: es_fs_usage_high
+        expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }} percent.'
+          summary: 'Elasticsearch filesystem usage is high.'
+      - alert: es_unassigned_shards
+        expr: elasticsearch_cluster_health_unassigned_shards > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch has {{ $value }} unassigned shards.'
+          summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
+      - alert: es_cluster_health_timed_out
+        expr: elasticsearch_cluster_health_timed_out > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
+          summary: 'Elasticsearch cluster health status calls are timing out.'
+      - alert: es_cluster_health_status_alert
+        expr: elasticsearch_cluster_health_status > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
+          summary: 'Elasticsearch cluster health status is not green.'
+      - alert: es_cluster_health_too_few_nodes_running
+        expr: elasticsearch_cluster_health_number_of_nodes < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'There are only {{ $value }} (< 3) Elasticsearch nodes running'
+          summary: 'Elasticsearch is running on fewer than 3 nodes'
+      - alert: es_cluster_health_too_few_data_nodes_running
+        expr: elasticsearch_cluster_health_number_of_data_nodes < 3
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'There are only {{ $value }} (< 3) Elasticsearch data nodes running'
+          summary: 'Elasticsearch is running on fewer than 3 data nodes'
+  mariadb:
+    groups:
+    - name: mariadb.rules
+      rules:
+      - alert: mariadb_table_lock_wait_high
+        expr: 100 * mysql_global_status_table_locks_waited / (mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'Mariadb has a high table lock wait rate of {{ $value }} percent'
+          summary: 'Mariadb table lock waits are high'
+      - alert: mariadb_node_not_ready
+        expr: mysql_global_status_wsrep_ready != 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $labels.job }} on {{ $labels.instance }} is not ready.'
+          summary: 'Galera cluster node not ready'
+      - alert: mariadb_galera_node_out_of_sync
+        expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $labels.job }} on {{ $labels.instance }} is not in sync ({{ $value }} != 4)'
+          summary: 'Galera cluster node out of sync'
+      - alert: mariadb_innodb_replication_fallen_behind
+        expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: 'MySQL InnoDB replication has fallen behind and is not recovering'
+          summary: 'MySQL InnoDB replication is lagging'
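
For context on how the Nagios side consumes these rules: every check_command above calls the chart's check_prom_alert plugin with an alert name, a CRITICAL message, and an OK message. The sketch below is only an illustration of that contract, not the chart's actual plugin. It assumes a reachable Prometheus HTTP API (the PROMETHEUS_URL value and the script name are hypothetical) and uses the built-in ALERTS series to decide the service state.

#!/usr/bin/env python3
# Illustrative sketch of a check_prom_alert-style plugin: alert name in, Nagios state out.
# The real plugin shipped with the chart may differ; the endpoint below is an assumed placeholder.
import sys
import requests  # third-party HTTP client, assumed available

PROMETHEUS_URL = "http://prometheus:9090"  # hypothetical address; configured per deployment

def check_prom_alert(alert_name, critical_msg, ok_msg):
    """Return a Nagios exit code based on whether the named alert is firing."""
    # Prometheus exposes pending/firing alerts as the synthetic ALERTS series.
    query = 'ALERTS{alertname="%s",alertstate="firing"}' % alert_name
    resp = requests.get("%s/api/v1/query" % PROMETHEUS_URL,
                        params={"query": query}, timeout=10)
    resp.raise_for_status()
    firing = resp.json()["data"]["result"]
    if firing:
        # Fill placeholders such as {instance} from the labels of the first firing series.
        print(critical_msg.format(**firing[0]["metric"]))
        return 2  # Nagios CRITICAL
    print(ok_msg)
    return 0  # Nagios OK

if __name__ == "__main__":
    # Usage mirrors the '!'-separated arguments in the check_command definitions, e.g.:
    #   check_prom_alert.py rabbitmq_down "CRITICAL- Rabbitmq instance {instance} is down" "OK- rabbitmq is available"
    sys.exit(check_prom_alert(sys.argv[1], sys.argv[2], sys.argv[3]))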