fbd34421f2
This updates the Prometheus chart to support federation. The Prometheus configuration file is now defined via a template in the values.yaml file instead of through raw yaml, which allows the chart's default configuration to be overridden wholesale, as would be required for a hierarchical federated setup. All of the default rules defined in the chart are stripped out for the same reason; example rules for the various aspects of OSH's infrastructure are defined in the prometheus/values_overrides directory and are executed as part of the normal CI jobs.

This also adds a nonvoting federated-monitoring job that vets out the ability to federate Prometheus in a hierarchical fashion with extremely basic overrides.

Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26
Signed-off-by: Steve Wilkerson <sw5822@att.com>
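As a rough illustration of the kind of wholesale override the template-driven configuration makes possible, a parent Prometheus in a hierarchical setup could replace the chart's default scrape configuration with a federation job along these lines. This is a minimal sketch only: the conf.prometheus.scrape_configs.template key is assumed here, and the child service address and match[] selector are placeholders, not values taken from this change.

conf:
  prometheus:
    # Assumed values key for the templated configuration; check the chart's
    # values.yaml for the actual path introduced by this change.
    scrape_configs:
      template: |
        scrape_configs:
          # Standard Prometheus federation job: pull selected series from a
          # child Prometheus over its /federate endpoint.
          - job_name: federate
            honor_labels: true
            metrics_path: /federate
            params:
              'match[]':
                - '{job=~".+"}'
            static_configs:
              - targets:
                  # Placeholder child endpoint; substitute the real service.
                  - child-prometheus.osh-infra.svc.cluster.local:9090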
106 lines
4.9 KiB
YAML
conf:
  prometheus:
    rules:
      logging:
        groups:
          - name: fluentd.rules
            rules:
              - alert: prom_exporter_fluentd_unavailable
                expr: absent(fluentd_up)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
                  title: Fluentd exporter is not collecting metrics or is not available
              - alert: fluentd_not_running
                expr: fluentd_up == 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
                  summary: 'Fluentd is down'
          - name: elasticsearch.rules
            rules:
              - alert: prom_exporter_elasticsearch_unavailable
                expr: absent(elasticsearch_cluster_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
                  title: Elasticsearch exporter is not collecting metrics or is not available
              - alert: es_high_process_open_files_count
                expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.'
                  summary: 'Elasticsearch has a very high process open file count.'
              - alert: es_high_process_cpu_percent
                expr: elasticsearch_process_cpu_percent > 95
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.'
                  summary: 'Elasticsearch process cpu usage is more than 95 percent.'
              - alert: es_fs_usage_high
                expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.'
                  summary: 'Elasticsearch filesystem usage is high.'
              - alert: es_unassigned_shards
                expr: elasticsearch_cluster_health_unassigned_shards > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch has {{ $value }} unassigned shards.'
                  summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
              - alert: es_cluster_health_timed_out
                expr: elasticsearch_cluster_health_timed_out > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
                  summary: 'Elasticsearch cluster health status calls are timing out.'
              - alert: es_cluster_health_status_alert
                expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
                  summary: 'Elasticsearch cluster health status is not green.'
              - alert: es_cluster_health_too_few_nodes_running
                expr: elasticsearch_cluster_health_number_of_nodes < 3
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
                  summary: 'ElasticSearch running on less than 3 nodes'
              - alert: es_cluster_health_too_few_data_nodes_running
                expr: elasticsearch_cluster_health_number_of_data_nodes < 3
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
                  summary: 'ElasticSearch running on less than 3 data nodes'
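For reference, a worked reading of the es_cluster_health_status_alert expression above, assuming the Elasticsearch exporter's usual encoding (one elasticsearch_cluster_health_status gauge per colour, set to 1 only for the cluster's current colour):

# Score = sum(status{color="green"}) * 2 + sum(status{color="yellow"})
#   green  cluster: 1*2 + 0 = 2  -> expression false, no alert
#   yellow cluster: 0*2 + 1 = 1  -> 1 < 2, alert fires after 10m
#   red    cluster: 0*2 + 0 = 0  -> 0 < 2, alert fires after 10m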