Merge "Monitor postgresql, Openstack virt resources, api, logs, pod and nodes status"

Zuul 2018-09-21 12:12:00 +00:00 committed by Gerrit Code Review
commit 4cd00f3ac5
4 changed files with 452 additions and 34 deletions

View File

@ -129,8 +129,6 @@ spec:
- name: nagios
containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
env:
- name: PROMETHEUS_SERVICE
value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
- name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
- name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@ -139,6 +137,16 @@ spec:
value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
- name: REST_NOTIF_SECONDARY_TARGET_URL
value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
- name: PROMETHEUS_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: PROMETHEUS_SERVICE
- name: ELASTICSEARCH_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: ELASTICSEARCH_SERVICE
- name: NAGIOSADMIN_USER
valueFrom:
secretKeyRef:

View File

@ -17,6 +17,8 @@ limitations under the License.
{{- if .Values.manifests.secret_nagios }}
{{- $envAll := . }}
{{- $secretName := index $envAll.Values.secrets.nagios.admin }}
{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
---
apiVersion: v1
kind: Secret
@ -28,4 +30,6 @@ data:
NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
{{- end }}
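For orientation: a minimal sketch (not the chart's actual check_rest_get_api.py plugin) of how a Nagios-side script can consume the PROMETHEUS_SERVICE value that now reaches the pod through this secret instead of a plain env value; the 5 s warning / 10 s critical thresholds mirror the check-prometheus-host-alive command defined in values.yaml below.

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's check_rest_get_api.py plugin.
# Reads the PROMETHEUS_SERVICE env var injected from the secret above and
# turns a simple GET into a Nagios exit code.
import os
import sys
import time

import requests

OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3


def main():
    url = os.environ.get("PROMETHEUS_SERVICE")  # authenticated endpoint URI from the secret
    if not url:
        print("UNKNOWN: PROMETHEUS_SERVICE is not set")
        return UNKNOWN
    start = time.time()
    try:
        resp = requests.get(url, timeout=10)
        elapsed = time.time() - start
    except requests.RequestException as exc:
        print("CRITICAL: {} unreachable: {}".format(url, exc))
        return CRITICAL
    if resp.status_code >= 500 or elapsed > 10:
        print("CRITICAL: {} answered {} in {:.1f}s".format(url, resp.status_code, elapsed))
        return CRITICAL
    if elapsed > 5:
        print("WARNING: {} answered in {:.1f}s".format(url, elapsed))
        return WARNING
    print("OK: {} answered {} in {:.1f}s".format(url, resp.status_code, elapsed))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```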

View File

@ -19,7 +19,7 @@
images:
tags:
apache_proxy: docker.io/httpd:2.4
nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
@ -137,6 +137,24 @@ endpoints:
port:
ldap:
default: 389
elasticsearch:
name: elasticsearch
namespace: null
auth:
admin:
username: admin
password: changeme
hosts:
default: elasticsearch-logging
host_fqdn_override:
default: null
path:
default: /
scheme:
default: http
port:
http:
default: 80
network:
nagios:
@ -292,7 +310,7 @@ conf:
AuthUserFile /usr/local/apache2/conf/.htpasswd
AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
Require valid-user
</Proxy>
</VirtualHost>
@ -356,10 +374,10 @@ conf:
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
- send_service_http_post:
command_name: send_service_http_post
command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- send_host_http_post:
command_name: send_host_http_post
command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- check_prometheus_host_alive:
command_name: check-prometheus-host-alive
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@ -369,6 +387,9 @@ conf:
- check_prom_alert:
command_name: check_prom_alert
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
- check_es_alert:
command_name: check_es_alert
command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
- check_filespace_mounts-usage-rate-fullin4hrs:
command_name: check_filespace_mounts-usage-rate-fullin4hrs
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@ -432,6 +453,9 @@ conf:
- check_ceph_health:
command_name: check_ceph_health
command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
- check_prometheus_hosts:
command_name: check_prometheus_hosts
command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
services:
- notifying_service:
name: notifying_service
@ -449,6 +473,12 @@ conf:
service_description: "CEPH_health"
check_command: check_ceph_health
check_interval: 60
- check_hosts_health:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Nodes_health"
check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.!OK- All nodes are ready
check_interval: 60
- check_prometheus_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -515,6 +545,12 @@ conf:
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_error_crash_loop_back_off:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-crashLoopBackOff"
check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
check_interval: 60
- check_replicaset_missing_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -531,31 +567,66 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_glance"
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_interval: 60
- check_nova_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_nova"
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_interval: 60
- check_keystone_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_keystone"
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_interval: 60
- check_neutron_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_neutron"
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_interval: 60
- check_neutron_metadata_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-metadata-agent"
check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
check_interval: 60
- check_neutron_openvswitch_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-openvswitch-agent"
check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
check_interval: 60
- check_neutron_dhcp_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-dhcp-agent"
check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
check_interval: 60
- check_neutron_l3_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-l3-agent"
check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
check_interval: 60
- check_swift_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_swift"
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_cinder"
check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
- check_heat_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_heat"
check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
@ -573,25 +644,43 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-compute"
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
check_interval: 60
- check_service_nova_conductor:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-conductor"
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
check_interval: 60
- check_service_nova_consoleauth:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-consoleauth"
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
check_interval: 60
- check_service_nova_scheduler:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-scheduler"
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
check_command: check_prom_alert!os_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
check_interval: 60
- check_os_vm_vcpu_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_VCPU-usage"
check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_ram_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_RAM-usage"
check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_disk_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_Disk-usage"
check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
check_interval: 60
- check_ceph_monitor_quorum:
use: notifying_service
@ -777,6 +866,107 @@ conf:
service_description: Mariadb_innodb-replication-lag
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_prometheus_hosts:
use: notifying_service
service_description: Prometheus_hosts-update
check_command: check_prometheus_hosts
hostgroup_name: prometheus-hosts
check_interval: 900
- check_postgresql_replication_lag:
use: generic-service
service_description: Postgresql_replication-lag
check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_postgresql_connections:
use: generic-service
service_description: Postgresql_connections
check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
hostgroup_name: prometheus-hosts
- check_postgresql_deadlocks:
use: generic-service
service_description: Postgresql_deadlocks
check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
hostgroup_name: prometheus-hosts
- check_prom_exporter_ceph:
use: generic-service
service_description: Prometheus-exporter_CEPH
check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_openstack:
use: generic-service
service_description: Prometheus-exporter_Openstack
check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_mariadb:
use: generic-service
service_description: Prometheus-exporter_MariaDB
check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_kube_state_metrics:
use: generic-service
service_description: Prometheus-exporter_Kube-state-metrics
check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_postgresql:
use: generic-service
service_description: Prometheus-exporter_Postgresql
check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_node:
use: generic-service
service_description: Prometheus-exporter_Node
check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_calico:
use: generic-service
service_description: Prometheus-exporter_Calico
check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_elasticsearch:
use: generic-service
service_description: Prometheus-exporter_Elasticsearch
check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_fluentd:
use: generic-service
service_description: Prometheus-exporter_Fluentd
check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_logmon_glance:
use: generic-service
service_description: Logmon_glance-error
check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_keystone:
use: generic-service
service_description: Logmon_keystone-error
check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_nova:
use: generic-service
service_description: Logmon_nova-error
check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_neutron:
use: generic-service
service_description: Logmon_neutron-error
check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_cinder:
use: generic-service
service_description: Logmon_cinder-error
check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_heat:
use: generic-service
service_description: Logmon_heat-error
check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_horizon:
use: generic-service
service_description: Logmon_horizon-error
check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
hostgroup_name: prometheus-hosts
- check_filespace_mounts-usage-rate-fullin4hrs:
use: notifying_service
hostgroup_name: base-os
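The new check_es_alert command above drives log monitoring through the ELASTICSEARCH_SERVICE URI added in the secret. A rough sketch of that pattern (not the chart's check_elasticsearch_query.py; the index pattern and field names are assumptions), using the glance arguments from the Logmon_glance-error service:

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's check_elasticsearch_query.py plugin.
# Approximates a check_es_alert run: count recent log entries at the given
# levels and go CRITICAL past a threshold. Index pattern and field names
# ("logger", "type", "level") are assumptions, not taken from the chart.
import os
import sys

import requests

OK, CRITICAL, UNKNOWN = 0, 2, 3


def count_errors(es_url, logger, range_mins, levels, es_type):
    query = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"logger": logger}},    # assumed field name
                    {"match": {"type": es_type}},     # assumed field name
                    {"terms": {"level": levels}},     # assumed field name
                    {"range": {"@timestamp": {"gte": "now-{}m".format(range_mins)}}},
                ]
            }
        }
    }
    # assumed fluentd-style index pattern
    resp = requests.get("{}/logstash-*/_count".format(es_url.rstrip("/")),
                        json=query, timeout=30)
    resp.raise_for_status()
    return resp.json()["count"]


def main():
    es_url = os.environ.get("ELASTICSEARCH_SERVICE")
    if not es_url:
        print("UNKNOWN: ELASTICSEARCH_SERVICE is not set")
        return UNKNOWN
    try:
        hits = count_errors(es_url, "glance", 15, ["CRITICAL", "ERROR"],
                            "oslo_openstack_fluentd")
    except requests.RequestException as exc:
        print("UNKNOWN: query failed: {}".format(exc))
        return UNKNOWN
    if hits > 10:
        print("CRITICAL: {} error-level glance log entries in the last 15 minutes".format(hits))
        return CRITICAL
    print("OK: {} error-level glance log entries in the last 15 minutes".format(hits))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```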

View File

@ -1185,6 +1185,14 @@ conf:
annotations:
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} nodes are in NotReady state.'
summary: One or more Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
@ -1296,7 +1304,7 @@ conf:
annotations:
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
summary: '{{$labels.statefulset}}: has insufficient replicas.'
- alert: kube_daemonsets_misscheduled
- alert: daemonsets_misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
@ -1304,7 +1312,7 @@ conf:
annotations:
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
summary: 'Daemonsets not scheduled correctly'
- alert: kube_daemonsets_not_scheduled
- alert: daemonsets_not_scheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
@ -1312,7 +1320,7 @@ conf:
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
- alert: kube_deployment_replicas_unavailable
- alert: deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
labels:
@ -1320,7 +1328,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
summary: '{{$labels.deployment}}: has insufficient replicas.'
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
- alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
for: 10m
labels:
@ -1328,7 +1336,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
- alert: kube_job_status_failed
- alert: job_status_failed
expr: kube_job_status_failed > 0
for: 10m
labels:
@ -1336,7 +1344,7 @@ conf:
annotations:
description: 'Job {{$labels.exported_job}} is in failed status'
summary: '{{$labels.exported_job}} has failed status'
- alert: kube_pod_status_pending
- alert: pod_status_pending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
@ -1344,7 +1352,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: kube_pod_error_image_pull
- alert: pod_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1352,7 +1360,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_pod_status_error_image_pull
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1360,7 +1368,15 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_replicaset_missing_replicas
- alert: pod_error_crash_loop_back_off
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m
labels:
@ -1368,7 +1384,7 @@ conf:
annotations:
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
- alert: kube_pod_container_terminated
- alert: pod_container_terminated
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
for: 10m
labels:
@ -1618,7 +1634,7 @@ conf:
- name: openstack.rules
rules:
- alert: os_glance_api_availability
expr: check_glance_api != 1
expr: openstack_check_glance_api != 1
for: 5m
labels:
severity: page
@ -1626,7 +1642,7 @@ conf:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: check_nova_api != 1
expr: openstack_check_nova_api != 1
for: 5m
labels:
severity: page
@ -1634,7 +1650,7 @@ conf:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: check_keystone_api != 1
expr: openstack_check_keystone_api != 1
for: 5m
labels:
severity: page
@ -1642,15 +1658,47 @@ conf:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: check_neutron_api != 1
expr: openstack_check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_neutron_metadata_agent_availability
expr: openstack_services_neutron_metadata_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron metadata agents are not available for more than 5 minutes'
summary: 'One or more neutron metadata agents are not available'
- alert: os_neutron_openvswitch_agent_availability
expr: openstack_services_neutron_openvswitch_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
summary: 'One or more neutron openvswitch agents are not available'
- alert: os_neutron_dhcp_agent_availability
expr: openstack_services_neutron_dhcp_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
summary: 'One or more neutron dhcp agents are not available'
- alert: os_neutron_l3_agent_availability
expr: openstack_services_neutron_l3_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron L3 agents are not available for more than 5 minutes'
summary: 'One or more neutron L3 agents are not available'
- alert: os_swift_api_availability
expr: check_swift_api != 1
expr: openstack_check_swift_api != 1
for: 5m
labels:
severity: page
@ -1673,8 +1721,16 @@ conf:
annotations:
description: 'Cinder scheduler is not available for more than 5 minutes'
summary: 'Cinder scheduler is not available'
- alert: os_heat_api_availability
expr: openstack_check_heat_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Heat API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: services_nova_compute_disabled_total > 0
expr: openstack_services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
@ -1682,7 +1738,7 @@ conf:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: services_nova_conductor_disabled_total > 0
expr: openstack_services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
@ -1690,7 +1746,7 @@ conf:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: services_nova_consoleauth_disabled_total > 0
expr: openstack_services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
@ -1698,13 +1754,69 @@ conf:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: services_nova_scheduler_disabled_total > 0
expr: openstack_services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
- alert: os_nova_compute_down
expr: openstack_services_nova_compute_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is down on some hosts'
- alert: os_nova_conductor_down
expr: openstack_services_nova_conductor_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is down on some hosts'
- alert: os_nova_consoleauth_down
expr: openstack_services_nova_consoleauth_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is down on some hosts'
- alert: os_nova_scheduler_down
expr: openstack_services_nova_scheduler_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is down on some hosts'
- alert: os_vm_vcpu_usage_high
expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM vcpu usage is high at {{$value}} percent'
summary: 'Openstack VM vcpu usage is high'
- alert: os_vm_ram_usage_high
expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM RAM usage is high at {{$value}} percent'
summary: 'Openstack VM RAM usage is high'
- alert: os_vm_disk_usage_high
expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM Disk usage is high at {{$value}} percent'
summary: 'Openstack VM Disk usage is high'
ceph:
groups:
- name: ceph.rules
@ -1989,3 +2101,107 @@ conf:
annotations:
description: 'The mysql innodb replication has fallen behind and is not recovering'
summary: 'MySQL innodb replication is lagging'
postgresql:
groups:
- name: postgresql.rules
rules:
- alert: pg_replication_fallen_behind
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
title: Postgres Replication lag is over 2 minutes
- alert: pg_connections_too_high
expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
for: 5m
labels:
severity: warn
channel: database
annotations:
title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
- alert: pg_deadlocks_detected
expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0
for: 5m
labels:
severity: warn
annotations:
description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}
title: Postgres server is experiencing deadlocks
prometheus_exporters:
groups:
- name: prometheus_exporters.rules
rules:
- alert: prom_exporter_ceph_unavailable
expr: absent(ceph_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
title: Ceph exporter is not collecting metrics or is not available
- alert: prom_exporter_openstack_unavailable
expr: absent(openstack_exporter_cache_refresh_duration_seconds)
for: 10m
labels:
severity: warning
annotations:
description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
title: Openstack exporter is not collecting metrics or is not available
- alert: prom_exporter_mariadb_unavailable
expr: absent(mysql_up)
for: 10m
labels:
severity: warning
annotations:
description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available
- alert: prom_exporter_kube_state_metrics_unavailable
expr: absent(kube_node_info)
for: 10m
labels:
severity: warning
annotations:
description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
title: kube-state-metrics exporter is not collecting metrics or is not available
- alert: prom_exporter_postgresql_unavailable
expr: absent(pg_static)
for: 10m
labels:
severity: warning
annotations:
description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
title: postgresql exporter is not collecting metrics or is not available
- alert: prom_exporter_node_unavailable
expr: absent(node_uname_info)
for: 10m
labels:
severity: warning
annotations:
description: node exporter is not collecting metrics or is not available for past 10 minutes
title: node exporter is not collecting metrics or is not available
- alert: prom_exporter_calico_unavailable
expr: absent(felix_host)
for: 10m
labels:
severity: warning
annotations:
description: Calico exporter is not collecting metrics or is not available for past 10 minutes
title: Calico exporter is not collecting metrics or is not available
- alert: prom_exporter_elasticsearch_unavailable
expr: absent(elasticsearch_cluster_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
title: Elasticsearch exporter is not collecting metrics or is not available
- alert: prom_exporter_fluentd_unavailable
expr: absent(fluentd_up)
for: 10m
labels:
severity: warning
annotations:
description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
title: Fluentd exporter is not collecting metrics or is not available
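The alert rules in this file are what the check_prom_alert services poll through Nagios. A rough sketch of that handshake (not the chart's query_prometheus_alerts.py), assuming the standard Prometheus /api/v1/alerts endpoint:

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's query_prometheus_alerts.py plugin.
# Shows the check_prom_alert pattern: ask Prometheus which alerts are firing,
# match on alertname, and translate that into a Nagios exit code.
import os
import sys

import requests

OK, CRITICAL, UNKNOWN = 0, 2, 3


def firing_alerts(prometheus_api, alertname):
    # Standard Prometheus HTTP API; lists alerts currently pending or firing.
    resp = requests.get("{}/api/v1/alerts".format(prometheus_api.rstrip("/")), timeout=30)
    resp.raise_for_status()
    alerts = resp.json()["data"]["alerts"]
    return [a for a in alerts
            if a["labels"].get("alertname") == alertname and a.get("state") == "firing"]


def main():
    prometheus_api = os.environ.get("PROMETHEUS_SERVICE")
    alertname = sys.argv[1] if len(sys.argv) > 1 else "K8SNodesNotReady"
    if not prometheus_api:
        print("UNKNOWN: PROMETHEUS_SERVICE is not set")
        return UNKNOWN
    try:
        matches = firing_alerts(prometheus_api, alertname)
    except (requests.RequestException, KeyError, ValueError) as exc:
        print("UNKNOWN: could not query Prometheus: {}".format(exc))
        return UNKNOWN
    if matches:
        print("CRITICAL: {} instance(s) of alert {} firing".format(len(matches), alertname))
        return CRITICAL
    print("OK: alert {} is not firing".format(alertname))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```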