Monitor postgresql, Openstack virt resources, api, logs, pod and nodes status

Fixing opebstack API monitors

Adding additional neutron services monitors
Adding new Pod CrashLoopBaackOff status check
Adding new Host readiness check

Updated the nagios image reference(https://review.gerrithub.io/c/att-comdev/nagios/+/420590 - Pending)

This updated image provides a mechanism for querying Elasticsearch
with the goal of triggering alerts based on specified applications
and log levels.

Finally, this moves the endpoints resulting from the authenticated
endpoint lookups required for Nagios to the nagios secret instead
of handled via plain text environment variables

Change-Id: I517d8e6e6e8fa1d359382be8a131a8e45bf243e2
This commit is contained in:
rakesh-patnaik 2018-07-03 20:19:56 +00:00 committed by Chris Wedgwood
parent f2271a60a6
commit db0d653b4d
4 changed files with 452 additions and 34 deletions

View File

@ -129,8 +129,6 @@ spec:
- name: nagios
containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
env:
- name: PROMETHEUS_SERVICE
value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
- name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
- name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@ -139,6 +137,16 @@ spec:
value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
- name: REST_NOTIF_SECONDARY_TARGET_URL
value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
- name: PROMETHEUS_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: PROMETHEUS_SERVICE
- name: ELASTICSEARCH_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: ELASTICSEARCH_SERVICE
- name: NAGIOSADMIN_USER
valueFrom:
secretKeyRef:

View File

@ -17,6 +17,8 @@ limitations under the License.
{{- if .Values.manifests.secret_nagios }}
{{- $envAll := . }}
{{- $secretName := index $envAll.Values.secrets.nagios.admin }}
{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
---
apiVersion: v1
kind: Secret
@ -28,4 +30,6 @@ data:
NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
{{- end }}

View File

@ -19,7 +19,7 @@
images:
tags:
apache_proxy: docker.io/httpd:2.4
nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
@ -137,6 +137,24 @@ endpoints:
port:
ldap:
default: 389
elasticsearch:
name: elasticsearch
namespace: null
auth:
admin:
username: admin
password: changeme
hosts:
default: elasticsearch-logging
host_fqdn_override:
default: null
path:
default: /
scheme:
default: http
port:
http:
default: 80
network:
nagios:
@ -292,7 +310,7 @@ conf:
AuthUserFile /usr/local/apache2/conf/.htpasswd
AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
Require valid-user
</Proxy>
</VirtualHost>
@ -356,10 +374,10 @@ conf:
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
- send_service_http_post:
command_name: send_service_http_post
command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- send_host_http_post:
command_name: send_host_http_post
command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- check_prometheus_host_alive:
command_name: check-prometheus-host-alive
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@ -369,6 +387,9 @@ conf:
- check_prom_alert:
command_name: check_prom_alert
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
- check_es_alert:
command_name: check_es_alert
command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
- check_filespace_mounts-usage-rate-fullin4hrs:
command_name: check_filespace_mounts-usage-rate-fullin4hrs
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@ -432,6 +453,9 @@ conf:
- check_ceph_health:
command_name: check_ceph_health
command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
- check_prometheus_hosts:
command_name: check_prometheus_hosts
command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
services:
- notifying_service:
name: notifying_service
@ -449,6 +473,12 @@ conf:
service_description: "CEPH_health"
check_command: check_ceph_health
check_interval: 60
- check_hosts_health:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Nodes_health"
check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.
check_interval: 60
- check_prometheus_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -515,6 +545,12 @@ conf:
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_error_crash_loop_back_off:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-crashLoopBackOff"
check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
check_interval: 60
- check_replicaset_missing_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -531,31 +567,66 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_glance"
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_interval: 60
- check_nova_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_nova"
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_interval: 60
- check_keystone_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_keystone"
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_interval: 60
- check_neutron_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_neutron"
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_interval: 60
- check_neutron_metadata_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-metadata-agent"
check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
check_interval: 60
- check_neutron_openvswitch_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-openvswitch-agent"
check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
check_interval: 60
- check_neutron_dhcp_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-dhcp-agent"
check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
check_interval: 60
- check_neutron_l3_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-l3-agent"
check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron l3 agents are up
check_interval: 60
- check_swift_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_swift"
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_cinder"
check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
- check_glance_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_heat"
check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
@ -573,25 +644,43 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-compute"
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
check_interval: 60
- check_service_nova_conductor:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-conductor"
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
check_interval: 60
- check_service_nova_consoleauth:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-consoleauth"
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
check_interval: 60
- check_service_nova_scheduler:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-scheduler"
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
check_command: check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
check_interval: 60
- check_os_vm_vcpu_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_VCPU-usage"
check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_ram_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_RAM-usage"
check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_disk_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_Disk-usage"
check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
check_interval: 60
- check_ceph_monitor_quorum:
use: notifying_service
@ -777,6 +866,107 @@ conf:
service_description: Mariadb_innodb-replication-lag
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_prometheus_hosts:
use: notifying_service
service_description: Prometheus_hosts-update
check_command: check_prometheus_hosts
hostgroup_name: prometheus-hosts
check_interval: 900
- check_postgresql_replication_lag:
use: generic-service
service_description: Postgresql_replication-lag
check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_postgresql_connections:
use: generic-service
service_description: Postgresql_connections
check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
hostgroup_name: prometheus-hosts
- check_postgresql_deadlocks:
use: generic-service
service_description: Postgresql_deadlocks
check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
hostgroup_name: prometheus-hosts
- check_prom_exporter_ceph:
use: generic-service
service_description: Prometheus-exporter_CEPH
check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_openstack:
use: generic-service
service_description: Prometheus-exporter_Openstack
check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_mariadb:
use: generic-service
service_description: Prometheus-exporter_MariaDB
check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_kube_state_metrics:
use: generic-service
service_description: Prometheus-exporter_Kube-state-metrics
check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_postgresql:
use: generic-service
service_description: Prometheus-exporter_Postgresql
check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_node:
use: generic-service
service_description: Prometheus-exporter_Node
check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_calico:
use: generic-service
service_description: Prometheus-exporter_Calico
check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_elasticsearch:
use: generic-service
service_description: Prometheus-exporter_Elasticsearch
check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_fluentd:
use: generic-service
service_description: Prometheus-exporter_Fluentd
check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_logmon_glance:
use: generic-service
service_description: Logmon_glance-error
check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_keystone:
use: generic-service
service_description: Logmon_keystone-error
check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_nova:
use: generic-service
service_description: Logmon_nova-error
check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_neutron:
use: generic-service
service_description: Logmon_neutron-error
check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_cinder:
use: generic-service
service_description: Logmon_cinder-error
check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_heat:
use: generic-service
service_description: Logmon_heat-error
check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_horizon:
use: generic-service
service_description: Logmon_horizon-error
check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
hostgroup_name: prometheus-hosts
- check_filespace_mounts-usage-rate-fullin4hrs:
use: notifying_service
hostgroup_name: base-os

View File

@ -1185,6 +1185,14 @@ conf:
annotations:
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} nodes are notReady state.'
summary: One or more Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
@ -1296,7 +1304,7 @@ conf:
annotations:
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
summary: '{{$labels.statefulset}}: has inssuficient replicas.'
- alert: kube_daemonsets_misscheduled
- alert: daemonsets_misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
@ -1304,7 +1312,7 @@ conf:
annotations:
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
summary: 'Daemonsets not scheduled correctly'
- alert: kube_daemonsets_not_scheduled
- alert: daemonsets_not_scheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
@ -1312,7 +1320,7 @@ conf:
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
- alert: kube_deployment_replicas_unavailable
- alert: deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
labels:
@ -1320,7 +1328,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
summary: '{{$labels.deployment}}: has inssuficient replicas.'
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
- alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
for: 10m
labels:
@ -1328,7 +1336,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
- alert: kube_job_status_failed
- alert: job_status_failed
expr: kube_job_status_failed > 0
for: 10m
labels:
@ -1336,7 +1344,7 @@ conf:
annotations:
description: 'Job {{$labels.exported_job}} is in failed status'
summary: '{{$labels.exported_job}} has failed status'
- alert: kube_pod_status_pending
- alert: pod_status_pending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
@ -1344,7 +1352,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: kube_pod_error_image_pull
- alert: pod_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1352,7 +1360,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_pod_status_error_image_pull
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1360,7 +1368,15 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_replicaset_missing_replicas
- alert: pod_error_crash_loop_back_off
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m
labels:
@ -1368,7 +1384,7 @@ conf:
annotations:
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
- alert: kube_pod_container_terminated
- alert: pod_container_terminated
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
for: 10m
labels:
@ -1618,7 +1634,7 @@ conf:
- name: openstack.rules
rules:
- alert: os_glance_api_availability
expr: check_glance_api != 1
expr: openstack_check_glance_api != 1
for: 5m
labels:
severity: page
@ -1626,7 +1642,7 @@ conf:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: check_nova_api != 1
expr: openstack_check_nova_api != 1
for: 5m
labels:
severity: page
@ -1634,7 +1650,7 @@ conf:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: check_keystone_api != 1
expr: openstack_check_keystone_api != 1
for: 5m
labels:
severity: page
@ -1642,15 +1658,47 @@ conf:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: check_neutron_api != 1
expr: openstack_check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_neutron_metadata_agent_availability
expr: openstack_services_neutron_metadata_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
summary: 'One or more neutron metadata_agents are not available'
- alert: os_neutron_openvswitch_agent_availability
expr: openstack_services_neutron_openvswitch_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
summary: 'One or more neutron openvswitch agents are not available'
- alert: os_neutron_dhcp_agent_availability
expr: openstack_services_neutron_dhcp_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
summary: 'One or more neutron dhcp agents are not available'
- alert: os_neutron_l3_agent_availability
expr: openstack_services_neutron_l3_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron L3 agents are not available for more than 5 minutes'
summary: 'One or more neutron L3 agents are not available'
- alert: os_swift_api_availability
expr: check_swift_api != 1
expr: openstack_check_swift_api != 1
for: 5m
labels:
severity: page
@ -1673,8 +1721,16 @@ conf:
annotations:
description: 'Cinder scheduler is not available for more than 5 minutes'
summary: 'Cinder scheduler is not available'
- alert: os_heat_api_availability
expr: openstack_check_heat_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Heat API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: services_nova_compute_disabled_total > 0
expr: openstack_services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
@ -1682,7 +1738,7 @@ conf:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: services_nova_conductor_disabled_total > 0
expr: openstack_services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
@ -1690,7 +1746,7 @@ conf:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: services_nova_consoleauth_disabled_total > 0
expr: openstack_services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
@ -1698,13 +1754,69 @@ conf:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: services_nova_scheduler_disabled_total > 0
expr: openstack_services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
- alert: os_nova_compute_down
expr: openstack_services_nova_compute_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is down on some hosts'
- alert: os_nova_conductor_down
expr: openstack_services_nova_conductor_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is down on some hosts'
- alert: os_nova_consoleauth_down
expr: openstack_services_nova_consoleauth_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is down on some hosts'
- alert: os_nova_scheduler_down
expr: openstack_services_nova_scheduler_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is down on some hosts'
- alert: os_vm_vcpu_usage_high
expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM vcpu usage is hight at {{$value}} percent'
summary: 'Openstack VM vcpu usage is high'
- alert: os_vm_ram_usage_high
expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM RAM usage is hight at {{$value}} percent'
summary: 'Openstack VM RAM usage is high'
- alert: os_vm_disk_usage_high
expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM Disk usage is hight at {{$value}} percent'
summary: 'Openstack VM Disk usage is high'
ceph:
groups:
- name: ceph.rules
@ -1989,3 +2101,107 @@ conf:
annotations:
description: 'The mysql innodb replication has fallen behind and is not recovering'
summary: 'MySQL innodb replication is lagging'
postgresql:
groups:
- name: postgresql.rules
rules:
- alert: pg_replication_fallen_behind
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
title: Postgres Replication lag is over 2 minutes
- alert: pg_connections_too_high
expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
for: 5m
labels:
severity: warn
channel: database
annotations:
title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
- alert: pg_deadlocks_detected
expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0
for: 5m
labels:
severity: warn
annotations:
description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}
title: Postgres server is experiencing deadlocks
prometheus_exporters:
groups:
- name: prometheus_exporters.rules
rules:
- alert: prom_exporter_ceph_unavailable
expr: absent(ceph_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
title: Ceph exporter is not collecting metrics or is not available
- alert: prom_exporter_openstack_unavailable
expr: absent(openstack_exporter_cache_refresh_duration_seconds)
for: 10m
labels:
severity: warning
annotations:
description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
title: Openstack exporter is not collecting metrics or is not available
- alert: prom_exporter_mariadb_unavailable
expr: absent(mysql_up)
for: 10m
labels:
severity: warning
annotations:
description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available
- alert: prom_exporter_kube_state_metrics_unavailable
expr: absent(kube_node_info)
for: 10m
labels:
severity: warning
annotations:
description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
title: kube-state-metrics exporter is not collecting metrics or is not available
- alert: prom_exporter_postgresql_unavailable
expr: absent(pg_static)
for: 10m
labels:
severity: warning
annotations:
description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
title: postgresql exporter is not collecting metrics or is not available
- alert: prom_exporter_node_unavailable
expr: absent(node_uname_info)
for: 10m
labels:
severity: warning
annotations:
description: node exporter is not collecting metrics or is not available for past 10 minutes
title: node exporter is not collecting metrics or is not available
- alert: prom_exporter_calico_unavailable
expr: absent(felix_host)
for: 10m
labels:
severity: warning
annotations:
description: Calico exporter is not collecting metrics or is not available for past 10 minutes
title: Calico exporter is not collecting metrics or is not available
- alert: prom_exporter_elasticsearch_unavailable
expr: absent(elasticsearch_cluster_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
title: Elasticsearch exporter is not collecting metrics or is not available
- alert: prom_exporter_fluentd_unavailable
expr: absent(fluentd_up)
for: 10m
labels:
severity: warning
annotations:
description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
title: Fluentd exporter is not collecting metrics or is not available