From db0d653b4d2b7f37bfea205d95b9cbb3e83e14fb Mon Sep 17 00:00:00 2001 From: rakesh-patnaik Date: Tue, 3 Jul 2018 20:19:56 +0000 Subject: [PATCH] Monitor postgresql, Openstack virt resources, api, logs, pod and nodes status Fixing openstack API monitors Adding additional neutron services monitors Adding new Pod CrashLoopBackOff status check Adding new Host readiness check Updated the nagios image reference (https://review.gerrithub.io/c/att-comdev/nagios/+/420590 - Pending) This updated image provides a mechanism for querying Elasticsearch with the goal of triggering alerts based on specified applications and log levels. Finally, this moves the endpoints resulting from the authenticated endpoint lookups required for Nagios to the nagios secret instead of being handled via plain text environment variables Change-Id: I517d8e6e6e8fa1d359382be8a131a8e45bf243e2 --- nagios/templates/deployment.yaml | 12 +- nagios/templates/secret-nagios.yaml | 4 + nagios/values.yaml | 216 +++++++++++++++++++++-- prometheus/values.yaml | 254 +++++++++++++++++++++++++--- 4 files changed, 452 insertions(+), 34 deletions(-) diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml index 09b030252..fb469192a 100644 --- a/nagios/templates/deployment.yaml +++ b/nagios/templates/deployment.yaml @@ -129,8 +129,6 @@ spec: - name: nagios containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} env: - - name: PROMETHEUS_SERVICE - value: {{ tuple "monitoring" "internal" "admin" "http" . 
| include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }} - name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }} - name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT @@ -139,6 +137,16 @@ spec: value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }} - name: REST_NOTIF_SECONDARY_TARGET_URL value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }} + - name: PROMETHEUS_SERVICE + valueFrom: + secretKeyRef: + name: {{ $nagiosUserSecret }} + key: PROMETHEUS_SERVICE + - name: ELASTICSEARCH_SERVICE + valueFrom: + secretKeyRef: + name: {{ $nagiosUserSecret }} + key: ELASTICSEARCH_SERVICE - name: NAGIOSADMIN_USER valueFrom: secretKeyRef: diff --git a/nagios/templates/secret-nagios.yaml b/nagios/templates/secret-nagios.yaml index 56155f5db..0ec0b341a 100644 --- a/nagios/templates/secret-nagios.yaml +++ b/nagios/templates/secret-nagios.yaml @@ -17,6 +17,8 @@ limitations under the License. {{- if .Values.manifests.secret_nagios }} {{- $envAll := . }} {{- $secretName := index $envAll.Values.secrets.nagios.admin }} +{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }} +{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . 
| include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }} --- apiVersion: v1 kind: Secret @@ -28,4 +30,6 @@ data: NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }} BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }} BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }} + PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }} + ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }} {{- end }} diff --git a/nagios/values.yaml b/nagios/values.yaml index 207cb1dff..83fd664c4 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -19,7 +19,7 @@ images: tags: apache_proxy: docker.io/httpd:2.4 - nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3 + nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1 image_repo_sync: docker.io/docker:17.07.0 pull_policy: IfNotPresent @@ -137,6 +137,24 @@ endpoints: port: ldap: default: 389 + elasticsearch: + name: elasticsearch + namespace: null + auth: + admin: + username: admin + password: changeme + hosts: + default: elasticsearch-logging + host_fqdn_override: + default: null + path: + default: / + scheme: + default: http + port: + http: + default: 80 network: nagios: @@ -292,7 +310,7 @@ conf: AuthUserFile /usr/local/apache2/conf/.htpasswd AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }} AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }} - AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }} + AuthLDAPURL {{ tuple "ldap" "default" "ldap" . 
| include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }} Require valid-user @@ -356,10 +374,10 @@ conf: command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'" - send_service_http_post: command_name: send_service_http_post - command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'" + command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'" - send_host_http_post: command_name: send_host_http_post - command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'" + command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'" - check_prometheus_host_alive: command_name: check-prometheus-host-alive command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10" @@ -369,6 +387,9 @@ conf: - check_prom_alert: command_name: check_prom_alert command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'" + - check_es_alert: + command_name: check_es_alert + command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'" - check_filespace_mounts-usage-rate-fullin4hrs: command_name: check_filespace_mounts-usage-rate-fullin4hrs command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 
'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal' @@ -432,6 +453,9 @@ conf: - check_ceph_health: command_name: check_ceph_health command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0 + - check_prometheus_hosts: + command_name: check_prometheus_hosts + command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg services: - notifying_service: name: notifying_service @@ -449,6 +473,12 @@ conf: service_description: "CEPH_health" check_command: check_ceph_health check_interval: 60 + - check_hosts_health: + use: generic-service + hostgroup_name: prometheus-hosts + service_description: "Nodes_health" + check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.!OK- All nodes are ready.
+ check_interval: 60 - check_prometheus_replicas: use: notifying_service hostgroup_name: prometheus-hosts @@ -515,6 +545,12 @@ conf: service_description: "Pod_status-error-image-pull" check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status check_interval: 60 + - check_pod_error_crash_loop_back_off: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "Pod_status-crashLoopBackOff" + check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status + check_interval: 60 - check_replicaset_missing_replicas: use: notifying_service hostgroup_name: prometheus-hosts @@ -531,31 +567,66 @@ conf: use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_glance" - check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available + check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available check_interval: 60 - check_nova_api: use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_nova" - check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available + check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available check_interval: 60 - check_keystone_api: use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_keystone" - check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available + check_command: 
check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available check_interval: 60 - check_neutron_api: use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_neutron" - check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available + check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available + check_interval: 60 + - check_neutron_metadata_agent: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "Service_neutron-metadata-agent" + check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up + check_interval: 60 + - check_neutron_openvswitch_agent: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "Service_neutron-openvswitch-agent" + check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up + check_interval: 60 + - check_neutron_dhcp_agent: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "Service_neutron-dhcp-agent" + check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up + check_interval: 60 + - check_neutron_l3_agent: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "Service_neutron-l3-agent" + check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up check_interval: 60 - check_swift_api: use: notifying_service hostgroup_name: prometheus-hosts service_description: 
"API_swift" - check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available + check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available + check_interval: 60 + - check_cinder_api: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "API_cinder" + check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available + - check_heat_api: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "API_heat" + check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available check_interval: 60 - check_cinder_api: use: notifying_service @@ -573,25 +644,43 @@ conf: use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-compute" - check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts + check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts check_interval: 60 - check_service_nova_conductor: use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-conductor" - check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts + check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts check_interval: 60 - check_service_nova_consoleauth: use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-consoleauth" - 
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts + check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts check_interval: 60 - check_service_nova_scheduler: use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-scheduler" - check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts + check_command: check_prom_alert!os_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts + check_interval: 60 + - check_os_vm_vcpu_usage: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "OS-Total-Quota_VCPU-usage" + check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available. + check_interval: 60 + - check_os_vm_ram_usage: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "OS-Total-Quota_RAM-usage" + check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available. + check_interval: 60 + - check_os_vm_disk_usage: + use: notifying_service + hostgroup_name: prometheus-hosts + service_description: "OS-Total-Quota_Disk-usage" + check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available. 
check_interval: 60 - check_ceph_monitor_quorum: use: notifying_service @@ -777,6 +866,107 @@ conf: service_description: Mariadb_innodb-replication-lag check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal. hostgroup_name: prometheus-hosts + - check_prometheus_hosts: + use: notifying_service + service_description: Prometheus_hosts-update + check_command: check_prometheus_hosts + hostgroup_name: prometheus-hosts + check_interval: 900 + - check_postgresql_replication_lag: + use: generic-service + service_description: Postgresql_replication-lag + check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal. + hostgroup_name: prometheus-hosts + - check_postgresql_connections: + use: generic-service + service_description: Postgresql_connections + check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds. + hostgroup_name: prometheus-hosts + - check_postgresql_deadlocks: + use: generic-service + service_description: Postgresql_deadlocks + check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks. + hostgroup_name: prometheus-hosts + - check_prom_exporter_ceph: + use: generic-service + service_description: Prometheus-exporter_CEPH + check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available. 
+ hostgroup_name: prometheus-hosts + - check_prom_exporter_openstack: + use: generic-service + service_description: Prometheus-exporter_Openstack + check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_mariadb: + use: generic-service + service_description: Prometheus-exporter_MariaDB + check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_kube_state_metrics: + use: generic-service + service_description: Prometheus-exporter_Kube-state-metrics + check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_postgresql: + use: generic-service + service_description: Prometheus-exporter_Postgresql + check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_node: + use: generic-service + service_description: Prometheus-exporter_Node + check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_calico: + use: generic-service + service_description: Prometheus-exporter_Calico + check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available. 
+ hostgroup_name: prometheus-hosts + - check_prom_exporter_elasticsearch: + use: generic-service + service_description: Prometheus-exporter_Elasticsearch + check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_prom_exporter_fluentd: + use: generic-service + service_description: Prometheus-exporter_Fluentd + check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available. + hostgroup_name: prometheus-hosts + - check_logmon_glance: + use: generic-service + service_description: Logmon_glance-error + check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_keystone: + use: generic-service + service_description: Logmon_keystone-error + check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_nova: + use: generic-service + service_description: Logmon_nova-error + check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_neutron: + use: generic-service + service_description: Logmon_neutron-error + check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_cinder: + use: generic-service + service_description: Logmon_cinder-error + check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_heat: + use: generic-service + service_description: Logmon_heat-error + check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd + hostgroup_name: prometheus-hosts + - check_logmon_horizon: + use: 
generic-service + service_description: Logmon_horizon-error + check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd + hostgroup_name: prometheus-hosts - check_filespace_mounts-usage-rate-fullin4hrs: use: notifying_service hostgroup_name: base-os diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 1c47081ef..249255662 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -1185,6 +1185,14 @@ conf: annotations: description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' summary: Many Kubernetes nodes are Not Ready + - alert: K8SNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 0 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} nodes are notReady state.' + summary: One or more Kubernetes nodes are Not Ready - alert: K8SKubeletDown expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 for: 1h @@ -1296,7 +1304,7 @@ conf: annotations: description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' summary: '{{$labels.statefulset}}: has inssuficient replicas.' 
- - alert: kube_daemonsets_misscheduled + - alert: daemonsets_misscheduled expr: kube_daemonset_status_number_misscheduled > 0 for: 10m labels: @@ -1304,7 +1312,7 @@ conf: annotations: description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' summary: 'Daemonsets not scheduled correctly' - - alert: kube_daemonsets_not_scheduled + - alert: daemonsets_not_scheduled expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 for: 10m labels: @@ -1312,7 +1320,7 @@ conf: annotations: description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number' summary: 'Less than desired number of daemonsets scheduled' - - alert: kube_deployment_replicas_unavailable + - alert: deployment_replicas_unavailable expr: kube_deployment_status_replicas_unavailable > 0 for: 10m labels: @@ -1320,7 +1328,7 @@ conf: annotations: description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' summary: '{{$labels.deployment}}: has inssuficient replicas.' - - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable + - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 for: 10m labels: @@ -1328,7 +1336,7 @@ conf: annotations: description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update' summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.' 
- - alert: kube_job_status_failed + - alert: job_status_failed expr: kube_job_status_failed > 0 for: 10m labels: @@ -1336,7 +1344,7 @@ conf: annotations: description: 'Job {{$labels.exported_job}} is in failed status' summary: '{{$labels.exported_job}} has failed status' - - alert: kube_pod_status_pending + - alert: pod_status_pending expr: kube_pod_status_phase{phase="Pending"} == 1 for: 10m labels: @@ -1344,7 +1352,7 @@ conf: annotations: description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' - - alert: kube_pod_error_image_pull + - alert: pod_error_image_pull expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 for: 10m labels: @@ -1352,7 +1360,7 @@ conf: annotations: description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: kube_pod_status_error_image_pull + - alert: pod_status_error_image_pull expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 for: 10m labels: @@ -1360,7 +1368,15 @@ conf: annotations: description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: kube_replicaset_missing_replicas + - alert: pod_error_crash_loop_back_off + expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: replicaset_missing_replicas expr: kube_replicaset_spec_replicas - 
kube_replicaset_status_ready_replicas > 0 for: 10m labels: @@ -1368,7 +1384,7 @@ conf: annotations: description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' summary: 'Replicaset {{$labels.replicaset}} is missing replicas' - - alert: kube_pod_container_terminated + - alert: pod_container_terminated expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 for: 10m labels: @@ -1618,7 +1634,7 @@ conf: - name: openstack.rules rules: - alert: os_glance_api_availability - expr: check_glance_api != 1 + expr: openstack_check_glance_api != 1 for: 5m labels: severity: page @@ -1626,7 +1642,7 @@ conf: description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' summary: 'Glance API is not available at {{$labels.url}}' - alert: os_nova_api_availability - expr: check_nova_api != 1 + expr: openstack_check_nova_api != 1 for: 5m labels: severity: page @@ -1634,7 +1650,7 @@ conf: description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' summary: 'Nova API is not available at {{$labels.url}}' - alert: os_keystone_api_availability - expr: check_keystone_api != 1 + expr: openstack_check_keystone_api != 1 for: 5m labels: severity: page @@ -1642,15 +1658,47 @@ conf: description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' summary: 'Keystone API is not available at {{$labels.url}}' - alert: os_neutron_api_availability - expr: check_neutron_api != 1 + expr: openstack_check_neutron_api != 1 for: 5m labels: severity: page annotations: description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' summary: 'Neutron API is not available at {{$labels.url}}' + - alert: os_neutron_metadata_agent_availability + expr: openstack_services_neutron_metadata_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron metadata_agents are not available for 
more than 5 minutes' + summary: 'One or more neutron metadata_agents are not available' + - alert: os_neutron_openvswitch_agent_availability + expr: openstack_services_neutron_openvswitch_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron openvswitch agents are not available for more than 5 minutes' + summary: 'One or more neutron openvswitch agents are not available' + - alert: os_neutron_dhcp_agent_availability + expr: openstack_services_neutron_dhcp_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron dhcp agents are not available for more than 5 minutes' + summary: 'One or more neutron dhcp agents are not available' + - alert: os_neutron_l3_agent_availability + expr: openstack_services_neutron_l3_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron L3 agents are not available for more than 5 minutes' + summary: 'One or more neutron L3 agents are not available' - alert: os_swift_api_availability - expr: check_swift_api != 1 + expr: openstack_check_swift_api != 1 for: 5m labels: severity: page @@ -1673,8 +1721,16 @@ conf: annotations: description: 'Cinder scheduler is not available for more than 5 minutes' summary: 'Cinder scheduler is not available' + - alert: os_heat_api_availability + expr: openstack_check_heat_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Heat API is not available at {{$labels.url}}' - alert: os_nova_compute_disabled - expr: services_nova_compute_disabled_total > 0 + expr: openstack_services_nova_compute_disabled_total > 0 for: 5m labels: severity: page @@ -1682,7 +1738,7 @@ conf: description: 'nova-compute is disabled on certain hosts for more than 5 minutes' summary: 'Openstack compute service nova-compute is disabled on some hosts' - alert: 
os_nova_conductor_disabled - expr: services_nova_conductor_disabled_total > 0 + expr: openstack_services_nova_conductor_disabled_total > 0 for: 5m labels: severity: page @@ -1690,7 +1746,7 @@ conf: description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' summary: 'Openstack compute service nova-conductor is disabled on some hosts' - alert: os_nova_consoleauth_disabled - expr: services_nova_consoleauth_disabled_total > 0 + expr: openstack_services_nova_consoleauth_disabled_total > 0 for: 5m labels: severity: page @@ -1698,13 +1754,69 @@ conf: description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' - alert: os_nova_scheduler_disabled - expr: services_nova_scheduler_disabled_total > 0 + expr: openstack_services_nova_scheduler_disabled_total > 0 for: 5m labels: severity: page annotations: description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' summary: 'Openstack compute service nova-scheduler is disabled on some hosts' + - alert: os_nova_compute_down + expr: openstack_services_nova_compute_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is down on some hosts' + - alert: os_nova_conductor_down + expr: openstack_services_nova_conductor_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is down on some hosts' + - alert: os_nova_consoleauth_down + expr: openstack_services_nova_consoleauth_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is down on some hosts' + - 
alert: os_nova_scheduler_down + expr: openstack_services_nova_scheduler_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler is down on some hosts' + - alert: os_vm_vcpu_usage_high + expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM vcpu usage is high at {{$value}} percent' + summary: 'Openstack VM vcpu usage is high' + - alert: os_vm_ram_usage_high + expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM RAM usage is high at {{$value}} percent' + summary: 'Openstack VM RAM usage is high' + - alert: os_vm_disk_usage_high + expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM Disk usage is high at {{$value}} percent' + summary: 'Openstack VM Disk usage is high' ceph: groups: - name: ceph.rules @@ -1989,3 +2101,107 @@ conf: annotations: description: 'The mysql innodb replication has fallen behind and is not recovering' summary: 'MySQL innodb replication is lagging' + postgresql: + groups: + - name: postgresql.rules + rules: + - alert: pg_replication_fallen_behind + expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1) + for: 5m + labels: + severity: warning + annotations: + description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }} + title: Postgres Replication lag is over 2 minutes + - alert: pg_connections_too_high + expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95 + for: 5m + labels: + severity: 
warn + channel: database + annotations: + title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum + - alert: pg_deadlocks_detected + expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0 + for: 5m + labels: + severity: warn + annotations: + description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}} + title: Postgres server is experiencing deadlocks + prometheus_exporters: + groups: + - name: prometheus_exporters.rules + rules: + - alert: prom_exporter_ceph_unavailable + expr: absent(ceph_health_status) + for: 10m + labels: + severity: warning + annotations: + description: Ceph exporter is not collecting metrics or is not available for past 10 minutes + title: Ceph exporter is not collecting metrics or is not available + - alert: prom_exporter_openstack_unavailable + expr: absent(openstack_exporter_cache_refresh_duration_seconds) + for: 10m + labels: + severity: warning + annotations: + description: Openstack exporter is not collecting metrics or is not available for past 10 minutes + title: Openstack exporter is not collecting metrics or is not available + - alert: prom_exporter_mariadb_unavailable + expr: absent(mysql_up) + for: 10m + labels: + severity: warning + annotations: + description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes + title: MariaDB exporter is not collecting metrics or is not available + - alert: prom_exporter_kube_state_metrics_unavailable + expr: absent(kube_node_info) + for: 10m + labels: + severity: warning + annotations: + description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes + title: kube-state-metrics exporter is not collecting metrics or is not available + - alert: prom_exporter_postgresql_unavailable + expr: absent(pg_static) + for: 10m + labels: + severity: warning + annotations: + description: postgresql exporter is not collecting 
metrics or is not available for past 10 minutes + title: postgresql exporter is not collecting metrics or is not available + - alert: prom_exporter_node_unavailable + expr: absent(node_uname_info) + for: 10m + labels: + severity: warning + annotations: + description: node exporter is not collecting metrics or is not available for past 10 minutes + title: node exporter is not collecting metrics or is not available + - alert: prom_exporter_calico_unavailable + expr: absent(felix_host) + for: 10m + labels: + severity: warning + annotations: + description: Calico exporter is not collecting metrics or is not available for past 10 minutes + title: Calico exporter is not collecting metrics or is not available + - alert: prom_exporter_elasticsearch_unavailable + expr: absent(elasticsearch_cluster_health_status) + for: 10m + labels: + severity: warning + annotations: + description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes + title: Elasticsearch exporter is not collecting metrics or is not available + - alert: prom_exporter_fluentd_unavailable + expr: absent(fluentd_up) + for: 10m + labels: + severity: warning + annotations: + description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes + title: Fluentd exporter is not collecting metrics or is not available