Monitor postgresql, Openstack virt resources, api, logs, pod and nodes status

Fixing openstack API monitors. Adding additional neutron services monitors. Adding new Pod CrashLoopBackOff status check. Adding new Host readiness check. Updated the nagios image reference (https://review.gerrithub.io/c/att-comdev/nagios/+/420590 - Pending). This updated image provides a mechanism for querying Elasticsearch with the goal of triggering alerts based on specified applications and log levels. Finally, this moves the endpoints resulting from the authenticated endpoint lookups required for Nagios to the nagios secret instead of being handled via plain text environment variables. Change-Id: I517d8e6e6e8fa1d359382be8a131a8e45bf243e2
2018-07-03 20:19:56 +00:00 · 2018-07-03 20:19:56 +00:00 · db0d653b4d
commit db0d653b4d
parent f2271a60a6
4 changed files with 452 additions and 34 deletions
--- a/nagios/templates/deployment.yaml
+++ b/nagios/templates/deployment.yaml
@ -129,8 +129,6 @@ spec:
            - name: nagios
              containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
          env:
-            - name: PROMETHEUS_SERVICE
-              value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
            - name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
              value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
            - name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@ -139,6 +137,16 @@ spec:
              value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
            - name: REST_NOTIF_SECONDARY_TARGET_URL
              value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
+            - name: PROMETHEUS_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: PROMETHEUS_SERVICE
+            - name: ELASTICSEARCH_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: ELASTICSEARCH_SERVICE
            - name: NAGIOSADMIN_USER
              valueFrom:
                secretKeyRef:
--- a/nagios/templates/secret-nagios.yaml
+++ b/nagios/templates/secret-nagios.yaml
@ -17,6 +17,8 @@ limitations under the License.
 {{- if .Values.manifests.secret_nagios }}
 {{- $envAll := . }}
 {{- $secretName := index $envAll.Values.secrets.nagios.admin }}
+{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
+{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
 ---
 apiVersion: v1
 kind: Secret
@ -28,4 +30,6 @@ data:
  NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
  BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
  BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
+  PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
+  ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
 {{- end }}
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@ -19,7 +19,7 @@
 images:
  tags:
    apache_proxy: docker.io/httpd:2.4
-    nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
+    nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
@ -137,6 +137,24 @@ endpoints:
    port:
      ldap:
        default: 389
+  elasticsearch:
+    name: elasticsearch
+    namespace: null
+    auth:
+      admin:
+        username: admin
+        password: changeme
+    hosts:
+      default: elasticsearch-logging
+    host_fqdn_override:
+      default: null
+    path:
+      default: /
+    scheme:
+      default: http
+    port:
+      http:
+        default: 80

 network:
  nagios:
@ -292,7 +310,7 @@ conf:
          AuthUserFile /usr/local/apache2/conf/.htpasswd
          AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
          AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
-          AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
+          AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
          Require valid-user
      </Proxy>
    </VirtualHost>
@ -356,10 +374,10 @@ conf:
          command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
      - send_service_http_post:
          command_name: send_service_http_post
-          command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+          command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
      - send_host_http_post:
          command_name: send_host_http_post
-          command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+          command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
      - check_prometheus_host_alive:
          command_name: check-prometheus-host-alive
          command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@ -369,6 +387,9 @@ conf:
      - check_prom_alert:
          command_name: check_prom_alert
          command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
+      - check_es_alert:
+          command_name: check_es_alert
+          command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
      - check_filespace_mounts-usage-rate-fullin4hrs:
          command_name: check_filespace_mounts-usage-rate-fullin4hrs
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@ -432,6 +453,9 @@ conf:
      - check_ceph_health:
          command_name: check_ceph_health
          command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
+      - check_prometheus_hosts:
+          command_name: check_prometheus_hosts
+          command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
    services:
      - notifying_service:
          name: notifying_service
@ -449,6 +473,12 @@ conf:
          service_description: "CEPH_health"
          check_command: check_ceph_health
          check_interval: 60
+      - check_hosts_health:
+          use: generic-service
+          hostgroup_name: prometheus-hosts
+          service_description: "Nodes_health"
+          check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.!OK- All nodes are ready.
+          check_interval: 60
      - check_prometheus_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
@ -515,6 +545,12 @@ conf:
          service_description: "Pod_status-error-image-pull"
          check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
          check_interval: 60
+      - check_pod_error_crash_loop_back_off:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-crashLoopBackOff"
+          check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
+          check_interval: 60
      - check_replicaset_missing_replicas:
          use: notifying_service
          hostgroup_name: prometheus-hosts
@ -531,31 +567,66 @@ conf:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_glance"
-          check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
+          check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
          check_interval: 60
      - check_nova_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_nova"
-          check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
+          check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
          check_interval: 60
      - check_keystone_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_keystone"
-          check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
+          check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
          check_interval: 60
      - check_neutron_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_neutron"
-          check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+          check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+          check_interval: 60
+      - check_neutron_metadata_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-metadata-agent"
+          check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
+          check_interval: 60
+      - check_neutron_openvswitch_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-openvswitch-agent"
+          check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
+          check_interval: 60
+      - check_neutron_dhcp_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-dhcp-agent"
+          check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
+          check_interval: 60
+      - check_neutron_l3_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-l3-agent"
+          check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
          check_interval: 60
      - check_swift_api:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "API_swift"
-          check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+          check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+          check_interval: 60
+      - check_cinder_api:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "API_cinder"
+          check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
+      - check_heat_api:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "API_heat"
+          check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
          check_interval: 60
      - check_cinder_api:
          use: notifying_service
@ -573,25 +644,43 @@ conf:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-compute"
-          check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
          check_interval: 60
      - check_service_nova_conductor:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-conductor"
-          check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
          check_interval: 60
      - check_service_nova_consoleauth:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-consoleauth"
-          check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
          check_interval: 60
      - check_service_nova_scheduler:
          use: notifying_service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-scheduler"
-          check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
+          check_command: check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
+          check_interval: 60
+      - check_os_vm_vcpu_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_VCPU-usage"
+          check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
+          check_interval: 60
+      - check_os_vm_ram_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_RAM-usage"
+          check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
+          check_interval: 60
+      - check_os_vm_disk_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_Disk-usage"
+          check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
          check_interval: 60
      - check_ceph_monitor_quorum:
          use: notifying_service
@ -777,6 +866,107 @@ conf:
          service_description: Mariadb_innodb-replication-lag
          check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
          hostgroup_name: prometheus-hosts
+      - check_prometheus_hosts:
+          use: notifying_service
+          service_description: Prometheus_hosts-update
+          check_command: check_prometheus_hosts
+          hostgroup_name: prometheus-hosts
+          check_interval: 900
+      - check_postgresql_replication_lag:
+          use: generic-service
+          service_description: Postgresql_replication-lag
+          check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
+          hostgroup_name: prometheus-hosts
+      - check_postgresql_connections:
+          use: generic-service
+          service_description: Postgresql_connections
+          check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
+          hostgroup_name: prometheus-hosts
+      - check_postgresql_deadlocks:
+          use: generic-service
+          service_description: Postgresql_deadlocks
+          check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_ceph:
+          use: generic-service
+          service_description: Prometheus-exporter_CEPH
+          check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_openstack:
+          use: generic-service
+          service_description: Prometheus-exporter_Openstack
+          check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_mariadb:
+          use: generic-service
+          service_description: Prometheus-exporter_MariaDB
+          check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_kube_state_metrics:
+          use: generic-service
+          service_description: Prometheus-exporter_Kube-state-metrics
+          check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_postgresql:
+          use: generic-service
+          service_description: Prometheus-exporter_Postgresql
+          check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_node:
+          use: generic-service
+          service_description: Prometheus-exporter_Node
+          check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_calico:
+          use: generic-service
+          service_description: Prometheus-exporter_Calico
+          check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_elasticsearch:
+          use: generic-service
+          service_description: Prometheus-exporter_Elasticsearch
+          check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_fluentd:
+          use: generic-service
+          service_description: Prometheus-exporter_Fluentd
+          check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_logmon_glance:
+          use: generic-service
+          service_description: Logmon_glance-error
+          check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_keystone:
+          use: generic-service
+          service_description: Logmon_keystone-error
+          check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_nova:
+          use: generic-service
+          service_description: Logmon_nova-error
+          check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_neutron:
+          use: generic-service
+          service_description: Logmon_neutron-error
+          check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_cinder:
+          use: generic-service
+          service_description: Logmon_cinder-error
+          check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_heat:
+          use: generic-service
+          service_description: Logmon_heat-error
+          check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_horizon:
+          use: generic-service
+          service_description: Logmon_horizon-error
+          check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
+          hostgroup_name: prometheus-hosts
      - check_filespace_mounts-usage-rate-fullin4hrs:
          use: notifying_service
          hostgroup_name: base-os
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@ -1185,6 +1185,14 @@ conf:
            annotations:
              description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
              summary: Many Kubernetes nodes are Not Ready
+          - alert: K8SNodesNotReady
+            expr: count(kube_node_status_ready{condition="true"} == 0) > 0
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              description: '{{ $value }} nodes are in the NotReady state.'
+              summary: One or more Kubernetes nodes are Not Ready
          - alert: K8SKubeletDown
            expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
            for: 1h
@ -1296,7 +1304,7 @@ conf:
            annotations:
              description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
              summary: '{{$labels.statefulset}}: has inssuficient replicas.'
-          - alert: kube_daemonsets_misscheduled
+          - alert: daemonsets_misscheduled
            expr: kube_daemonset_status_number_misscheduled > 0
            for: 10m
            labels:
@ -1304,7 +1312,7 @@ conf:
            annotations:
              description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
              summary: 'Daemonsets not scheduled correctly'
-          - alert: kube_daemonsets_not_scheduled
+          - alert: daemonsets_not_scheduled
            expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
            for: 10m
            labels:
@ -1312,7 +1320,7 @@ conf:
            annotations:
              description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
              summary: 'Less than desired number of daemonsets scheduled'
-          - alert: kube_deployment_replicas_unavailable
+          - alert: deployment_replicas_unavailable
            expr: kube_deployment_status_replicas_unavailable > 0
            for: 10m
            labels:
@ -1320,7 +1328,7 @@ conf:
            annotations:
              description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
              summary: '{{$labels.deployment}}: has inssuficient replicas.'
-          - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
+          - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
            expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
            for: 10m
            labels:
@ -1328,7 +1336,7 @@ conf:
            annotations:
              description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
              summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
-          - alert: kube_job_status_failed
+          - alert: job_status_failed
            expr: kube_job_status_failed > 0
            for: 10m
            labels:
@ -1336,7 +1344,7 @@ conf:
            annotations:
              description: 'Job {{$labels.exported_job}} is in failed status'
              summary: '{{$labels.exported_job}} has failed status'
-          - alert: kube_pod_status_pending
+          - alert: pod_status_pending
            expr: kube_pod_status_phase{phase="Pending"} == 1
            for: 10m
            labels:
@ -1344,7 +1352,7 @@ conf:
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
-          - alert: kube_pod_error_image_pull
+          - alert: pod_error_image_pull
            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
            for: 10m
            labels:
@ -1352,7 +1360,7 @@ conf:
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: kube_pod_status_error_image_pull
+          - alert: pod_status_error_image_pull
            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
            for: 10m
            labels:
@ -1360,7 +1368,15 @@ conf:
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: kube_replicaset_missing_replicas
+          - alert: pod_error_crash_loop_back_off
+            expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
+              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: replicaset_missing_replicas
            expr:  kube_replicaset_spec_replicas -  kube_replicaset_status_ready_replicas > 0
            for: 10m
            labels:
@ -1368,7 +1384,7 @@ conf:
            annotations:
              description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
              summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
-          - alert: kube_pod_container_terminated
+          - alert: pod_container_terminated
            expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
            for: 10m
            labels:
@ -1618,7 +1634,7 @@ conf:
        - name: openstack.rules
          rules:
          - alert: os_glance_api_availability
-            expr:  check_glance_api != 1
+            expr:  openstack_check_glance_api != 1
            for: 5m
            labels:
              severity: page
@ -1626,7 +1642,7 @@ conf:
              description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Glance API is not available at {{$labels.url}}'
          - alert: os_nova_api_availability
-            expr:  check_nova_api != 1
+            expr:  openstack_check_nova_api != 1
            for: 5m
            labels:
              severity: page
@ -1634,7 +1650,7 @@ conf:
              description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Nova API is not available at {{$labels.url}}'
          - alert: os_keystone_api_availability
-            expr:  check_keystone_api != 1
+            expr:  openstack_check_keystone_api != 1
            for: 5m
            labels:
              severity: page
@ -1642,15 +1658,47 @@ conf:
              description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Keystone API is not available at {{$labels.url}}'
          - alert: os_neutron_api_availability
-            expr:  check_neutron_api != 1
+            expr:  openstack_check_neutron_api != 1  # same check, metric name now carries the openstack_ prefix
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Neutron API is not available at {{$labels.url}}'
+          # Fires (severity: page) once openstack_services_neutron_metadata_agent_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_neutron_metadata_agent_availability
+            expr: openstack_services_neutron_metadata_agent_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
+              summary: 'One or more neutron metadata_agents are not available'
+          # Fires (severity: page) once openstack_services_neutron_openvswitch_agent_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_neutron_openvswitch_agent_availability
+            expr: openstack_services_neutron_openvswitch_agent_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
+              summary: 'One or more neutron openvswitch agents are not available'
+          # Fires (severity: page) once openstack_services_neutron_dhcp_agent_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_neutron_dhcp_agent_availability
+            expr: openstack_services_neutron_dhcp_agent_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
+              summary: 'One or more neutron dhcp agents are not available'
+          # Fires (severity: page) once openstack_services_neutron_l3_agent_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_neutron_l3_agent_availability
+            expr: openstack_services_neutron_l3_agent_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron L3 agents are not available for more than 5 minutes'
+              summary: 'One or more neutron L3 agents are not available'
          - alert: os_swift_api_availability
-            expr:  check_swift_api != 1
+            expr:  openstack_check_swift_api != 1
            for: 5m
            labels:
              severity: page
@ -1673,8 +1721,16 @@ conf:
            annotations:
              description: 'Cinder scheduler is not available for more than 5 minutes'
              summary: 'Cinder scheduler is not available'
+          # Fires (severity: page) once openstack_check_heat_api has been anything
+          # other than 1 for 5 minutes; the annotations report the url label.
+          - alert: os_heat_api_availability
+            expr: openstack_check_heat_api != 1
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
+              summary: 'Heat API is not available at {{$labels.url}}'
          - alert: os_nova_compute_disabled
-            expr:  services_nova_compute_disabled_total > 0
+            expr:  openstack_services_nova_compute_disabled_total > 0
            for: 5m
            labels:
              severity: page
@ -1682,7 +1738,7 @@ conf:
              description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-compute is disabled on some hosts'
          - alert: os_nova_conductor_disabled
-            expr:  services_nova_conductor_disabled_total > 0
+            expr:  openstack_services_nova_conductor_disabled_total > 0
            for: 5m
            labels:
              severity: page
@ -1690,7 +1746,7 @@ conf:
              description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-conductor is disabled on some hosts'
          - alert: os_nova_consoleauth_disabled
-            expr:  services_nova_consoleauth_disabled_total > 0
+            expr:  openstack_services_nova_consoleauth_disabled_total > 0
            for: 5m
            labels:
              severity: page
@ -1698,13 +1754,69 @@ conf:
              description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
          - alert: os_nova_scheduler_disabled
-            expr:  services_nova_scheduler_disabled_total > 0
+            expr:  openstack_services_nova_scheduler_disabled_total > 0  # same check, metric name now carries the openstack_ prefix
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
+          # Fires (severity: page) once openstack_services_nova_compute_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_nova_compute_down
+            expr: openstack_services_nova_compute_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-compute is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-compute is down on some hosts'
+          # Fires (severity: page) once openstack_services_nova_conductor_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_nova_conductor_down
+            expr: openstack_services_nova_conductor_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-conductor is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-conductor is down on some hosts'
+          # Fires (severity: page) once openstack_services_nova_consoleauth_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_nova_consoleauth_down
+            expr: openstack_services_nova_consoleauth_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-consoleauth is down on some hosts'
+          # Fires (severity: page) once openstack_services_nova_scheduler_down_total
+          # has stayed above 0 for 5 minutes.
+          - alert: os_nova_scheduler_down
+            expr: openstack_services_nova_scheduler_down_total > 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-scheduler is down on some hosts'
+          # Used vCPUs as a percentage of used + free; fires (severity: page)
+          # once that percentage has stayed above 80 for 5 minutes.
+          - alert: os_vm_vcpu_usage_high
+            expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM vcpu usage is high at {{$value}} percent'
+              summary: 'Openstack VM vcpu usage is high'
+          # Used RAM as a percentage of used + free; fires (severity: page)
+          # once that percentage has stayed above 80 for 5 minutes.
+          - alert: os_vm_ram_usage_high
+            expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM RAM usage is high at {{$value}} percent'
+              summary: 'Openstack VM RAM usage is high'
+          # Used disk as a percentage of used + free; fires (severity: page)
+          # once that percentage has stayed above 80 for 5 minutes.
+          - alert: os_vm_disk_usage_high
+            expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM Disk usage is high at {{$value}} percent'
+              summary: 'Openstack VM Disk usage is high'
      ceph:
        groups:
        - name: ceph.rules
@ -1989,3 +2101,107 @@ conf:
            annotations:
              description: 'The mysql innodb replication has fallen behind and is not recovering'
              summary: 'MySQL innodb replication is lagging'
+      postgresql:
+        groups:
+        - name: postgresql.rules
+          rules:
+          # Fires when an instance that reports pg_replication_is_replica == 1 has
+          # had pg_replication_lag above 120 for 5 minutes.
+          # NOTE(review): annotations in this group use `title` where the openstack
+          # rules use `summary` -- confirm which key the notification templates read.
+          - alert: pg_replication_fallen_behind
+            expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica ==  1)
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}'
+              title: 'Postgres Replication lag is over 2 minutes'
+          # Compares pg_stat_activity_count summed per (environment, fqdn) with 95%
+          # of pg_settings_max_connections for the same fqdn.
+          - alert: pg_connections_too_high
+            expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
+            for: 5m
+            labels:
+              # `warning` matches the severity value used by the other warning-level
+              # rules in this file.
+              severity: warning
+              channel: database
+            annotations:
+              title: 'Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum'
+          # `instance` is kept in the aggregation because the description renders
+          # $labels.instance; a label dropped by sum() would render as empty text.
+          - alert: pg_deadlocks_detected
+            expr: sum by(instance, datname) (rate(pg_stat_database_deadlocks[1m])) > 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: 'postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}'
+              title: 'Postgres server is experiencing deadlocks'
+      # Exporter liveness rules. Each one uses absent(<metric>) with `for: 10m`, so
+      # it fires when no series of the named metric has existed for 10 minutes;
+      # the annotations treat that as the exporter being down or not collecting.
+      prometheus_exporters:
+        groups:
+        - name: prometheus_exporters.rules
+          rules:
+          - alert: prom_exporter_ceph_unavailable
+            expr: absent(ceph_health_status)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Ceph exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'Ceph exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_openstack_unavailable
+            expr: absent(openstack_exporter_cache_refresh_duration_seconds)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Openstack exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'Openstack exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_mariadb_unavailable
+            expr: absent(mysql_up)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'MariaDB exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'MariaDB exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_kube_state_metrics_unavailable
+            expr: absent(kube_node_info)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'kube-state-metrics exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_postgresql_unavailable
+            expr: absent(pg_static)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'postgresql exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'postgresql exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_node_unavailable
+            expr: absent(node_uname_info)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'node exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'node exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_calico_unavailable
+            expr: absent(felix_host)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Calico exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'Calico exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_elasticsearch_unavailable
+            expr: absent(elasticsearch_cluster_health_status)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'Elasticsearch exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_fluentd_unavailable
+            expr: absent(fluentd_up)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Fluentd exporter is not collecting metrics or is not available for past 10 minutes'
+              title: 'Fluentd exporter is not collecting metrics or is not available'