Merge "Monitor postgresql, Openstack virt resources, api, logs, pod and nodes status"

Zuul 2018-09-21 12:12:00 +00:00 committed by Gerrit Code Review
commit 4cd00f3ac5
4 changed files with 452 additions and 34 deletions

View File

@ -129,8 +129,6 @@ spec:
- name: nagios
containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
env:
- name: PROMETHEUS_SERVICE
value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
- name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
- name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@ -139,6 +137,16 @@ spec:
value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
- name: REST_NOTIF_SECONDARY_TARGET_URL
value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
- name: PROMETHEUS_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: PROMETHEUS_SERVICE
- name: ELASTICSEARCH_SERVICE
valueFrom:
secretKeyRef:
name: {{ $nagiosUserSecret }}
key: ELASTICSEARCH_SERVICE
- name: NAGIOSADMIN_USER
valueFrom:
secretKeyRef:

View File

@ -17,6 +17,8 @@ limitations under the License.
{{- if .Values.manifests.secret_nagios }}
{{- $envAll := . }}
{{- $secretName := index $envAll.Values.secrets.nagios.admin }}
{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
---
apiVersion: v1
kind: Secret
@ -28,4 +30,6 @@ data:
NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
{{- end }}
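For orientation: a minimal sketch (not the chart's actual check_rest_get_api.py plugin) of how a Nagios-side script can consume the PROMETHEUS_SERVICE value that now reaches the pod through this secret instead of a plain env value; the 5 s warning / 10 s critical thresholds mirror the check-prometheus-host-alive command defined in values.yaml below.

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's check_rest_get_api.py plugin.
# Reads the PROMETHEUS_SERVICE env var injected from the secret above and
# turns a simple GET into a Nagios exit code.
import os
import sys
import time

import requests

OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3


def main():
    url = os.environ.get("PROMETHEUS_SERVICE")  # authenticated endpoint URI from the secret
    if not url:
        print("UNKNOWN: PROMETHEUS_SERVICE is not set")
        return UNKNOWN
    start = time.time()
    try:
        resp = requests.get(url, timeout=10)
        elapsed = time.time() - start
    except requests.RequestException as exc:
        print("CRITICAL: {} unreachable: {}".format(url, exc))
        return CRITICAL
    if resp.status_code >= 500 or elapsed > 10:
        print("CRITICAL: {} answered {} in {:.1f}s".format(url, resp.status_code, elapsed))
        return CRITICAL
    if elapsed > 5:
        print("WARNING: {} answered in {:.1f}s".format(url, elapsed))
        return WARNING
    print("OK: {} answered {} in {:.1f}s".format(url, resp.status_code, elapsed))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```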

View File

@ -19,7 +19,7 @@
images:
tags:
apache_proxy: docker.io/httpd:2.4
nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
@ -137,6 +137,24 @@ endpoints:
port:
ldap:
default: 389
elasticsearch:
name: elasticsearch
namespace: null
auth:
admin:
username: admin
password: changeme
hosts:
default: elasticsearch-logging
host_fqdn_override:
default: null
path:
default: /
scheme:
default: http
port:
http:
default: 80
network:
nagios:
@ -292,7 +310,7 @@ conf:
AuthUserFile /usr/local/apache2/conf/.htpasswd
AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
Require valid-user
</Proxy>
</VirtualHost>
@ -356,10 +374,10 @@ conf:
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
- send_service_http_post:
command_name: send_service_http_post
command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- send_host_http_post:
command_name: send_host_http_post
command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
- check_prometheus_host_alive:
command_name: check-prometheus-host-alive
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@ -369,6 +387,9 @@ conf:
- check_prom_alert:
command_name: check_prom_alert
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
- check_es_alert:
command_name: check_es_alert
command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
- check_filespace_mounts-usage-rate-fullin4hrs:
command_name: check_filespace_mounts-usage-rate-fullin4hrs
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@ -432,6 +453,9 @@ conf:
- check_ceph_health:
command_name: check_ceph_health
command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
- check_prometheus_hosts:
command_name: check_prometheus_hosts
command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
services:
- notifying_service:
name: notifying_service
@ -449,6 +473,12 @@ conf:
service_description: "CEPH_health"
check_command: check_ceph_health
check_interval: 60
- check_hosts_health:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Nodes_health"
check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.!OK- All nodes are ready
check_interval: 60
- check_prometheus_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -515,6 +545,12 @@ conf:
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
- check_pod_error_crash_loop_back_off:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-crashLoopBackOff"
check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
check_interval: 60
- check_replicaset_missing_replicas:
use: notifying_service
hostgroup_name: prometheus-hosts
@ -531,31 +567,66 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_glance"
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_interval: 60
- check_nova_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_nova"
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_interval: 60
- check_keystone_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_keystone"
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_interval: 60
- check_neutron_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_neutron"
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_interval: 60
- check_neutron_metadata_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-metadata-agent"
check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
check_interval: 60
- check_neutron_openvswitch_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-openvswitch-agent"
check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
check_interval: 60
- check_neutron_dhcp_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-dhcp-agent"
check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
check_interval: 60
- check_neutron_l3_agent:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_neutron-l3-agent"
check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
check_interval: 60
- check_swift_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_swift"
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_cinder"
check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
- check_heat_api:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "API_heat"
check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
check_interval: 60
- check_cinder_api:
use: notifying_service
@ -573,25 +644,43 @@ conf:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-compute"
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
check_interval: 60
- check_service_nova_conductor:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-conductor"
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
check_interval: 60
- check_service_nova_consoleauth:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-consoleauth"
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
check_interval: 60
- check_service_nova_scheduler:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-scheduler"
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
check_command: check_prom_alert!os_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
check_interval: 60
- check_os_vm_vcpu_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_VCPU-usage"
check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_ram_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_RAM-usage"
check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
check_interval: 60
- check_os_vm_disk_usage:
use: notifying_service
hostgroup_name: prometheus-hosts
service_description: "OS-Total-Quota_Disk-usage"
check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
check_interval: 60
- check_ceph_monitor_quorum:
use: notifying_service
@ -777,6 +866,107 @@ conf:
service_description: Mariadb_innodb-replication-lag
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_prometheus_hosts:
use: notifying_service
service_description: Prometheus_hosts-update
check_command: check_prometheus_hosts
hostgroup_name: prometheus-hosts
check_interval: 900
- check_postgresql_replication_lag:
use: generic-service
service_description: Postgresql_replication-lag
check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
hostgroup_name: prometheus-hosts
- check_postgresql_connections:
use: generic-service
service_description: Postgresql_connections
check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
hostgroup_name: prometheus-hosts
- check_postgresql_deadlocks:
use: generic-service
service_description: Postgresql_deadlocks
check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
hostgroup_name: prometheus-hosts
- check_prom_exporter_ceph:
use: generic-service
service_description: Prometheus-exporter_CEPH
check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_openstack:
use: generic-service
service_description: Prometheus-exporter_Openstack
check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_mariadb:
use: generic-service
service_description: Prometheus-exporter_MariaDB
check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_kube_state_metrics:
use: generic-service
service_description: Prometheus-exporter_Kube-state-metrics
check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_postgresql:
use: generic-service
service_description: Prometheus-exporter_Postgresql
check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_node:
use: generic-service
service_description: Prometheus-exporter_Node
check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_calico:
use: generic-service
service_description: Prometheus-exporter_Calico
check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_elasticsearch:
use: generic-service
service_description: Prometheus-exporter_Elasticsearch
check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_prom_exporter_fluentd:
use: generic-service
service_description: Prometheus-exporter_Fluentd
check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
hostgroup_name: prometheus-hosts
- check_logmon_glance:
use: generic-service
service_description: Logmon_glance-error
check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_keystone:
use: generic-service
service_description: Logmon_keystone-error
check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_nova:
use: generic-service
service_description: Logmon_nova-error
check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_neutron:
use: generic-service
service_description: Logmon_neutron-error
check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_cinder:
use: generic-service
service_description: Logmon_cinder-error
check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_heat:
use: generic-service
service_description: Logmon_heat-error
check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
hostgroup_name: prometheus-hosts
- check_logmon_horizon:
use: generic-service
service_description: Logmon_horizon-error
check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
hostgroup_name: prometheus-hosts
- check_filespace_mounts-usage-rate-fullin4hrs:
use: notifying_service
hostgroup_name: base-os
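The new check_es_alert command above drives log monitoring through the ELASTICSEARCH_SERVICE URI added in the secret. A rough sketch of that pattern (not the chart's check_elasticsearch_query.py; the index pattern and field names are assumptions), using the glance arguments from the Logmon_glance-error service:

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's check_elasticsearch_query.py plugin.
# Approximates a check_es_alert run: count recent log entries at the given
# levels and go CRITICAL past a threshold. Index pattern and field names
# ("logger", "type", "level") are assumptions, not taken from the chart.
import os
import sys

import requests

OK, CRITICAL, UNKNOWN = 0, 2, 3


def count_errors(es_url, logger, range_mins, levels, es_type):
    query = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"logger": logger}},    # assumed field name
                    {"match": {"type": es_type}},     # assumed field name
                    {"terms": {"level": levels}},     # assumed field name
                    {"range": {"@timestamp": {"gte": "now-{}m".format(range_mins)}}},
                ]
            }
        }
    }
    # assumed fluentd-style index pattern
    resp = requests.get("{}/logstash-*/_count".format(es_url.rstrip("/")),
                        json=query, timeout=30)
    resp.raise_for_status()
    return resp.json()["count"]


def main():
    es_url = os.environ.get("ELASTICSEARCH_SERVICE")
    if not es_url:
        print("UNKNOWN: ELASTICSEARCH_SERVICE is not set")
        return UNKNOWN
    try:
        hits = count_errors(es_url, "glance", 15, ["CRITICAL", "ERROR"],
                            "oslo_openstack_fluentd")
    except requests.RequestException as exc:
        print("UNKNOWN: query failed: {}".format(exc))
        return UNKNOWN
    if hits > 10:
        print("CRITICAL: {} error-level glance log entries in the last 15 minutes".format(hits))
        return CRITICAL
    print("OK: {} error-level glance log entries in the last 15 minutes".format(hits))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```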

View File

@ -1185,6 +1185,14 @@ conf:
annotations:
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} nodes are in NotReady state.'
summary: One or more Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
@ -1296,7 +1304,7 @@ conf:
annotations:
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
summary: '{{$labels.statefulset}}: has insufficient replicas.'
- alert: kube_daemonsets_misscheduled
- alert: daemonsets_misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
@ -1304,7 +1312,7 @@ conf:
annotations:
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
summary: 'Daemonsets not scheduled correctly'
- alert: kube_daemonsets_not_scheduled
- alert: daemonsets_not_scheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
@ -1312,7 +1320,7 @@ conf:
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
- alert: kube_deployment_replicas_unavailable
- alert: deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
labels:
@ -1320,7 +1328,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
summary: '{{$labels.deployment}}: has insufficient replicas.'
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
- alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
for: 10m
labels:
@ -1328,7 +1336,7 @@ conf:
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
- alert: kube_job_status_failed
- alert: job_status_failed
expr: kube_job_status_failed > 0
for: 10m
labels:
@ -1336,7 +1344,7 @@ conf:
annotations:
description: 'Job {{$labels.exported_job}} is in failed status'
summary: '{{$labels.exported_job}} has failed status'
- alert: kube_pod_status_pending
- alert: pod_status_pending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
@ -1344,7 +1352,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: kube_pod_error_image_pull
- alert: pod_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1352,7 +1360,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_pod_status_error_image_pull
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
@ -1360,7 +1368,15 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_replicaset_missing_replicas
- alert: pod_error_crash_loop_back_off
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m
labels:
@ -1368,7 +1384,7 @@ conf:
annotations:
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
- alert: kube_pod_container_terminated
- alert: pod_container_terminated
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
for: 10m
labels:
@ -1618,7 +1634,7 @@ conf:
- name: openstack.rules
rules:
- alert: os_glance_api_availability
expr: check_glance_api != 1
expr: openstack_check_glance_api != 1
for: 5m
labels:
severity: page
@ -1626,7 +1642,7 @@ conf:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: check_nova_api != 1
expr: openstack_check_nova_api != 1
for: 5m
labels:
severity: page
@ -1634,7 +1650,7 @@ conf:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: check_keystone_api != 1
expr: openstack_check_keystone_api != 1
for: 5m
labels:
severity: page
@ -1642,15 +1658,47 @@ conf:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: check_neutron_api != 1
expr: openstack_check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_neutron_metadata_agent_availability
expr: openstack_services_neutron_metadata_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron metadata agents are not available for more than 5 minutes'
summary: 'One or more neutron metadata agents are not available'
- alert: os_neutron_openvswitch_agent_availability
expr: openstack_services_neutron_openvswitch_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
summary: 'One or more neutron openvswitch agents are not available'
- alert: os_neutron_dhcp_agent_availability
expr: openstack_services_neutron_dhcp_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
summary: 'One or more neutron dhcp agents are not available'
- alert: os_neutron_l3_agent_availability
expr: openstack_services_neutron_l3_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron L3 agents are not available for more than 5 minutes'
summary: 'One or more neutron L3 agents are not available'
- alert: os_swift_api_availability
expr: check_swift_api != 1
expr: openstack_check_swift_api != 1
for: 5m
labels:
severity: page
@ -1673,8 +1721,16 @@ conf:
annotations:
description: 'Cinder scheduler is not available for more than 5 minutes'
summary: 'Cinder scheduler is not available'
- alert: os_heat_api_availability
expr: openstack_check_heat_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Heat API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: services_nova_compute_disabled_total > 0
expr: openstack_services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
@ -1682,7 +1738,7 @@ conf:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: services_nova_conductor_disabled_total > 0
expr: openstack_services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
@ -1690,7 +1746,7 @@ conf:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: services_nova_consoleauth_disabled_total > 0
expr: openstack_services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
@ -1698,13 +1754,69 @@ conf:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: services_nova_scheduler_disabled_total > 0
expr: openstack_services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
- alert: os_nova_compute_down
expr: openstack_services_nova_compute_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is down on some hosts'
- alert: os_nova_conductor_down
expr: openstack_services_nova_conductor_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is down on some hosts'
- alert: os_nova_consoleauth_down
expr: openstack_services_nova_consoleauth_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is down on some hosts'
- alert: os_nova_scheduler_down
expr: openstack_services_nova_scheduler_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is down on some hosts'
- alert: os_vm_vcpu_usage_high
expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM vcpu usage is high at {{$value}} percent'
summary: 'Openstack VM vcpu usage is high'
- alert: os_vm_ram_usage_high
expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM RAM usage is high at {{$value}} percent'
summary: 'Openstack VM RAM usage is high'
- alert: os_vm_disk_usage_high
expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM Disk usage is high at {{$value}} percent'
summary: 'Openstack VM Disk usage is high'
ceph:
groups:
- name: ceph.rules
@ -1989,3 +2101,107 @@ conf:
annotations:
description: 'The mysql innodb replication has fallen behind and is not recovering'
summary: 'MySQL innodb replication is lagging'
postgresql:
groups:
- name: postgresql.rules
rules:
- alert: pg_replication_fallen_behind
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
title: Postgres Replication lag is over 2 minutes
- alert: pg_connections_too_high
expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
for: 5m
labels:
severity: warn
channel: database
annotations:
title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
- alert: pg_deadlocks_detected
expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0
for: 5m
labels:
severity: warn
annotations:
description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}
title: Postgres server is experiencing deadlocks
prometheus_exporters:
groups:
- name: prometheus_exporters.rules
rules:
- alert: prom_exporter_ceph_unavailable
expr: absent(ceph_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
title: Ceph exporter is not collecting metrics or is not available
- alert: prom_exporter_openstack_unavailable
expr: absent(openstack_exporter_cache_refresh_duration_seconds)
for: 10m
labels:
severity: warning
annotations:
description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
title: Openstack exporter is not collecting metrics or is not available
- alert: prom_exporter_mariadb_unavailable
expr: absent(mysql_up)
for: 10m
labels:
severity: warning
annotations:
description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available
- alert: prom_exporter_kube_state_metrics_unavailable
expr: absent(kube_node_info)
for: 10m
labels:
severity: warning
annotations:
description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
title: kube-state-metrics exporter is not collecting metrics or is not available
- alert: prom_exporter_postgresql_unavailable
expr: absent(pg_static)
for: 10m
labels:
severity: warning
annotations:
description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
title: postgresql exporter is not collecting metrics or is not available
- alert: prom_exporter_node_unavailable
expr: absent(node_uname_info)
for: 10m
labels:
severity: warning
annotations:
description: node exporter is not collecting metrics or is not available for past 10 minutes
title: node exporter is not collecting metrics or is not available
- alert: prom_exporter_calico_unavailable
expr: absent(felix_host)
for: 10m
labels:
severity: warning
annotations:
description: Calico exporter is not collecting metrics or is not available for past 10 minutes
title: Calico exporter is not collecting metrics or is not available
- alert: prom_exporter_elasticsearch_unavailable
expr: absent(elasticsearch_cluster_health_status)
for: 10m
labels:
severity: warning
annotations:
description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
title: Elasticsearch exporter is not collecting metrics or is not available
- alert: prom_exporter_fluentd_unavailable
expr: absent(fluentd_up)
for: 10m
labels:
severity: warning
annotations:
description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
title: Fluentd exporter is not collecting metrics or is not available
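The alert rules in this file are what the check_prom_alert services poll through Nagios. A rough sketch of that handshake (not the chart's query_prometheus_alerts.py), assuming the standard Prometheus /api/v1/alerts endpoint:

```python
#!/usr/bin/env python3
# Illustrative sketch only -- not the chart's query_prometheus_alerts.py plugin.
# Shows the check_prom_alert pattern: ask Prometheus which alerts are firing,
# match on alertname, and translate that into a Nagios exit code.
import os
import sys

import requests

OK, CRITICAL, UNKNOWN = 0, 2, 3


def firing_alerts(prometheus_api, alertname):
    # Standard Prometheus HTTP API; lists alerts currently pending or firing.
    resp = requests.get("{}/api/v1/alerts".format(prometheus_api.rstrip("/")), timeout=30)
    resp.raise_for_status()
    alerts = resp.json()["data"]["alerts"]
    return [a for a in alerts
            if a["labels"].get("alertname") == alertname and a.get("state") == "firing"]


def main():
    prometheus_api = os.environ.get("PROMETHEUS_SERVICE")
    alertname = sys.argv[1] if len(sys.argv) > 1 else "K8SNodesNotReady"
    if not prometheus_api:
        print("UNKNOWN: PROMETHEUS_SERVICE is not set")
        return UNKNOWN
    try:
        matches = firing_alerts(prometheus_api, alertname)
    except (requests.RequestException, KeyError, ValueError) as exc:
        print("UNKNOWN: could not query Prometheus: {}".format(exc))
        return UNKNOWN
    if matches:
        print("CRITICAL: {} instance(s) of alert {} firing".format(len(matches), alertname))
        return CRITICAL
    print("OK: alert {} is not firing".format(alertname))
    return OK


if __name__ == "__main__":
    sys.exit(main())
```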