From db0d653b4d2b7f37bfea205d95b9cbb3e83e14fb Mon Sep 17 00:00:00 2001
From: rakesh-patnaik <patsrakesh@gmail.com>
Date: Tue, 3 Jul 2018 20:19:56 +0000
Subject: [PATCH] Monitor postgresql, Openstack virt resources, api, logs, pod
 and nodes status

Fixing OpenStack API monitors

Adding additional neutron services monitors
Adding new Pod CrashLoopBackOff status check
Adding new Host readiness check

Updated the nagios image reference (https://review.gerrithub.io/c/att-comdev/nagios/+/420590 - Pending)

This updated image provides a mechanism for querying Elasticsearch
with the goal of triggering alerts based on specified applications
and log levels.

Finally, this moves the endpoints resulting from the authenticated
endpoint lookups required for Nagios to the nagios secret instead
of handling them via plain-text environment variables.

Change-Id: I517d8e6e6e8fa1d359382be8a131a8e45bf243e2
---
 nagios/templates/deployment.yaml    |  12 +-
 nagios/templates/secret-nagios.yaml |   4 +
 nagios/values.yaml                  | 216 +++++++++++++++++++++--
 prometheus/values.yaml              | 254 +++++++++++++++++++++++++---
 4 files changed, 452 insertions(+), 34 deletions(-)

diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml
index 09b030252..fb469192a 100644
--- a/nagios/templates/deployment.yaml
+++ b/nagios/templates/deployment.yaml
@@ -129,8 +129,6 @@ spec:
             - name: nagios
               containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
           env:
-            - name: PROMETHEUS_SERVICE
-              value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
             - name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
               value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
             - name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@@ -139,6 +137,16 @@ spec:
               value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
             - name: REST_NOTIF_SECONDARY_TARGET_URL
               value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
+            - name: PROMETHEUS_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: PROMETHEUS_SERVICE
+            - name: ELASTICSEARCH_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: ELASTICSEARCH_SERVICE
             - name: NAGIOSADMIN_USER
               valueFrom:
                 secretKeyRef:
diff --git a/nagios/templates/secret-nagios.yaml b/nagios/templates/secret-nagios.yaml
index 56155f5db..0ec0b341a 100644
--- a/nagios/templates/secret-nagios.yaml
+++ b/nagios/templates/secret-nagios.yaml
@@ -17,6 +17,8 @@ limitations under the License.
 {{- if .Values.manifests.secret_nagios }}
 {{- $envAll := . }}
 {{- $secretName := index $envAll.Values.secrets.nagios.admin }}
+{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
+{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
 ---
 apiVersion: v1
 kind: Secret
@@ -28,4 +30,6 @@ data:
   NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
   BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
   BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
+  PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
+  ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
 {{- end }}
diff --git a/nagios/values.yaml b/nagios/values.yaml
index 207cb1dff..83fd664c4 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -19,7 +19,7 @@
 images:
   tags:
     apache_proxy: docker.io/httpd:2.4
-    nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
+    nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
     dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
     image_repo_sync: docker.io/docker:17.07.0
   pull_policy: IfNotPresent
@@ -137,6 +137,24 @@ endpoints:
     port:
       ldap:
         default: 389
+  elasticsearch:
+    name: elasticsearch
+    namespace: null
+    auth:
+      admin:
+        username: admin
+        password: changeme
+    hosts:
+      default: elasticsearch-logging
+    host_fqdn_override:
+      default: null
+    path:
+      default: /
+    scheme:
+      default: http
+    port:
+      http:
+        default: 80
 
 network:
   nagios:
@@ -292,7 +310,7 @@ conf:
           AuthUserFile /usr/local/apache2/conf/.htpasswd
           AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
           AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
-          AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
+          AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
           Require valid-user
       </Proxy>
     </VirtualHost>
@@ -356,10 +374,10 @@ conf:
           command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
       - send_service_http_post:
           command_name: send_service_http_post
-          command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+          command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
       - send_host_http_post:
           command_name: send_host_http_post
-          command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+          command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
       - check_prometheus_host_alive:
           command_name: check-prometheus-host-alive
           command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@@ -369,6 +387,9 @@ conf:
       - check_prom_alert:
           command_name: check_prom_alert
           command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
+      - check_es_alert:
+          command_name: check_es_alert
+          command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
       - check_filespace_mounts-usage-rate-fullin4hrs:
           command_name: check_filespace_mounts-usage-rate-fullin4hrs
           command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@@ -432,6 +453,9 @@ conf:
       - check_ceph_health:
           command_name: check_ceph_health
           command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
+      - check_prometheus_hosts:
+          command_name: check_prometheus_hosts
+          command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
     services:
       - notifying_service:
           name: notifying_service
@@ -449,6 +473,12 @@ conf:
           service_description: "CEPH_health"
           check_command: check_ceph_health
           check_interval: 60
+      - check_hosts_health:
+          use: generic-service
+          hostgroup_name: prometheus-hosts
+          service_description: "Nodes_health"
+          check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.!OK- All nodes are ready.
+          check_interval: 60
       - check_prometheus_replicas:
           use: notifying_service
           hostgroup_name: prometheus-hosts
@@ -515,6 +545,12 @@ conf:
           service_description: "Pod_status-error-image-pull"
           check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
           check_interval: 60
+      - check_pod_error_crash_loop_back_off:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-crashLoopBackOff"
+          check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
+          check_interval: 60
       - check_replicaset_missing_replicas:
           use: notifying_service
           hostgroup_name: prometheus-hosts
@@ -531,31 +567,66 @@ conf:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "API_glance"
-          check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
+          check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
           check_interval: 60
       - check_nova_api:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "API_nova"
-          check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
+          check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
           check_interval: 60
       - check_keystone_api:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "API_keystone"
-          check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
+          check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
           check_interval: 60
       - check_neutron_api:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "API_neutron"
-          check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+          check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+          check_interval: 60
+      - check_neutron_metadata_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-metadata-agent"
+          check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
+          check_interval: 60
+      - check_neutron_openvswitch_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-openvswitch-agent"
+          check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
+          check_interval: 60
+      - check_neutron_dhcp_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-dhcp-agent"
+          check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
+          check_interval: 60
+      - check_neutron_l3_agent:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Service_neutron-l3-agent"
+          check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
           check_interval: 60
       - check_swift_api:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "API_swift"
-          check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+          check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+          check_interval: 60
+      - check_cinder_api:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "API_cinder"
+          check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
+      - check_heat_api:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "API_heat"
+          check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
           check_interval: 60
       - check_cinder_api:
           use: notifying_service
@@ -573,25 +644,43 @@ conf:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "Service_nova-compute"
-          check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
           check_interval: 60
       - check_service_nova_conductor:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "Service_nova-conductor"
-          check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
           check_interval: 60
       - check_service_nova_consoleauth:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "Service_nova-consoleauth"
-          check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
           check_interval: 60
       - check_service_nova_scheduler:
           use: notifying_service
           hostgroup_name: prometheus-hosts
           service_description: "Service_nova-scheduler"
-          check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
+          check_command: check_prom_alert!os_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
+          check_interval: 60
+      - check_os_vm_vcpu_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_VCPU-usage"
+          check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
+          check_interval: 60
+      - check_os_vm_ram_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_RAM-usage"
+          check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
+          check_interval: 60
+      - check_os_vm_disk_usage:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "OS-Total-Quota_Disk-usage"
+          check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
           check_interval: 60
       - check_ceph_monitor_quorum:
           use: notifying_service
@@ -777,6 +866,107 @@ conf:
           service_description: Mariadb_innodb-replication-lag
           check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
           hostgroup_name: prometheus-hosts
+      - check_prometheus_hosts:
+          use: notifying_service
+          service_description: Prometheus_hosts-update
+          check_command: check_prometheus_hosts
+          hostgroup_name: prometheus-hosts
+          check_interval: 900
+      - check_postgresql_replication_lag:
+          use: generic-service
+          service_description: Postgresql_replication-lag
+          check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
+          hostgroup_name: prometheus-hosts
+      - check_postgresql_connections:
+          use: generic-service
+          service_description: Postgresql_connections
+          check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
+          hostgroup_name: prometheus-hosts
+      - check_postgresql_deadlocks:
+          use: generic-service
+          service_description: Postgresql_deadlocks
+          check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_ceph:
+          use: generic-service
+          service_description: Prometheus-exporter_CEPH
+          check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_openstack:
+          use: generic-service
+          service_description: Prometheus-exporter_Openstack
+          check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_mariadb:
+          use: generic-service
+          service_description: Prometheus-exporter_MariaDB
+          check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_kube_state_metrics:
+          use: generic-service
+          service_description: Prometheus-exporter_Kube-state-metrics
+          check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_postgresql:
+          use: generic-service
+          service_description: Prometheus-exporter_Postgresql
+          check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_node:
+          use: generic-service
+          service_description: Prometheus-exporter_Node
+          check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_calico:
+          use: generic-service
+          service_description: Prometheus-exporter_Calico
+          check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_elasticsearch:
+          use: generic-service
+          service_description: Prometheus-exporter_Elasticsearch
+          check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_prom_exporter_fluentd:
+          use: generic-service
+          service_description: Prometheus-exporter_Fluentd
+          check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
+          hostgroup_name: prometheus-hosts
+      - check_logmon_glance:
+          use: generic-service
+          service_description: Logmon_glance-error
+          check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_keystone:
+          use: generic-service
+          service_description: Logmon_keystone-error
+          check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_nova:
+          use: generic-service
+          service_description: Logmon_nova-error
+          check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_neutron:
+          use: generic-service
+          service_description: Logmon_neutron-error
+          check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_cinder:
+          use: generic-service
+          service_description: Logmon_cinder-error
+          check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_heat:
+          use: generic-service
+          service_description: Logmon_heat-error
+          check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+          hostgroup_name: prometheus-hosts
+      - check_logmon_horizon:
+          use: generic-service
+          service_description: Logmon_horizon-error
+          check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
+          hostgroup_name: prometheus-hosts
       - check_filespace_mounts-usage-rate-fullin4hrs:
           use: notifying_service
           hostgroup_name: base-os
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 1c47081ef..249255662 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1185,6 +1185,14 @@ conf:
             annotations:
               description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
               summary: Many Kubernetes nodes are Not Ready
+          - alert: K8SNodesNotReady
+            expr: count(kube_node_status_ready{condition="true"} == 0) > 0
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              description: '{{ $value }} nodes are in the NotReady state.'
+              summary: One or more Kubernetes nodes are Not Ready
           - alert: K8SKubeletDown
             expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
             for: 1h
@@ -1296,7 +1304,7 @@ conf:
             annotations:
               description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
               summary: '{{$labels.statefulset}}: has inssuficient replicas.'
-          - alert: kube_daemonsets_misscheduled
+          - alert: daemonsets_misscheduled
             expr: kube_daemonset_status_number_misscheduled > 0
             for: 10m
             labels:
@@ -1304,7 +1312,7 @@ conf:
             annotations:
               description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
               summary: 'Daemonsets not scheduled correctly'
-          - alert: kube_daemonsets_not_scheduled
+          - alert: daemonsets_not_scheduled
             expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
             for: 10m
             labels:
@@ -1312,7 +1320,7 @@ conf:
             annotations:
               description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
               summary: 'Less than desired number of daemonsets scheduled'
-          - alert: kube_deployment_replicas_unavailable
+          - alert: deployment_replicas_unavailable
             expr: kube_deployment_status_replicas_unavailable > 0
             for: 10m
             labels:
@@ -1320,7 +1328,7 @@ conf:
             annotations:
               description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
               summary: '{{$labels.deployment}}: has inssuficient replicas.'
-          - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
+          - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
             expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
             for: 10m
             labels:
@@ -1328,7 +1336,7 @@ conf:
             annotations:
               description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
               summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
-          - alert: kube_job_status_failed
+          - alert: job_status_failed
             expr: kube_job_status_failed > 0
             for: 10m
             labels:
@@ -1336,7 +1344,7 @@ conf:
             annotations:
               description: 'Job {{$labels.exported_job}} is in failed status'
               summary: '{{$labels.exported_job}} has failed status'
-          - alert: kube_pod_status_pending
+          - alert: pod_status_pending
             expr: kube_pod_status_phase{phase="Pending"} == 1
             for: 10m
             labels:
@@ -1344,7 +1352,7 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
-          - alert: kube_pod_error_image_pull
+          - alert: pod_error_image_pull
             expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
             for: 10m
             labels:
@@ -1352,7 +1360,7 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: kube_pod_status_error_image_pull
+          - alert: pod_status_error_image_pull
             expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
             for: 10m
             labels:
@@ -1360,7 +1368,15 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: kube_replicaset_missing_replicas
+          - alert: pod_error_crash_loop_back_off
+            expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
+              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: replicaset_missing_replicas
             expr:  kube_replicaset_spec_replicas -  kube_replicaset_status_ready_replicas > 0
             for: 10m
             labels:
@@ -1368,7 +1384,7 @@ conf:
             annotations:
               description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
               summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
-          - alert: kube_pod_container_terminated
+          - alert: pod_container_terminated
             expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
             for: 10m
             labels:
@@ -1618,7 +1634,7 @@ conf:
         - name: openstack.rules
           rules:
           - alert: os_glance_api_availability
-            expr:  check_glance_api != 1
+            expr:  openstack_check_glance_api != 1
             for: 5m
             labels:
               severity: page
@@ -1626,7 +1642,7 @@ conf:
               description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
               summary: 'Glance API is not available at {{$labels.url}}'
           - alert: os_nova_api_availability
-            expr:  check_nova_api != 1
+            expr:  openstack_check_nova_api != 1
             for: 5m
             labels:
               severity: page
@@ -1634,7 +1650,7 @@ conf:
               description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
               summary: 'Nova API is not available at {{$labels.url}}'
           - alert: os_keystone_api_availability
-            expr:  check_keystone_api != 1
+            expr:  openstack_check_keystone_api != 1
             for: 5m
             labels:
               severity: page
@@ -1642,15 +1658,47 @@ conf:
               description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
               summary: 'Keystone API is not available at {{$labels.url}}'
           - alert: os_neutron_api_availability
-            expr:  check_neutron_api != 1
+            expr:  openstack_check_neutron_api != 1
             for: 5m
             labels:
               severity: page
             annotations:
               description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
               summary: 'Neutron API is not available at {{$labels.url}}'
+          - alert: os_neutron_metadata_agent_availability  # neutron metadata agent outage
+            expr: openstack_services_neutron_metadata_agent_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
+              summary: 'One or more neutron metadata_agents are not available'
+          - alert: os_neutron_openvswitch_agent_availability  # neutron openvswitch agent outage
+            expr: openstack_services_neutron_openvswitch_agent_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
+              summary: 'One or more neutron openvswitch agents are not available'
+          - alert: os_neutron_dhcp_agent_availability  # neutron dhcp agent outage
+            expr: openstack_services_neutron_dhcp_agent_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
+              summary: 'One or more neutron dhcp agents are not available'
+          - alert: os_neutron_l3_agent_availability  # neutron L3 agent outage
+            expr: openstack_services_neutron_l3_agent_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'One or more neutron L3 agents are not available for more than 5 minutes'
+              summary: 'One or more neutron L3 agents are not available'
           - alert: os_swift_api_availability
-            expr:  check_swift_api != 1
+            expr:  openstack_check_swift_api != 1
             for: 5m
             labels:
               severity: page
@@ -1673,8 +1721,16 @@ conf:
             annotations:
               description: 'Cinder scheduler is not available for more than 5 minutes'
               summary: 'Cinder scheduler is not available'
+          - alert: os_heat_api_availability  # Heat API check failing
+            expr: openstack_check_heat_api != 1  # any value other than 1 counts as unavailable
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
+              summary: 'Heat API is not available at {{$labels.url}}'
           - alert: os_nova_compute_disabled
-            expr:  services_nova_compute_disabled_total > 0
+            expr:  openstack_services_nova_compute_disabled_total > 0
             for: 5m
             labels:
               severity: page
@@ -1682,7 +1738,7 @@ conf:
               description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
               summary: 'Openstack compute service nova-compute is disabled on some hosts'
           - alert: os_nova_conductor_disabled
-            expr:  services_nova_conductor_disabled_total > 0
+            expr:  openstack_services_nova_conductor_disabled_total > 0
             for: 5m
             labels:
               severity: page
@@ -1690,7 +1746,7 @@ conf:
               description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
               summary: 'Openstack compute service nova-conductor is disabled on some hosts'
           - alert: os_nova_consoleauth_disabled
-            expr:  services_nova_consoleauth_disabled_total > 0
+            expr:  openstack_services_nova_consoleauth_disabled_total > 0
             for: 5m
             labels:
               severity: page
@@ -1698,13 +1754,69 @@ conf:
               description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
               summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
           - alert: os_nova_scheduler_disabled
-            expr:  services_nova_scheduler_disabled_total > 0
+            expr:  openstack_services_nova_scheduler_disabled_total > 0
             for: 5m
             labels:
               severity: page
             annotations:
               description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
               summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
+          - alert: os_nova_compute_down  # nova-compute reported down on at least one host
+            expr: openstack_services_nova_compute_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-compute is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-compute is down on some hosts'
+          - alert: os_nova_conductor_down  # nova-conductor reported down on at least one host
+            expr: openstack_services_nova_conductor_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-conductor is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-conductor is down on some hosts'
+          - alert: os_nova_consoleauth_down  # nova-consoleauth reported down on at least one host
+            expr: openstack_services_nova_consoleauth_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-consoleauth is down on some hosts'
+          - alert: os_nova_scheduler_down  # nova-scheduler reported down on at least one host
+            expr: openstack_services_nova_scheduler_down_total > 0  # non-zero down count
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
+              summary: 'Openstack compute service nova-scheduler is down on some hosts'
+          - alert: os_vm_vcpu_usage_high  # used vcpus above 80% of (used + free)
+            expr: openstack_total_used_vcpus * 100 / (openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM vcpu usage is high at {{$value}} percent'
+              summary: 'Openstack VM vcpu usage is high'
+          - alert: os_vm_ram_usage_high  # used RAM above 80% of (used + free)
+            expr: openstack_total_used_ram_MB * 100 / (openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM RAM usage is high at {{$value}} percent'
+              summary: 'Openstack VM RAM usage is high'
+          - alert: os_vm_disk_usage_high  # used disk above 80% of (used + free)
+            expr: openstack_total_used_disk_GB * 100 / (openstack_total_used_disk_GB + openstack_total_free_disk_GB) > 80
+            for: 5m  # condition must hold for 5 minutes before the alert fires
+            labels:
+              severity: page
+            annotations:
+              description: 'Openstack VM Disk usage is high at {{$value}} percent'
+              summary: 'Openstack VM Disk usage is high'
       ceph:
         groups:
         - name: ceph.rules
@@ -1989,3 +2101,107 @@ conf:
             annotations:
               description: 'The mysql innodb replication has fallen behind and is not recovering'
               summary: 'MySQL innodb replication is lagging'
+      postgresql:  # alert rules evaluated against PostgreSQL exporter metrics
+        groups:
+        - name: postgresql.rules
+          rules:
+          - alert: pg_replication_fallen_behind  # replica whose lag exceeds 120 (2 minutes per the summary)
+            expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}'
+              summary: 'Postgres Replication lag is over 2 minutes'
+          - alert: pg_connections_too_high  # connection count above 95% of max_connections
+            expr: sum(pg_stat_activity_count) BY (instance) > ON(instance) pg_settings_max_connections * 0.95  # matched per instance, like the rule above
+            for: 5m
+            labels:
+              severity: warning
+              channel: database
+            annotations:
+              summary: 'Postgresql has {{$value}} connections on {{$labels.instance}} which is close to the maximum'
+          - alert: pg_deadlocks_detected  # any non-zero deadlock rate for a database
+            expr: sum by(datname, instance) (rate(pg_stat_database_deadlocks[1m])) > 0  # instance is kept because the description reads it
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: 'postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}'
+              summary: 'Postgres server is experiencing deadlocks'
+      prometheus_exporters:  # one absent() check per exporter, keyed on a metric that exporter is expected to publish
+        groups:
+        - name: prometheus_exporters.rules
+          rules:
+          - alert: prom_exporter_ceph_unavailable
+            expr: absent(ceph_health_status)  # absent() yields a result only when no such series exists
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Ceph exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'Ceph exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_openstack_unavailable
+            expr: absent(openstack_exporter_cache_refresh_duration_seconds)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Openstack exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'Openstack exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_mariadb_unavailable
+            expr: absent(mysql_up)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'MariaDB exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'MariaDB exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_kube_state_metrics_unavailable
+            expr: absent(kube_node_info)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'kube-state-metrics exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_postgresql_unavailable
+            expr: absent(pg_static)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'postgresql exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'postgresql exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_node_unavailable
+            expr: absent(node_uname_info)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'node exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'node exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_calico_unavailable
+            expr: absent(felix_host)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Calico exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'Calico exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_elasticsearch_unavailable
+            expr: absent(elasticsearch_cluster_health_status)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'Elasticsearch exporter is not collecting metrics or is not available'
+          - alert: prom_exporter_fluentd_unavailable
+            expr: absent(fluentd_up)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Fluentd exporter is not collecting metrics or is not available for past 10 minutes'
+              summary: 'Fluentd exporter is not collecting metrics or is not available'