diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml
index 09b030252..fb469192a 100644
--- a/nagios/templates/deployment.yaml
+++ b/nagios/templates/deployment.yaml
@@ -129,8 +129,6 @@ spec:
             - name: nagios
               containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
           env:
-            - name: PROMETHEUS_SERVICE
-              value: {{ tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
            - name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT
              value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }}
            - name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT
@@ -139,6 +137,16 @@
              value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }}
            - name: REST_NOTIF_SECONDARY_TARGET_URL
              value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }}
+            - name: PROMETHEUS_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: PROMETHEUS_SERVICE
+            - name: ELASTICSEARCH_SERVICE
+              valueFrom:
+                secretKeyRef:
+                  name: {{ $nagiosUserSecret }}
+                  key: ELASTICSEARCH_SERVICE
            - name: NAGIOSADMIN_USER
              valueFrom:
                secretKeyRef:
diff --git a/nagios/templates/secret-nagios.yaml b/nagios/templates/secret-nagios.yaml
index 56155f5db..0ec0b341a 100644
--- a/nagios/templates/secret-nagios.yaml
+++ b/nagios/templates/secret-nagios.yaml
@@ -17,6 +17,8 @@ limitations under the License.
 {{- if .Values.manifests.secret_nagios }}
 {{- $envAll := . }}
 {{- $secretName := index $envAll.Values.secrets.nagios.admin }}
+{{- $prometheusService := tuple "monitoring" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
+{{- $elasticsearchService := tuple "elasticsearch" "internal" "admin" "http" . | include "helm-toolkit.endpoints.authenticated_endpoint_uri_lookup" }}
 ---
 apiVersion: v1
 kind: Secret
@@ -28,4 +30,6 @@ data:
   NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
   BIND_DN: {{ .Values.endpoints.ldap.auth.admin.bind | b64enc }}
   BIND_PASSWORD: {{ .Values.endpoints.ldap.auth.admin.password | b64enc }}
+  PROMETHEUS_SERVICE: {{ $prometheusService | b64enc }}
+  ELASTICSEARCH_SERVICE: {{ $elasticsearchService | b64enc }}
 {{- end }}
diff --git a/nagios/values.yaml b/nagios/values.yaml
index 207cb1dff..83fd664c4 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -19,7 +19,7 @@
 images:
   tags:
     apache_proxy: docker.io/httpd:2.4
-    nagios: quay.io/attcomdev/nagios:f5aac039c8e39efe467ac950936773a523bd7cb3
+    nagios: quay.io/attcomdev/nagios:389472c05ea4bc9f9b9e407e05e17527bfdce3cc
     dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
     image_repo_sync: docker.io/docker:17.07.0
   pull_policy: IfNotPresent
@@ -137,6 +137,24 @@ endpoints:
     port:
       ldap:
         default: 389
+  elasticsearch:
+    name: elasticsearch
+    namespace: null
+    auth:
+      admin:
+        username: admin
+        password: changeme
+    hosts:
+      default: elasticsearch-logging
+    host_fqdn_override:
+      default: null
+    path:
+      default: /
+    scheme:
+      default: http
+    port:
+      http:
+        default: 80
 
 network:
   nagios:
@@ -292,7 +310,7 @@
       AuthUserFile /usr/local/apache2/conf/.htpasswd
      AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
      AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
-      AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
+      AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
      Require valid-user
@@ -356,10 +374,10 @@
         command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
     - send_service_http_post:
         command_name: send_service_http_post
-        command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+        command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
     - send_host_http_post:
         command_name: send_host_http_post
-        command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'"
+        command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
     - check_prometheus_host_alive:
         command_name: check-prometheus-host-alive
         command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
@@ -369,6 +387,9 @@
     - check_prom_alert:
         command_name: check_prom_alert
         command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
+    - check_es_alert:
+        command_name: check_es_alert
+        command_line: "$USER1$/check_elasticsearch_query.py --es_url $USER9$ --logger '$ARG1$' --range_mins '$ARG2$' --alert_level '$ARG3$' --critical '$ARG4$' --es_type '$ARG5$'"
     - check_filespace_mounts-usage-rate-fullin4hrs:
         command_name: check_filespace_mounts-usage-rate-fullin4hrs
         command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
@@ -432,6 +453,9 @@
     - check_ceph_health:
         command_name: check_ceph_health
         command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0
+    - check_prometheus_hosts:
+        command_name: check_prometheus_hosts
+        command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
   services:
     - notifying_service:
         name: notifying_service
@@ -449,6 +473,12 @@
         service_description: "CEPH_health"
         check_command: check_ceph_health
         check_interval: 60
+    - check_hosts_health:
+        use: generic-service
+        hostgroup_name: prometheus-hosts
+        service_description: "Nodes_health"
+        check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.
+        check_interval: 60
     - check_prometheus_replicas:
         use: notifying_service
         hostgroup_name: prometheus-hosts
@@ -515,6 +545,12 @@
         service_description: "Pod_status-error-image-pull"
         check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
         check_interval: 60
+    - check_pod_error_crash_loop_back_off:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "Pod_status-crashLoopBackOff"
+        check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
+        check_interval: 60
     - check_replicaset_missing_replicas:
         use: notifying_service
         hostgroup_name: prometheus-hosts
@@ -531,31 +567,66 @@
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "API_glance"
-        check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
+        check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
         check_interval: 60
     - check_nova_api:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "API_nova"
-        check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
+        check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
         check_interval: 60
     - check_keystone_api:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "API_keystone"
-        check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
+        check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
         check_interval: 60
     - check_neutron_api:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "API_neutron"
-        check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+        check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+        check_interval: 60
+    - check_neutron_metadata_agent:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "Service_neutron-metadata-agent"
+        check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
+        check_interval: 60
+    - check_neutron_openvswitch_agent:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "Service_neutron-openvswitch-agent"
+        check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
+        check_interval: 60
+    - check_neutron_dhcp_agent:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "Service_neutron-dhcp-agent"
+        check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
+        check_interval: 60
+    - check_neutron_l3_agent:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "Service_neutron-l3-agent"
+        check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
         check_interval: 60
     - check_swift_api:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "API_swift"
-        check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+        check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+        check_interval: 60
+    - check_cinder_api:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "API_cinder"
+        check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
+    - check_heat_api:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "API_heat"
+        check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
         check_interval: 60
     - check_cinder_api:
         use: notifying_service
@@ -573,25 +644,43 @@
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "Service_nova-compute"
-        check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
+        check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
         check_interval: 60
     - check_service_nova_conductor:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "Service_nova-conductor"
-        check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
+        check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
         check_interval: 60
     - check_service_nova_consoleauth:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "Service_nova-consoleauth"
-        check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
+        check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
         check_interval: 60
     - check_service_nova_scheduler:
         use: notifying_service
         hostgroup_name: prometheus-hosts
         service_description: "Service_nova-scheduler"
-        check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
+        check_command: check_prom_alert!os_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
+        check_interval: 60
+    - check_os_vm_vcpu_usage:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "OS-Total-Quota_VCPU-usage"
+        check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
+        check_interval: 60
+    - check_os_vm_ram_usage:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "OS-Total-Quota_RAM-usage"
+        check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
+        check_interval: 60
+    - check_os_vm_disk_usage:
+        use: notifying_service
+        hostgroup_name: prometheus-hosts
+        service_description: "OS-Total-Quota_Disk-usage"
+        check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
         check_interval: 60
     - check_ceph_monitor_quorum:
         use: notifying_service
@@ -777,6 +866,107 @@
         service_description: Mariadb_innodb-replication-lag
         check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
         hostgroup_name: prometheus-hosts
+    - check_prometheus_hosts:
+        use: notifying_service
+        service_description: Prometheus_hosts-update
+        check_command: check_prometheus_hosts
+        hostgroup_name: prometheus-hosts
+        check_interval: 900
+    - check_postgresql_replication_lag:
+        use: generic-service
+        service_description: Postgresql_replication-lag
+        check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
+        hostgroup_name: prometheus-hosts
+    - check_postgresql_connections:
+        use: generic-service
+        service_description: Postgresql_connections
+        check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
+        hostgroup_name: prometheus-hosts
+    - check_postgresql_deadlocks:
+        use: generic-service
+        service_description: Postgresql_deadlocks
+        check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_ceph:
+        use: generic-service
+        service_description: Prometheus-exporter_CEPH
+        check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_openstack:
+        use: generic-service
+        service_description: Prometheus-exporter_Openstack
+        check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_mariadb:
+        use: generic-service
+        service_description: Prometheus-exporter_MariaDB
+        check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_kube_state_metrics:
+        use: generic-service
+        service_description: Prometheus-exporter_Kube-state-metrics
+        check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_postgresql:
+        use: generic-service
+        service_description: Prometheus-exporter_Postgresql
+        check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_node:
+        use: generic-service
+        service_description: Prometheus-exporter_Node
+        check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_calico:
+        use: generic-service
+        service_description: Prometheus-exporter_Calico
+        check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_elasticsearch:
+        use: generic-service
+        service_description: Prometheus-exporter_Elasticsearch
+        check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_prom_exporter_fluentd:
+        use: generic-service
+        service_description: Prometheus-exporter_Fluentd
+        check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
+        hostgroup_name: prometheus-hosts
+    - check_logmon_glance:
+        use: generic-service
+        service_description: Logmon_glance-error
+        check_command: check_es_alert!glance!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_keystone:
+        use: generic-service
+        service_description: Logmon_keystone-error
+        check_command: check_es_alert!keystone!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_nova:
+        use: generic-service
+        service_description: Logmon_nova-error
+        check_command: check_es_alert!nova!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_neutron:
+        use: generic-service
+        service_description: Logmon_neutron-error
+        check_command: check_es_alert!neutron!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_cinder:
+        use: generic-service
+        service_description: Logmon_cinder-error
+        check_command: check_es_alert!cinder!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_heat:
+        use: generic-service
+        service_description: Logmon_heat-error
+        check_command: check_es_alert!heat!15!CRITICAL,ERROR!10!oslo_openstack_fluentd
+        hostgroup_name: prometheus-hosts
+    - check_logmon_horizon:
+        use: generic-service
+        service_description: Logmon_horizon-error
+        check_command: check_es_alert!horizon!15!CRITICAL,ERROR!10!docker_fluentd
+        hostgroup_name: prometheus-hosts
     - check_filespace_mounts-usage-rate-fullin4hrs:
         use: notifying_service
         hostgroup_name: base-os
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 1c47081ef..249255662 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1185,6 +1185,14 @@ conf:
        annotations:
          description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
          summary: Many Kubernetes nodes are Not Ready
+      - alert: K8SNodesNotReady
+        expr: count(kube_node_status_ready{condition="true"} == 0) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          description: '{{ $value }} nodes are in NotReady state.'
+          summary: One or more Kubernetes nodes are Not Ready
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
        for: 1h
@@ -1296,7 +1304,7 @@
        annotations:
          description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
          summary: '{{$labels.statefulset}}: has inssuficient replicas.'
-      - alert: kube_daemonsets_misscheduled
+      - alert: daemonsets_misscheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
@@ -1304,7 +1312,7 @@
        annotations:
          description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
          summary: 'Daemonsets not scheduled correctly'
-      - alert: kube_daemonsets_not_scheduled
+      - alert: daemonsets_not_scheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
        for: 10m
        labels:
@@ -1312,7 +1320,7 @@
        annotations:
          description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
          summary: 'Less than desired number of daemonsets scheduled'
-      - alert: kube_deployment_replicas_unavailable
+      - alert: deployment_replicas_unavailable
        expr: kube_deployment_status_replicas_unavailable > 0
        for: 10m
        labels:
@@ -1320,7 +1328,7 @@
        annotations:
          description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
          summary: '{{$labels.deployment}}: has inssuficient replicas.'
-      - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
+      - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
        expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
        for: 10m
        labels:
@@ -1328,7 +1336,7 @@
        annotations:
          description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
          summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
-      - alert: kube_job_status_failed
+      - alert: job_status_failed
        expr: kube_job_status_failed > 0
        for: 10m
        labels:
@@ -1336,7 +1344,7 @@
        annotations:
          description: 'Job {{$labels.exported_job}} is in failed status'
          summary: '{{$labels.exported_job}} has failed status'
-      - alert: kube_pod_status_pending
+      - alert: pod_status_pending
        expr: kube_pod_status_phase{phase="Pending"} == 1
        for: 10m
        labels:
@@ -1344,7 +1352,7 @@
        annotations:
          description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
          summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
-      - alert: kube_pod_error_image_pull
+      - alert: pod_error_image_pull
        expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
        for: 10m
        labels:
@@ -1352,7 +1360,7 @@
        annotations:
          description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
          summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-      - alert: kube_pod_status_error_image_pull
+      - alert: pod_status_error_image_pull
        expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
        for: 10m
        labels:
@@ -1360,7 +1368,15 @@
        annotations:
          description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
          summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-      - alert: kube_replicaset_missing_replicas
+      - alert: pod_error_crash_loop_back_off
+        expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
+        for: 10m
+        labels:
+          severity: page
+        annotations:
+          description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
+          summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+      - alert: replicaset_missing_replicas
        expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
        for: 10m
        labels:
@@ -1368,7 +1384,7 @@
        annotations:
          description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
          summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
-      - alert: kube_pod_container_terminated
+      - alert: pod_container_terminated
        expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
        for: 10m
        labels:
@@ -1618,7 +1634,7 @@
    - name: openstack.rules
      rules:
      - alert: os_glance_api_availability
-        expr: check_glance_api != 1
+        expr: openstack_check_glance_api != 1
        for: 5m
        labels:
          severity: page
@@ -1626,7 +1642,7 @@
          description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
          summary: 'Glance API is not available at {{$labels.url}}'
      - alert: os_nova_api_availability
-        expr: check_nova_api != 1
+        expr: openstack_check_nova_api != 1
        for: 5m
        labels:
          severity: page
@@ -1634,7 +1650,7 @@
          description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
          summary: 'Nova API is not available at {{$labels.url}}'
      - alert: os_keystone_api_availability
-        expr: check_keystone_api != 1
+        expr: openstack_check_keystone_api != 1
        for: 5m
        labels:
          severity: page
@@ -1642,15 +1658,47 @@
          description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
          summary: 'Keystone API is not available at {{$labels.url}}'
      - alert: os_neutron_api_availability
-        expr: check_neutron_api != 1
+        expr: openstack_check_neutron_api != 1
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
          summary: 'Neutron API is not available at {{$labels.url}}'
+      - alert: os_neutron_metadata_agent_availability
+        expr: openstack_services_neutron_metadata_agent_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
+          summary: 'One or more neutron metadata_agents are not available'
+      - alert: os_neutron_openvswitch_agent_availability
+        expr: openstack_services_neutron_openvswitch_agent_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
+          summary: 'One or more neutron openvswitch agents are not available'
+      - alert: os_neutron_dhcp_agent_availability
+        expr: openstack_services_neutron_dhcp_agent_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
+          summary: 'One or more neutron dhcp agents are not available'
+      - alert: os_neutron_l3_agent_availability
+        expr: openstack_services_neutron_l3_agent_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'One or more neutron L3 agents are not available for more than 5 minutes'
+          summary: 'One or more neutron L3 agents are not available'
      - alert: os_swift_api_availability
-        expr: check_swift_api != 1
+        expr: openstack_check_swift_api != 1
        for: 5m
        labels:
          severity: page
@@ -1673,8 +1721,16 @@
        annotations:
          description: 'Cinder scheduler is not available for more than 5 minutes'
          summary: 'Cinder scheduler is not available'
+      - alert: os_heat_api_availability
+        expr: openstack_check_heat_api != 1
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
+          summary: 'Heat API is not available at {{$labels.url}}'
      - alert: os_nova_compute_disabled
-        expr: services_nova_compute_disabled_total > 0
+        expr: openstack_services_nova_compute_disabled_total > 0
        for: 5m
        labels:
          severity: page
@@ -1682,7 +1738,7 @@
          description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
          summary: 'Openstack compute service nova-compute is disabled on some hosts'
      - alert: os_nova_conductor_disabled
-        expr: services_nova_conductor_disabled_total > 0
+        expr: openstack_services_nova_conductor_disabled_total > 0
        for: 5m
        labels:
          severity: page
@@ -1690,7 +1746,7 @@
          description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
          summary: 'Openstack compute service nova-conductor is disabled on some hosts'
      - alert: os_nova_consoleauth_disabled
-        expr: services_nova_consoleauth_disabled_total > 0
+        expr: openstack_services_nova_consoleauth_disabled_total > 0
        for: 5m
        labels:
          severity: page
@@ -1698,13 +1754,69 @@
          description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
          summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
      - alert: os_nova_scheduler_disabled
-        expr: services_nova_scheduler_disabled_total > 0
+        expr: openstack_services_nova_scheduler_disabled_total > 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
          summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
+      - alert: os_nova_compute_down
+        expr: openstack_services_nova_compute_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'nova-compute is down on certain hosts for more than 5 minutes'
+          summary: 'Openstack compute service nova-compute is down on some hosts'
+      - alert: os_nova_conductor_down
+        expr: openstack_services_nova_conductor_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'nova-conductor is down on certain hosts for more than 5 minutes'
+          summary: 'Openstack compute service nova-conductor is down on some hosts'
+      - alert: os_nova_consoleauth_down
+        expr: openstack_services_nova_consoleauth_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
+          summary: 'Openstack compute service nova-consoleauth is down on some hosts'
+      - alert: os_nova_scheduler_down
+        expr: openstack_services_nova_scheduler_down_total > 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
+          summary: 'Openstack compute service nova-scheduler is down on some hosts'
+      - alert: os_vm_vcpu_usage_high
+        expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'Openstack VM vcpu usage is high at {{$value}} percent'
+          summary: 'Openstack VM vcpu usage is high'
+      - alert: os_vm_ram_usage_high
+        expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'Openstack VM RAM usage is high at {{$value}} percent'
+          summary: 'Openstack VM RAM usage is high'
+      - alert: os_vm_disk_usage_high
+        expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          description: 'Openstack VM Disk usage is high at {{$value}} percent'
+          summary: 'Openstack VM Disk usage is high'
  ceph:
    groups:
    - name: ceph.rules
@@ -1989,3 +2101,107 @@
        annotations:
          description: 'The mysql innodb replication has fallen behind and is not recovering'
          summary: 'MySQL innodb replication is lagging'
+  postgresql:
+    groups:
+    - name: postgresql.rules
+      rules:
+      - alert: pg_replication_fallen_behind
+        expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
+          title: Postgres Replication lag is over 2 minutes
+      - alert: pg_connections_too_high
+        expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
+        for: 5m
+        labels:
+          severity: warn
+          channel: database
+        annotations:
+          title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
+      - alert: pg_deadlocks_detected
+        expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0
+        for: 5m
+        labels:
+          severity: warn
+        annotations:
+          description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}
+          title: Postgres server is experiencing deadlocks
+  prometheus_exporters:
+    groups:
+    - name: prometheus_exporters.rules
+      rules:
+      - alert: prom_exporter_ceph_unavailable
+        expr: absent(ceph_health_status)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
+          title: Ceph exporter is not collecting metrics or is not available
+      - alert: prom_exporter_openstack_unavailable
+        expr: absent(openstack_exporter_cache_refresh_duration_seconds)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
+          title: Openstack exporter is not collecting metrics or is not available
+      - alert: prom_exporter_mariadb_unavailable
+        expr: absent(mysql_up)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
+          title: MariaDB exporter is not collecting metrics or is not available
+      - alert: prom_exporter_kube_state_metrics_unavailable
+        expr: absent(kube_node_info)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
+          title: kube-state-metrics exporter is not collecting metrics or is not available
+      - alert: prom_exporter_postgresql_unavailable
+        expr: absent(pg_static)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
+          title: postgresql exporter is not collecting metrics or is not available
+      - alert: prom_exporter_node_unavailable
+        expr: absent(node_uname_info)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: node exporter is not collecting metrics or is not available for past 10 minutes
+          title: node exporter is not collecting metrics or is not available
+      - alert: prom_exporter_calico_unavailable
+        expr: absent(felix_host)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Calico exporter is not collecting metrics or is not available for past 10 minutes
+          title: Calico exporter is not collecting metrics or is not available
+      - alert: prom_exporter_elasticsearch_unavailable
+        expr: absent(elasticsearch_cluster_health_status)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
+          title: Elasticsearch exporter is not collecting metrics or is not available
+      - alert: prom_exporter_fluentd_unavailable
+        expr: absent(fluentd_up)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
+          title: Fluentd exporter is not collecting metrics or is not available