Nagios chart modifications to use prometheus alert metric for monitoring
Change-Id: I6bb3c7176a725d8f26f3c11ebfb1f6d1d430ab96
This commit is contained in:
parent
d93649da5f
commit
adab0e1e30
@ -17,6 +17,8 @@ limitations under the License.
|
|||||||
{{- if .Values.manifests.deployment }}
|
{{- if .Values.manifests.deployment }}
|
||||||
{{- $envAll := . }}
|
{{- $envAll := . }}
|
||||||
|
|
||||||
|
{{- $nagiosUserSecret := .Values.secrets.nagios.admin }}
|
||||||
|
|
||||||
{{- $serviceAccountName := "nagios" }}
|
{{- $serviceAccountName := "nagios" }}
|
||||||
{{ tuple $envAll "nagios" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
{{ tuple $envAll "nagios" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||||
---
|
---
|
||||||
@ -80,11 +82,21 @@ spec:
|
|||||||
{{ tuple $envAll "nagios" | include "helm-toolkit.snippets.image" | indent 10 }}
|
{{ tuple $envAll "nagios" | include "helm-toolkit.snippets.image" | indent 10 }}
|
||||||
{{ tuple $envAll $envAll.Values.pod.resources.nagios | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
{{ tuple $envAll $envAll.Values.pod.resources.nagios | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||||
ports:
|
ports:
|
||||||
- name: metrics
|
- name: http
|
||||||
containerPort: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
containerPort: {{ tuple "nagios" "internal" "http" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||||
env:
|
env:
|
||||||
- name: PROMETHEUS_SERVICE
|
- name: PROMETHEUS_SERVICE
|
||||||
value: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
|
value: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
|
||||||
|
- name: NAGIOSADMIN_USER
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ $nagiosUserSecret }}
|
||||||
|
key: NAGIOSADMIN_USER
|
||||||
|
- name: NAGIOSADMIN_PASS
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ $nagiosUserSecret }}
|
||||||
|
key: NAGIOSADMIN_PASS
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: nagios-etc
|
- name: nagios-etc
|
||||||
mountPath: /opt/nagios/etc/nagios.cfg
|
mountPath: /opt/nagios/etc/nagios.cfg
|
||||||
|
@ -15,6 +15,6 @@ limitations under the License.
|
|||||||
*/}}
|
*/}}
|
||||||
|
|
||||||
{{- if and .Values.manifests.ingress .Values.network.nagios.ingress.public }}
|
{{- if and .Values.manifests.ingress .Values.network.nagios.ingress.public }}
|
||||||
{{- $ingressOpts := dict "envAll" . "backendService" "nagios" "backendServiceType" "nagios" "backendPort" "metrics" -}}
|
{{- $ingressOpts := dict "envAll" . "backendService" "nagios" "backendServiceType" "nagios" "backendPort" "http" -}}
|
||||||
{{ $ingressOpts | include "helm-toolkit.manifests.ingress" }}
|
{{ $ingressOpts | include "helm-toolkit.manifests.ingress" }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
29
nagios/templates/secret-nagios.yaml
Normal file
29
nagios/templates/secret-nagios.yaml
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
{{/*
|
||||||
|
Copyright 2017 The Openstack-Helm Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.manifests.secret_nagios }}
|
||||||
|
{{- $envAll := . }}
|
||||||
|
{{- $secretName := index $envAll.Values.secrets.nagios.admin }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: {{ $secretName }}
|
||||||
|
type: Opaque
|
||||||
|
data:
|
||||||
|
NAGIOSADMIN_USER: {{ .Values.endpoints.nagios.auth.admin.username | b64enc }}
|
||||||
|
NAGIOSADMIN_PASS: {{ .Values.endpoints.nagios.auth.admin.password | b64enc }}
|
||||||
|
{{- end }}
|
@ -23,8 +23,8 @@ metadata:
|
|||||||
name: {{ tuple "nagios" "internal" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
|
name: {{ tuple "nagios" "internal" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
|
||||||
spec:
|
spec:
|
||||||
ports:
|
ports:
|
||||||
- name: metrics
|
- name: http
|
||||||
port: {{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
port: {{ tuple "nagios" "internal" "http" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||||
{{ if .Values.network.nagios.node_port.enabled }}
|
{{ if .Values.network.nagios.node_port.enabled }}
|
||||||
nodePort: {{ .Values.network.nagios.node_port.port }}
|
nodePort: {{ .Values.network.nagios.node_port.port }}
|
||||||
{{ end }}
|
{{ end }}
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
images:
|
images:
|
||||||
tags:
|
tags:
|
||||||
nagios: docker.io/srwilkers/prometheus-nagios:v0.1.0
|
nagios: quay.io/attcomdev/nagios:931116b88c54931c616dfa66f424be38f74d8ad2
|
||||||
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
|
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
|
||||||
image_repo_sync: docker.io/docker:17.07.0
|
image_repo_sync: docker.io/docker:17.07.0
|
||||||
pull_policy: IfNotPresent
|
pull_policy: IfNotPresent
|
||||||
@ -52,6 +52,10 @@ dependencies:
|
|||||||
nagios:
|
nagios:
|
||||||
services: null
|
services: null
|
||||||
|
|
||||||
|
secrets:
|
||||||
|
nagios:
|
||||||
|
admin: nagios-admin-creds
|
||||||
|
|
||||||
endpoints:
|
endpoints:
|
||||||
cluster_domain_suffix: cluster.local
|
cluster_domain_suffix: cluster.local
|
||||||
local_image_registry:
|
local_image_registry:
|
||||||
@ -84,6 +88,10 @@ endpoints:
|
|||||||
nagios:
|
nagios:
|
||||||
name: nagios
|
name: nagios
|
||||||
namespace: null
|
namespace: null
|
||||||
|
auth:
|
||||||
|
admin:
|
||||||
|
username: admin
|
||||||
|
password: changeme
|
||||||
hosts:
|
hosts:
|
||||||
default: nagios-metrics
|
default: nagios-metrics
|
||||||
public: nagios
|
public: nagios
|
||||||
@ -94,8 +102,8 @@ endpoints:
|
|||||||
scheme:
|
scheme:
|
||||||
default: http
|
default: http
|
||||||
port:
|
port:
|
||||||
nagios:
|
http:
|
||||||
default: 25
|
default: 80
|
||||||
|
|
||||||
network:
|
network:
|
||||||
nagios:
|
nagios:
|
||||||
@ -122,7 +130,7 @@ pod:
|
|||||||
nagios:
|
nagios:
|
||||||
timeout: 30
|
timeout: 30
|
||||||
replicas:
|
replicas:
|
||||||
nagios: 3
|
nagios: 1
|
||||||
resources:
|
resources:
|
||||||
enabled: false
|
enabled: false
|
||||||
nagios:
|
nagios:
|
||||||
@ -147,6 +155,7 @@ manifests:
|
|||||||
deployment: true
|
deployment: true
|
||||||
ingress: true
|
ingress: true
|
||||||
job_image_repo_sync: true
|
job_image_repo_sync: true
|
||||||
|
secret_nagios: true
|
||||||
service: true
|
service: true
|
||||||
service_ingress: true
|
service_ingress: true
|
||||||
|
|
||||||
@ -157,36 +166,397 @@ conf:
|
|||||||
use: linux-server
|
use: linux-server
|
||||||
host_name: prometheus
|
host_name: prometheus
|
||||||
alias: "Prometheus Monitoring"
|
alias: "Prometheus Monitoring"
|
||||||
address: $PROMETHEUS_SERVICE
|
address: 127.0.0.1
|
||||||
hostgroups: monitoring
|
hostgroups: prometheus-hosts
|
||||||
|
check_command: check-prometheus-host-alive
|
||||||
host_groups:
|
host_groups:
|
||||||
- monitoring:
|
- prometheus-hosts:
|
||||||
hostgroup_name: monitoring
|
hostgroup_name: prometheus-hosts
|
||||||
alias: "Monitoring Instances"
|
alias: "Prometheus Virtual Host"
|
||||||
members: prometheus
|
- all:
|
||||||
|
hostgroup_name: all
|
||||||
|
alias: "all"
|
||||||
|
- base-os:
|
||||||
|
hostgroup_name: base-os
|
||||||
|
alias: "base-os"
|
||||||
commands:
|
commands:
|
||||||
- check_prometheus:
|
- check_prometheus_host_alive:
|
||||||
command_name: check_prometheus
|
command_name: check-prometheus-host-alive
|
||||||
command_line: "$USER1$/check_prometheus_metric.sh -H $HOSTADDRESS$ -q '$ARG1$' -w $ARG2$ -c $ARG3$ -n $ARG4$ -m $ARG5$"
|
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
|
||||||
- check_prometheus_nan_ok:
|
- check_prom_alert_with_labels:
|
||||||
command_name: check_prometheus_nan_ok
|
command_name: check_prom_alert_with_labels
|
||||||
command_line: "$USER1$/check_prometheus_metric.sh -H $HOSTADDRESS$ -q '$ARG1$' -w $ARG2$ -c $ARG3$ -n $ARG4$ -m $ARG5$ -O"
|
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
|
||||||
- check_prometheus_extra_info:
|
- check_prom_alert:
|
||||||
command_name: check_prometheus_extra_info
|
command_name: check_prom_alert
|
||||||
command_line: "$USER1$/check_prometheus_metric.sh -H $HOSTADDRESS$ -q '$ARG1$' -w $ARG2$ -c $ARG3$ -n $ARG4$ -m $ARG5$ -i -t vector"
|
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
|
||||||
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
||||||
|
command_name: check_filespace_mounts-usage-rate-fullin4hrs
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
|
||||||
|
- check_filespace_mounts-usage:
|
||||||
|
command_name: check_filespace_mounts-usage
|
||||||
|
# Reports the 'node_filesystem_full_80percent' Prometheus alert for the host being checked.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 percent full' --ok_message 'OK- All mountpoints usage is normal'
|
||||||
|
- check_node_loadavg:
|
||||||
|
command_name: check_node_loadavg
|
||||||
|
# Reports the 'node_load1_90percent' Prometheus alert for the host being checked.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the past hour' --ok_message 'OK- Node load average is normal'
|
||||||
|
- check_node_cpu_util:
|
||||||
|
command_name: check_node_cpu_util
|
||||||
|
# Reports the 'node_cpu_util_90percent' Prometheus alert for the host being checked.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the past hour' --ok_message 'OK- Node cpu utilization is normal'
|
||||||
|
- check_network_connections:
|
||||||
|
command_name: check_network_connections
|
||||||
|
# Reports the 'node_network_conntrack_usage_80percent' Prometheus alert for the host being checked.
# The message threshold is kept in step with the alert name (80 percent).
# NOTE(review): confirm the alert rule's actual threshold in the Prometheus chart.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 80% in use' --ok_message 'OK- Network connection utilization is normal'
|
||||||
|
- check_memory_usage:
|
||||||
|
command_name: check_memory_usage
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
|
||||||
|
- check_disk_write_latency:
|
||||||
|
command_name: check_disk_write_latency
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
|
||||||
|
- check_disk_read_latency:
|
||||||
|
command_name: check_disk_read_latency
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
|
||||||
|
- check_entropy_availability:
|
||||||
|
command_name: check_entropy_availability
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
|
||||||
|
- check_filedescriptor_usage_rate:
|
||||||
|
command_name: check_filedescriptor_usage_rate
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
|
||||||
|
- check_hwmon_high_cpu_temp:
|
||||||
|
command_name: check_hwmon_high_cpu_temp
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
|
||||||
|
- check_network_receive_drop_high:
|
||||||
|
command_name: check_network_receive_drop_high
|
||||||
|
# Reports the 'high_network_drop_rcv' Prometheus alert for the host being checked.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
|
||||||
|
- check_network_transmit_drop_high:
|
||||||
|
command_name: check_network_transmit_drop_high
|
||||||
|
# Reports the 'high_network_drop_send' Prometheus alert for the host being checked.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network transmission.' --ok_message 'OK- network packet transmit drops not high.'
|
||||||
|
- check_network_receive_errors_high:
|
||||||
|
command_name: check_network_receive_errors_high
|
||||||
|
# Reports network receive errors for the host being checked.
# Previously queried 'high_network_drop_send' (the transmit-drop alert), so this
# check could never reflect receive errors.
# NOTE(review): alert name 'high_network_errs_rcv' is assumed to match the rule
# defined in the Prometheus chart - confirm before merging.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
|
||||||
|
- check_network_transmit_errors_high:
|
||||||
|
command_name: check_network_transmit_errors_high
|
||||||
|
# Reports network transmit errors for the host being checked.
# Previously queried 'high_network_drop_send' (the transmit-drop alert), which
# duplicated the transmit-drop check instead of watching error counters.
# NOTE(review): alert name 'high_network_errs_send' is assumed to match the rule
# defined in the Prometheus chart - confirm before merging.
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
|
||||||
|
- check_vmstat_paging_rate:
|
||||||
|
command_name: check_vmstat_paging_rate
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
|
||||||
|
- check_xfs_block_allocation:
|
||||||
|
command_name: check_xfs_block_allocation
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
|
||||||
|
- check_network_bond_status:
|
||||||
|
command_name: check_network_bond_status
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
|
||||||
|
- check_numa_memory_usage:
|
||||||
|
command_name: check_numa_memory_usage
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
|
||||||
|
- check_ntp_sync:
|
||||||
|
command_name: check_ntp_sync
|
||||||
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
|
||||||
services:
|
services:
|
||||||
- check_prometheus_replicas:
|
- check_prometheus_replicas:
|
||||||
use: generic-service
|
use: generic-service
|
||||||
host_name: prometheus
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Check Prometheus replicas"
|
service_description: "Prometheus_replica-count"
|
||||||
check_command: check_prometheus_extra_info!kube_statefulset_status_replicas{namespace="openstack",statefulset="prometheus"}!3!2!prometheus_replicas!lt
|
# check_prom_alert_with_labels takes four '!'-separated arguments:
# alertname, labels_csv, msg_format, ok_message. The ok_message was missing,
# which rendered an empty --ok_message on the command line.
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- statefulset prometheus has configured amount of replicas
|
||||||
check_interval: 1
|
check_interval: 1
|
||||||
- check_alertmanager_replicas:
|
- check_alertmanager_replicas:
|
||||||
use: generic-service
|
use: generic-service
|
||||||
host_name: prometheus
|
hostgroup_name: prometheus-hosts
|
||||||
service_description: "Check Alertmanager replicas"
|
service_description: "PrometheusAlertmanager_replica-count"
|
||||||
check_command: check_prometheus_extra_info!kube_statefulset_status_replicas{namespace="openstack",statefulset="alertmanager"}!3!2!alertmanager_replicas!lt
|
# check_prom_alert_with_labels takes four '!'-separated arguments:
# alertname, labels_csv, msg_format, ok_message. The ok_message was missing,
# which rendered an empty --ok_message on the command line.
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- statefulset alertmanager has configured amount of replicas
|
||||||
check_interval: 1
|
check_interval: 1
|
||||||
|
- check_statefulset_replicas:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Statefulset_replica-count"
|
||||||
|
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
|
||||||
|
check_interval: 1
|
||||||
|
- check_daemonset_misscheduled:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Daemonset_misscheduled"
|
||||||
|
# Arguments to check_prom_alert: alertname, msg_format, ok_message.
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheduled!OK- No daemonset misscheduling detected
|
||||||
|
check_interval: 1
|
||||||
|
- check_daemonset_not-scheduled:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Daemonset_not-scheduled"
|
||||||
|
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
||||||
|
check_interval: 1
|
||||||
|
- check_deployment_replicas_unavailable:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Deployment_replicas-unavailable"
|
||||||
|
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
|
||||||
|
check_interval: 1
|
||||||
|
- check_deployment_rollingupdate_replicas_unavailable:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "RollingUpdate_Deployment-replicas-unavailable"
|
||||||
|
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
|
||||||
|
check_interval: 1
|
||||||
|
- check_job_status_failed:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Job_status-failed"
|
||||||
|
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
|
||||||
|
check_interval: 1
|
||||||
|
- check_pod_status_pending:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Pod_status-pending"
|
||||||
|
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
|
||||||
|
check_interval: 1
|
||||||
|
- check_pod_status_error_image_pull:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Pod_status-error-image-pull"
|
||||||
|
# Arguments to check_prom_alert: alertname, msg_format, ok_message.
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
||||||
|
check_interval: 1
|
||||||
|
- check_replicaset_missing_replicas:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Replicaset_missing-replicas"
|
||||||
|
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
|
||||||
|
check_interval: 1
|
||||||
|
- check_pod_container_terminated:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Pod_status-container-terminated"
|
||||||
|
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
|
||||||
|
check_interval: 1
|
||||||
|
- check_glance_api:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "API_glance"
|
||||||
|
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
|
||||||
|
check_interval: 1
|
||||||
|
- check_nova_api:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "API_nova"
|
||||||
|
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
|
||||||
|
check_interval: 1
|
||||||
|
- check_keystone_api:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "API_keystone"
|
||||||
|
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
|
||||||
|
check_interval: 1
|
||||||
|
- check_neutron_api:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "API_neutron"
|
||||||
|
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
|
||||||
|
check_interval: 1
|
||||||
|
- check_swift_api:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "API_swift"
|
||||||
|
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
|
||||||
|
check_interval: 1
|
||||||
|
- check_service_nova_compute:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Service_nova-compute"
|
||||||
|
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
|
||||||
|
check_interval: 1
|
||||||
|
- check_service_nova_conductor:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Service_nova-conductor"
|
||||||
|
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
|
||||||
|
check_interval: 1
|
||||||
|
- check_service_nova_consoleauth:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Service_nova-consoleauth"
|
||||||
|
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
|
||||||
|
check_interval: 1
|
||||||
|
- check_service_nova_scheduler:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Service_nova-scheduler"
|
||||||
|
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
|
||||||
|
check_interval: 1
|
||||||
|
- check_ceph_monitor_quorum:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "CEPH_quorum"
|
||||||
|
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
||||||
|
check_interval: 1
|
||||||
|
- check_ceph_storage_usage:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "CEPH_storage-usage"
|
||||||
|
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
|
||||||
|
check_interval: 1
|
||||||
|
- check_ceph_pgs_degradation:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "CEPH_PGs-degradation"
|
||||||
|
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
|
||||||
|
check_interval: 1
|
||||||
|
- check_ceph_osds_down:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "CEPH_OSDs-down"
|
||||||
|
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
|
||||||
|
check_interval: 1
|
||||||
|
- check_ceph_monitor_clock_skew:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "CEPH_Clock-skew"
|
||||||
|
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
|
||||||
|
check_interval: 1
|
||||||
|
- check_fluentd_up:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Fluentd_status"
|
||||||
|
# Arguments to check_prom_alert: alertname, msg_format, ok_message.
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes
|
||||||
|
check_interval: 1
|
||||||
|
- check_etcd_high_http_deletes_failed:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: ETCD_high-http-delete-failures
|
||||||
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
|
||||||
|
check_interval: 1
|
||||||
|
- check_etcd_high_http_get_failed:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: ETCD_high-http-get-failures
|
||||||
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
|
||||||
|
check_interval: 1
|
||||||
|
- check_etcd_high_http_updates_failed:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: ETCD_high-http-update-failures
|
||||||
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
|
||||||
|
check_interval: 1
|
||||||
|
- check_felix_iptables_save_errors:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Calico_iptables-save-errors
|
||||||
|
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
- check_felix_ipset_errors:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Calico_ipset-errors
|
||||||
|
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
- check_felix_int_dataplane_iface_msg_batch_size:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Calico_interface-message-batch-size
|
||||||
|
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
- check_felix_int_dataplane_addr_msg_batch_size:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Calico_address-message-batch-size
|
||||||
|
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
- check_felix_int_dataplane_failures:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Calico_datapane_failures_high
|
||||||
|
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: base-os
|
||||||
|
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
|
||||||
|
check_command: check_filespace_mounts-usage-rate-fullin4hrs
|
||||||
|
check_interval: 1
|
||||||
|
- check_filespace_mounts-usage:
|
||||||
|
use: generic-service
|
||||||
|
hostgroup_name: base-os
|
||||||
|
service_description: "Filespace_mounts-usage"
|
||||||
|
check_command: check_filespace_mounts-usage
|
||||||
|
check_interval: 1
|
||||||
|
- check_node_loadavg:
|
||||||
|
use: generic-service
|
||||||
|
service_description: CPU_Load-average
|
||||||
|
check_command: check_node_loadavg
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_node_cpu_util:
|
||||||
|
use: generic-service
|
||||||
|
service_description: CPU_utilization
|
||||||
|
check_command: check_node_cpu_util
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_connections:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_connections
|
||||||
|
check_command: check_network_connections
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_memory_usage:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Memory_usage
|
||||||
|
check_command: check_memory_usage
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_disk_write_latency:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Disk_write-latency
|
||||||
|
check_command: check_disk_write_latency
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_disk_read_latency:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Disk_read-latency
|
||||||
|
check_command: check_disk_read_latency
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_entropy_availability:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Entropy_availability
|
||||||
|
check_command: check_entropy_availability
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_filedescriptor_usage_rate:
|
||||||
|
use: generic-service
|
||||||
|
service_description: FileDescriptors_usage-rate-high
|
||||||
|
check_command: check_filedescriptor_usage_rate
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_hwmon_high_cpu_temp:
|
||||||
|
use: generic-service
|
||||||
|
service_description: HW_cpu-temp-high
|
||||||
|
check_command: check_hwmon_high_cpu_temp
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_receive_drop_high:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_receive-drop-high
|
||||||
|
check_command: check_network_receive_drop_high
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_transmit_drop_high:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_transmit-drop-high
|
||||||
|
check_command: check_network_transmit_drop_high
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_receive_errors_high:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_receive-errors-high
|
||||||
|
check_command: check_network_receive_errors_high
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_transmit_errors_high:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_transmit-errors-high
|
||||||
|
check_command: check_network_transmit_errors_high
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_vmstat_paging_rate:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Memory_vmstat-paging-rate
|
||||||
|
check_command: check_vmstat_paging_rate
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_xfs_block_allocation:
|
||||||
|
use: generic-service
|
||||||
|
service_description: XFS_block-allocation
|
||||||
|
check_command: check_xfs_block_allocation
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_network_bond_status:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Network_bondstatus
|
||||||
|
check_command: check_network_bond_status
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_numa_memory_usage:
|
||||||
|
use: generic-service
|
||||||
|
service_description: Memory_NUMA-usage
|
||||||
|
check_command: check_numa_memory_usage
|
||||||
|
hostgroup_name: base-os
|
||||||
|
- check_ntp_sync:
|
||||||
|
use: generic-service
|
||||||
|
service_description: NTP_sync
|
||||||
|
check_command: check_ntp_sync
|
||||||
|
hostgroup_name: base-os
|
||||||
config:
|
config:
|
||||||
log_file: /opt/nagios/var/nagios.log
|
log_file: /opt/nagios/var/nagios.log
|
||||||
cfg_file:
|
cfg_file:
|
||||||
@ -195,6 +565,7 @@ conf:
|
|||||||
- /opt/nagios/etc/objects/contacts.cfg
|
- /opt/nagios/etc/objects/contacts.cfg
|
||||||
- /opt/nagios/etc/objects/timeperiods.cfg
|
- /opt/nagios/etc/objects/timeperiods.cfg
|
||||||
- /opt/nagios/etc/objects/templates.cfg
|
- /opt/nagios/etc/objects/templates.cfg
|
||||||
|
- /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
|
||||||
object_cache_file: /opt/nagios/var/objects.cache
|
object_cache_file: /opt/nagios/var/objects.cache
|
||||||
precached_object_file: /opt/nagios/var/objects.precache
|
precached_object_file: /opt/nagios/var/objects.precache
|
||||||
resource_file: /opt/nagios/etc/resource.cfg
|
resource_file: /opt/nagios/etc/resource.cfg
|
||||||
@ -204,7 +575,7 @@ conf:
|
|||||||
nagios_group: nagios
|
nagios_group: nagios
|
||||||
check_external_commands: 1
|
check_external_commands: 1
|
||||||
command_file: /opt/nagios/var/rw/nagios.cmd
|
command_file: /opt/nagios/var/rw/nagios.cmd
|
||||||
lock_file: /opt/nagios/var/nagios.lock
|
lock_file: /var/run/nagios.lock
|
||||||
temp_file: /opt/nagios/var/nagios.tmp
|
temp_file: /opt/nagios/var/nagios.tmp
|
||||||
temp_path: /tmp
|
temp_path: /tmp
|
||||||
event_broker_options: -1
|
event_broker_options: -1
|
||||||
@ -290,4 +661,4 @@ conf:
|
|||||||
debug_verbosity: 1
|
debug_verbosity: 1
|
||||||
debug_file: /opt/nagios/var/nagios.debug
|
debug_file: /opt/nagios/var/nagios.debug
|
||||||
max_debug_file_size: 1000000
|
max_debug_file_size: 1000000
|
||||||
allow_empty_hostgroup_assignment: 0
|
allow_empty_hostgroup_assignment: 1
|
||||||
|
Loading…
Reference in New Issue
Block a user