From 69cd66b7c9cfb7965b86cfe13fa4b6e249223f26 Mon Sep 17 00:00:00 2001 From: Rakesh Patnaik Date: Tue, 24 Apr 2018 14:17:16 +0000 Subject: [PATCH] Nagios notification on alerts and ceph monitoring Change-Id: I782f54b5ad8159e7a4375d336a42524f380e65d2 --- nagios/templates/configmap-etc.yaml | 2 + nagios/templates/deployment.yaml | 8 + nagios/values.yaml | 255 ++++++++++++++++++---------- 3 files changed, 173 insertions(+), 92 deletions(-) diff --git a/nagios/templates/configmap-etc.yaml b/nagios/templates/configmap-etc.yaml index 121ddeaa5..788e1c9fe 100644 --- a/nagios/templates/configmap-etc.yaml +++ b/nagios/templates/configmap-etc.yaml @@ -29,6 +29,8 @@ data: nagios.cfg: |+ {{ include "nagios.to_nagios_conf" .Values.conf.nagios.config | indent 4 }} nagios_objects.cfg: |+ +{{- tuple "contact" .Values.conf.nagios.contacts | include "nagios.object_definition" | indent 4 }} +{{- tuple "contactgroup" .Values.conf.nagios.contactgroups | include "nagios.object_definition" | indent 4 }} {{- tuple "host" .Values.conf.nagios.hosts | include "nagios.object_definition" | indent 4 }} {{- tuple "hostgroup" .Values.conf.nagios.host_groups | include "nagios.object_definition" | indent 4 }} {{- tuple "command" .Values.conf.nagios.commands | include "nagios.object_definition" | indent 4 }} diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml index 27937e735..73ba0941a 100644 --- a/nagios/templates/deployment.yaml +++ b/nagios/templates/deployment.yaml @@ -126,6 +126,14 @@ spec: env: - name: PROMETHEUS_SERVICE value: {{ tuple "monitoring" "internal" "api" . 
| include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }} + - name: SNMP_NOTIF_PRIMARY_TARGET_WITH_PORT + value: {{ $envAll.Values.conf.nagios.notification.snmp.primary_target }} + - name: SNMP_NOTIF_SECONDARY_TARGET_WITH_PORT + value: {{ $envAll.Values.conf.nagios.notification.snmp.secondary_target }} + - name: REST_NOTIF_PRIMARY_TARGET_URL + value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }} + - name: REST_NOTIF_SECONDARY_TARGET_URL + value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }} volumeMounts: - name: nagios-etc mountPath: /opt/nagios/etc/nagios.cfg diff --git a/nagios/values.yaml b/nagios/values.yaml index 4352fa7fc..f1a820ca6 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -186,6 +186,35 @@ conf: httpd: null elasticsearch_host: null nagios: + contacts: + - notifying_contact: + name: notifying_contact + contact_name: notifying_contact + alias: notifying contact + service_notification_period: 24x7 + host_notification_period: 24x7 + service_notification_options: w,u,c,r,f,s + host_notification_options: d,u,r,f,s + register: 0 + - snmp_notifying_contact: + use: notifying_contact + name: snmp_notifying_contact + contact_name: snmp_notifying_contact + alias: snmp contact + service_notification_commands: send_service_snmp_trap + host_notification_commands: send_host_snmp_trap + - http_notifying_contact: + use: notifying_contact + name: http_notifying_contact + contact_name: http_notifying_contact + alias: HTTP contact + service_notification_commands: send_service_http_post + host_notification_commands: send_host_http_post + contactgroups: + - snmp_and_http_notifying_contact_group: + contactgroup_name: snmp_and_http_notifying_contact_group + alias: SNMP and HTTP notifying group + members: snmp_notifying_contact,http_notifying_contact hosts: - prometheus: use: linux-server @@ -204,7 +233,22 @@ conf: - base-os: hostgroup_name: base-os alias: "base-os" + - ceph_mgr_placeholder: + hostgroup_name: 
ceph_mgr_placeholder + alias: "ceph_mgr_placeholder" commands: + - send_service_snmp_trap: + command_name: send_service_snmp_trap + command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'" + - send_host_snmp_trap: + command_name: send_host_snmp_trap + command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'" + - send_service_http_post: + command_name: send_service_http_post + command_line: "$USER1$/post_rest_api_service_event.sh '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'" + - send_host_http_post: + command_name: send_host_http_post + command_line: "$USER1$/post_rest_api_host_event.sh '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$HOSTNAME$' '$USER6$' '$USER7$'" - check_prometheus_host_alive: command_name: check-prometheus-host-alive command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10" @@ -274,311 +318,330 @@ conf: - check_ntp_sync: command_name: check_ntp_sync command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' 
+ - check_ceph_health: + command_name: check_ceph_health + command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 0 --warning 0 services: - - check_prometheus_replicas: + - notifying_service: + name: notifying_service use: generic-service + flap_detection_enabled: 0 + process_perf_data: 0 + contact_groups: snmp_and_http_notifying_contact_group + check_interval: 60 + notification_interval: 120 + retry_interval: 15 + register: 0 + - check_ceph_health: + use: notifying_service + hostgroup_name: ^ceph_mgr.*$ + service_description: "CEPH_health" + check_command: check_ceph_health + check_interval: 60 + - check_prometheus_replicas: + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Prometheus_replica-count" check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas - check_interval: 1 + check_interval: 60 - check_alertmanager_replicas: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "PrometheusAlertmanager_replica-count" check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas - check_interval: 1 + check_interval: 60 - check_statefulset_replicas: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Statefulset_replica-count" check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas - check_interval: 1 + check_interval: 60 - check_daemonset_misscheduled: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Daemonset_misscheduled" check_command: 
check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected - check_interval: 1 + check_interval: 60 - check_daemonset_not-scheduled: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Daemonset_not-scheduled" check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired - check_interval: 1 + check_interval: 60 - check_deployment_replicas_unavailable: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Deployment_replicas-unavailable" check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas - check_interval: 1 + check_interval: 60 - check_deployment_rollingupdate_replicas_unavailable: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "RollingUpdate_Deployment-replicas-unavailable" check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas - check_interval: 1 + check_interval: 60 - check_job_status_failed: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Job_status-failed" check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures - check_interval: 1 + check_interval: 60 - check_pod_status_pending: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Pod_status-pending" check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more 
than 10 minutes!OK- No pods in pending status - check_interval: 1 + check_interval: 60 - check_pod_status_error_image_pull: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Pod_status-error-image-pull" check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status - check_interval: 1 + check_interval: 60 - check_replicaset_missing_replicas: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Replicaset_missing-replicas" check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset - check_interval: 1 + check_interval: 60 - check_pod_container_terminated: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Pod_status-container-terminated" check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good - check_interval: 1 + check_interval: 60 - check_glance_api: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_glance" check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available - check_interval: 1 + check_interval: 60 - check_nova_api: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_nova" check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available - check_interval: 1 + check_interval: 60 - check_keystone_api: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_keystone" 
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available - check_interval: 1 + check_interval: 60 - check_neutron_api: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_neutron" check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available - check_interval: 1 + check_interval: 60 - check_swift_api: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "API_swift" check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available - check_interval: 1 + check_interval: 60 - check_service_nova_compute: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-compute" check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts - check_interval: 1 + check_interval: 60 - check_service_nova_conductor: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-conductor" check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts - check_interval: 1 + check_interval: 60 - check_service_nova_consoleauth: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-consoleauth" check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts - check_interval: 1 + check_interval: 60 - check_service_nova_scheduler: - use: generic-service + 
use: notifying_service hostgroup_name: prometheus-hosts service_description: "Service_nova-scheduler" check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts - check_interval: 1 + check_interval: 60 - check_ceph_monitor_quorum: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "CEPH_quorum" check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists - check_interval: 1 + check_interval: 60 - check_ceph_storage_usage: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "CEPH_storage-usage" check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent - check_interval: 1 + check_interval: 60 - check_ceph_pgs_degradation: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "CEPH_PGs-degradation" check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent - check_interval: 1 + check_interval: 60 - check_ceph_osds_down: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "CEPH_OSDs-down" check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent - check_interval: 1 + check_interval: 60 - check_ceph_monitor_clock_skew: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "CEPH_Clock-skew" check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds - 
check_interval: 1 + check_interval: 60 - check_fluentd_up: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: "Fluentd_status" check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes - check_interval: 1 + check_interval: 60 - check_etcd_high_http_deletes_failed: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: ETCD_high-http-delete-failures check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE - check_interval: 1 + check_interval: 60 - check_etcd_high_http_get_failed: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: ETCD_high-http-get-failures check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET - check_interval: 1 + check_interval: 60 - check_etcd_high_http_updates_failed: - use: generic-service + use: notifying_service hostgroup_name: prometheus-hosts service_description: ETCD_high-http-update-failures check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT - check_interval: 1 + check_interval: 60 - check_felix_iptables_save_errors: - use: generic-service + use: notifying_service service_description: Calico_iptables-save-errors check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low hostgroup_name: 
prometheus-hosts - check_felix_ipset_errors: - use: generic-service + use: notifying_service service_description: Calico_ipset-errors check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low hostgroup_name: prometheus-hosts - check_felix_int_dataplane_iface_msg_batch_size: - use: generic-service + use: notifying_service service_description: Calico_interface-message-batch-size check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low hostgroup_name: prometheus-hosts - check_felix_int_dataplane_addr_msg_batch_size: - use: generic-service + use: notifying_service service_description: Calico_address-message-batch-size check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low hostgroup_name: prometheus-hosts - check_felix_int_dataplane_failures: - use: generic-service + use: notifying_service service_description: Calico_datapane_failures_high check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low hostgroup_name: prometheus-hosts - check_filespace_mounts-usage-rate-fullin4hrs: - use: generic-service + use: notifying_service hostgroup_name: base-os service_description: "Filespace_mounts-usage-rate-fullin4hrs" check_command: check_filespace_mounts-usage-rate-fullin4hrs - check_interval: 1 + check_interval: 60 - check_filespace_mounts-usage: - use: generic-service + use: notifying_service hostgroup_name: base-os service_description: "Filespace_mounts-usage" check_command: 
check_filespace_mounts-usage - check_interval: 1 + check_interval: 60 - check_node_loadavg: - use: generic-service + use: notifying_service service_description: CPU_Load-average check_command: check_node_loadavg hostgroup_name: base-os - check_node_cpu_util: - use: generic-service + use: notifying_service service_description: CPU_utilization check_command: check_node_cpu_util hostgroup_name: base-os - check_network_connections: - use: generic-service + use: notifying_service service_description: Network_connections check_command: check_network_connections hostgroup_name: base-os - check_memory_usage: - use: generic-service + use: notifying_service service_description: Memory_usage check_command: check_memory_usage hostgroup_name: base-os - check_disk_write_latency: - use: generic-service + use: notifying_service service_description: Disk_write-latency check_command: check_disk_write_latency hostgroup_name: base-os - check_disk_read_latency: - use: generic-service + use: notifying_service service_description: Disk_read-latency check_command: check_disk_read_latency hostgroup_name: base-os - check_entropy_availability: - use: generic-service + use: notifying_service service_description: Entropy_availability check_command: check_entropy_availability hostgroup_name: base-os - check_filedescriptor_usage_rate: - use: generic-service + use: notifying_service service_description: FileDescriptors_usage-rate-high check_command: check_filedescriptor_usage_rate hostgroup_name: base-os - check_hwmon_high_cpu_temp: - use: generic-service + use: notifying_service service_description: HW_cpu-temp-high check_command: check_hwmon_high_cpu_temp hostgroup_name: base-os - check_network_receive_drop_high: - use: generic-service + use: notifying_service service_description: Network_receive-drop-high check_command: check_network_receive_drop_high hostgroup_name: base-os - check_network_transmit_drop_high: - use: generic-service + use: notifying_service service_description: 
Network_transmit-drop-high check_command: check_network_transmit_drop_high hostgroup_name: base-os - check_network_receive_errors_high: - use: generic-service + use: notifying_service service_description: Network_receive-errors-high check_command: check_network_receive_errors_high hostgroup_name: base-os - check_network_transmit_errors_high: - use: generic-service + use: notifying_service service_description: Network_transmit-errors-high check_command: check_network_transmit_errors_high hostgroup_name: base-os - check_vmstat_paging_rate: - use: generic-service + use: notifying_service service_description: Memory_vmstat-paging-rate check_command: check_vmstat_paging_rate hostgroup_name: base-os - check_xfs_block_allocation: - use: generic-service + use: notifying_service service_description: XFS_block-allocation check_command: check_xfs_block_allocation hostgroup_name: base-os - check_network_bond_status: - use: generic-service + use: notifying_service service_description: Network_bondstatus check_command: check_network_bond_status hostgroup_name: base-os - check_numa_memory_usage: - use: generic-service + use: notifying_service service_description: Memory_NUMA-usage check_command: check_numa_memory_usage hostgroup_name: base-os - check_ntp_sync: - use: generic-service + use: notifying_service service_description: NTP_sync check_command: check_ntp_sync hostgroup_name: base-os @@ -633,9 +696,9 @@ conf: auto_rescheduling_interval: 30 auto_rescheduling_window: 180 service_check_timeout: 60 - host_check_timeout: 30 - event_handler_timeout: 30 - notification_timeout: 30 + host_check_timeout: 60 + event_handler_timeout: 60 + notification_timeout: 60 ocsp_timeout: 5 perfdata_timeout: 5 retain_state_information: 1 @@ -649,7 +712,7 @@ conf: retained_process_service_attribute_mask: 0 retained_contact_host_attribute_mask: 0 retained_contact_service_attribute_mask: 0 - interval_length: 60 + interval_length: 1 check_for_updates: 1 bare_update_check: 0 
use_aggressive_host_checking: 0 @@ -677,7 +740,7 @@ conf: low_host_flap_threshold: 5.0 high_host_flap_threshold: 20.0 date_format: us - use_regexp_matching: 0 + use_regexp_matching: 1 use_true_regexp_matching: 0 daemon_dumps_core: 0 use_large_installation_tweaks: 0 @@ -687,3 +750,11 @@ conf: debug_file: /opt/nagios/var/nagios.debug max_debug_file_size: 1000000 allow_empty_hostgroup_assignment: 1 + illegal_macro_output_chars: "`~$&|'<>\"" + notification: + snmp: + primary_target: 127.0.0.1:15162 + secondary_target: 127.0.0.1:15162 + http: + primary_target: 127.0.0.1:3904/events + secondary_target: 127.0.0.1:3904/events