# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for nagios.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# Image references, keyed by component name.
images:
  tags:
    nagios: quay.io/attcomdev/nagios:931116b88c54931c616dfa66f424be38f74d8ad2
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync

# Node selector key/value pairs for the nagios pods and for jobs.
labels:
  nagios:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled
  job:
    node_selector_key: openstack-control-plane
    node_selector_value: enabled

dependencies:
  dynamic:
    common:
      jobs:
        - nagios-image-repo-sync
      services:
        - service: local_image_registry
          endpoint: node
  static:
    image_repo_sync:
      services:
        - service: local_image_registry
          endpoint: internal
    nagios:
      services: null

secrets:
  nagios:
    admin: nagios-admin-creds

endpoints:
  cluster_domain_suffix: cluster.local
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000
  monitoring:
    name: prometheus
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: http
    port:
      api:
        default: 9090
        public: 80
  nagios:
    name: nagios
    namespace: null
    auth:
      admin:
        username: admin
        # NOTE(review): placeholder credential; override it for any real
        # deployment rather than shipping this default.
        password: changeme
    hosts:
      default: nagios-metrics
      public: nagios
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: http
    port:
      http:
        default: 80

network:
  nagios:
    ingress:
      public: true
      classes:
        namespace: "nginx"
        cluster: "nginx-cluster"
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /
    node_port:
      enabled: false
      port: 30925

pod:
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      nagios:
        timeout: 30
  replicas:
    nagios: 1
  resources:
    enabled: false
    nagios:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "100m"
    jobs:
      image_repo_sync:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"

# Per-manifest on/off switches.
manifests:
  configmap_bin: true
  configmap_etc: true
  deployment: true
  ingress: true
  job_image_repo_sync: true
  secret_nagios: true
  service: true
  service_ingress: true

conf:
  nagios:
    hosts:
      - prometheus:
          use: linux-server
          host_name: prometheus
          alias: "Prometheus Monitoring"
          address: 127.0.0.1
          hostgroups: prometheus-hosts
          check_command: check-prometheus-host-alive
    host_groups:
      - prometheus-hosts:
          hostgroup_name: prometheus-hosts
          alias: "Prometheus Virtual Host"
      - all:
          hostgroup_name: all
          alias: "all"
      - base-os:
          hostgroup_name: base-os
          alias: "base-os"
    # Command definitions. Apart from the host-alive probe, every command runs
    # query_prometheus_alerts.py against the endpoint held in $USER2$, naming
    # the alert to look up (--alertname), an optional label selector
    # (--labels_csv), the text to emit when the alert is present (--msg_format)
    # and the text to emit otherwise (--ok_message).
    commands:
      - check_prometheus_host_alive:
          command_name: check-prometheus-host-alive
          command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
      # Generic form with a label selector:
      # $ARG1$=alert name, $ARG2$=labels, $ARG3$=alert text, $ARG4$=OK text.
      - check_prom_alert_with_labels:
          command_name: check_prom_alert_with_labels
          command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
      # Generic form without labels:
      # $ARG1$=alert name, $ARG2$=alert text, $ARG3$=OK text.
      - check_prom_alert:
          command_name: check_prom_alert
          command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
      # Per-host checks: each one restricts the alert to series whose
      # `instance` label begins with the host's $HOSTADDRESS$.
      - check_filespace_mounts-usage-rate-fullin4hrs:
          command_name: check_filespace_mounts-usage-rate-fullin4hrs
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
      - check_filespace_mounts-usage:
          command_name: check_filespace_mounts-usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 percent full' --ok_message 'OK- All mountpoints usage is normal'
      - check_node_loadavg:
          command_name: check_node_loadavg
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the past hour' --ok_message 'OK- Node load average is normal'
      - check_node_cpu_util:
          command_name: check_node_cpu_util
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the past hour' --ok_message 'OK- Node cpu utilization is normal'
      # The message threshold matches the 80percent alert queried here.
      - check_network_connections:
          command_name: check_network_connections
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 80% in use' --ok_message 'OK- Network connection utilization is normal'
      - check_memory_usage:
          command_name: check_memory_usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
      - check_disk_write_latency:
          command_name: check_disk_write_latency
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
      - check_disk_read_latency:
          command_name: check_disk_read_latency
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
      - check_entropy_availability:
          command_name: check_entropy_availability
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
      - check_filedescriptor_usage_rate:
          command_name: check_filedescriptor_usage_rate
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
      - check_hwmon_high_cpu_temp:
          command_name: check_hwmon_high_cpu_temp
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
      - check_network_receive_drop_high:
          command_name: check_network_receive_drop_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
      - check_network_transmit_drop_high:
          command_name: check_network_transmit_drop_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network transmission.' --ok_message 'OK- network packet transmit drops not high.'
      # NOTE(review): the two error-rate checks previously queried
      # 'high_network_drop_send' (the transmit-drop alert). The alert names
      # below are assumed to follow the drop_rcv/drop_send naming; confirm
      # them against the Prometheus alerting rules before release.
      - check_network_receive_errors_high:
          command_name: check_network_receive_errors_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
      - check_network_transmit_errors_high:
          command_name: check_network_transmit_errors_high
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_errs_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
      - check_vmstat_paging_rate:
          command_name: check_vmstat_paging_rate
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
      - check_xfs_block_allocation:
          command_name: check_xfs_block_allocation
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
      - check_network_bond_status:
          command_name: check_network_bond_status
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
      - check_numa_memory_usage:
          command_name: check_numa_memory_usage
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
      - check_ntp_sync:
          command_name: check_ntp_sync
          command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
    # Service definitions. check_command has the form
    # <command_name>!<arg1>!<arg2>...; the `!`-separated fields fill
    # $ARG1$..$ARGn$ of the command named first.
    services:
      # The two replica checks below pass all four arguments expected by
      # check_prom_alert_with_labels; without the fourth, --ok_message is empty.
      - check_prometheus_replicas:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Prometheus_replica-count"
          check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- prometheus statefulset has configured amount of replicas
          check_interval: 1
      - check_alertmanager_replicas:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "PrometheusAlertmanager_replica-count"
          check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- alertmanager statefulset has configured amount of replicas
          check_interval: 1
      - check_statefulset_replicas:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Statefulset_replica-count"
          check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
          check_interval: 1
      - check_daemonset_misscheduled:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Daemonset_misscheduled"
          check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheduled!OK- No daemonset misscheduling detected
          check_interval: 1
      - check_daemonset_not-scheduled:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Daemonset_not-scheduled"
          check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
          check_interval: 1
      - check_deployment_replicas_unavailable:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Deployment_replicas-unavailable"
          check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
          check_interval: 1
      - check_deployment_rollingupdate_replicas_unavailable:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "RollingUpdate_Deployment-replicas-unavailable"
          check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
          check_interval: 1
      - check_job_status_failed:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Job_status-failed"
          check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
          check_interval: 1
      - check_pod_status_pending:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-pending"
          check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
          check_interval: 1
      - check_pod_status_error_image_pull:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-error-image-pull"
          check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
          check_interval: 1
      - check_replicaset_missing_replicas:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Replicaset_missing-replicas"
          check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
          check_interval: 1
      - check_pod_container_terminated:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Pod_status-container-terminated"
          check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
          check_interval: 1
      - check_glance_api:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "API_glance"
          check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
          check_interval: 1
      - check_nova_api:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "API_nova"
          check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
          check_interval: 1
      - check_keystone_api:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "API_keystone"
          check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
          check_interval: 1
      - check_neutron_api:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "API_neutron"
          check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
          check_interval: 1
      - check_swift_api:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "API_swift"
          check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
          check_interval: 1
      - check_service_nova_compute:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-compute"
          check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
          check_interval: 1
      - check_service_nova_conductor:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-conductor"
          check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
          check_interval: 1
      - check_service_nova_consoleauth:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-consoleauth"
          check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
          check_interval: 1
      - check_service_nova_scheduler:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Service_nova-scheduler"
          check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
          check_interval: 1
      - check_ceph_monitor_quorum:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_quorum"
          check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
          check_interval: 1
      - check_ceph_storage_usage:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_storage-usage"
          check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
          check_interval: 1
      - check_ceph_pgs_degradation:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_PGs-degradation"
          check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
          check_interval: 1
      - check_ceph_osds_down:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_OSDs-down"
          check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
          check_interval: 1
      - check_ceph_monitor_clock_skew:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "CEPH_Clock-skew"
          check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
          check_interval: 1
      - check_fluentd_up:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: "Fluentd_status"
          check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes
          check_interval: 1
      - check_etcd_high_http_deletes_failed:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-delete-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
          check_interval: 1
      - check_etcd_high_http_get_failed:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-get-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
          check_interval: 1
      - check_etcd_high_http_updates_failed:
          use: generic-service
          hostgroup_name: prometheus-hosts
          service_description: ETCD_high-http-update-failures
          check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
          check_interval: 1
      # The "datapane" spelling in the alert names and service description
      # below is kept as-is: those are identifiers that must match objects
      # defined outside this file.
      - check_felix_iptables_save_errors:
          use: generic-service
          service_description: Calico_iptables-save-errors
          check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
          hostgroup_name: prometheus-hosts
      - check_felix_ipset_errors:
          use: generic-service
          service_description: Calico_ipset-errors
          check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_iface_msg_batch_size:
          use: generic-service
          service_description: Calico_interface-message-batch-size
          check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_addr_msg_batch_size:
          use: generic-service
          service_description: Calico_address-message-batch-size
          check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
          hostgroup_name: prometheus-hosts
      - check_felix_int_dataplane_failures:
          use: generic-service
          service_description: Calico_datapane_failures_high
          check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- dataplane failures are none or low
          hostgroup_name: prometheus-hosts
      # Per-host checks, attached to the base-os host group; each one invokes
      # the command of the same name defined under `commands`.
      - check_filespace_mounts-usage-rate-fullin4hrs:
          use: generic-service
          hostgroup_name: base-os
          service_description: "Filespace_mounts-usage-rate-fullin4hrs"
          check_command: check_filespace_mounts-usage-rate-fullin4hrs
          check_interval: 1
      - check_filespace_mounts-usage:
          use: generic-service
          hostgroup_name: base-os
          service_description: "Filespace_mounts-usage"
          check_command: check_filespace_mounts-usage
          check_interval: 1
      - check_node_loadavg:
          use: generic-service
          service_description: CPU_Load-average
          check_command: check_node_loadavg
          hostgroup_name: base-os
      - check_node_cpu_util:
          use: generic-service
          service_description: CPU_utilization
          check_command: check_node_cpu_util
          hostgroup_name: base-os
      - check_network_connections:
          use: generic-service
          service_description: Network_connections
          check_command: check_network_connections
          hostgroup_name: base-os
      - check_memory_usage:
          use: generic-service
          service_description: Memory_usage
          check_command: check_memory_usage
          hostgroup_name: base-os
      - check_disk_write_latency:
          use: generic-service
          service_description: Disk_write-latency
          check_command: check_disk_write_latency
          hostgroup_name: base-os
      - check_disk_read_latency:
          use: generic-service
          service_description: Disk_read-latency
          check_command: check_disk_read_latency
          hostgroup_name: base-os
      - check_entropy_availability:
          use: generic-service
          service_description: Entropy_availability
          check_command: check_entropy_availability
          hostgroup_name: base-os
      - check_filedescriptor_usage_rate:
          use: generic-service
          service_description: FileDescriptors_usage-rate-high
          check_command: check_filedescriptor_usage_rate
          hostgroup_name: base-os
      - check_hwmon_high_cpu_temp:
          use: generic-service
          service_description: HW_cpu-temp-high
          check_command: check_hwmon_high_cpu_temp
          hostgroup_name: base-os
      - check_network_receive_drop_high:
          use: generic-service
          service_description: Network_receive-drop-high
          check_command: check_network_receive_drop_high
          hostgroup_name: base-os
      - check_network_transmit_drop_high:
          use: generic-service
          service_description: Network_transmit-drop-high
          check_command: check_network_transmit_drop_high
          hostgroup_name: base-os
      - check_network_receive_errors_high:
          use: generic-service
          service_description: Network_receive-errors-high
          check_command: check_network_receive_errors_high
          hostgroup_name: base-os
      - check_network_transmit_errors_high:
          use: generic-service
          service_description: Network_transmit-errors-high
          check_command: check_network_transmit_errors_high
          hostgroup_name: base-os
      - check_vmstat_paging_rate:
          use: generic-service
          service_description: Memory_vmstat-paging-rate
          check_command: check_vmstat_paging_rate
          hostgroup_name: base-os
      - check_xfs_block_allocation:
          use: generic-service
          service_description: XFS_block-allocation
          check_command: check_xfs_block_allocation
          hostgroup_name: base-os
      - check_network_bond_status:
          use: generic-service
          service_description: Network_bondstatus
          check_command: check_network_bond_status
          hostgroup_name: base-os
      - check_numa_memory_usage:
          use: generic-service
          service_description: Memory_NUMA-usage
          check_command: check_numa_memory_usage
          hostgroup_name: base-os
      - check_ntp_sync:
          use: generic-service
          service_description: NTP_sync
          check_command: check_ntp_sync
          hostgroup_name: base-os
    # Main Nagios configuration directives, one key per directive.
    # NOTE(review): `config` is assumed to be nested under conf.nagios (as a
    # sibling of hosts/commands/services); confirm against the templates that
    # read it.
    config:
      log_file: /opt/nagios/var/nagios.log
      cfg_file:
        - /opt/nagios/etc/nagios_objects.cfg
        - /opt/nagios/etc/objects/commands.cfg
        - /opt/nagios/etc/objects/contacts.cfg
        - /opt/nagios/etc/objects/timeperiods.cfg
        - /opt/nagios/etc/objects/templates.cfg
        - /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
      object_cache_file: /opt/nagios/var/objects.cache
      precached_object_file: /opt/nagios/var/objects.precache
      resource_file: /opt/nagios/etc/resource.cfg
      status_file: /opt/nagios/var/status.dat
      status_update_interval: 10
      nagios_user: nagios
      nagios_group: nagios
      check_external_commands: 1
      command_file: /opt/nagios/var/rw/nagios.cmd
      lock_file: /var/run/nagios.lock
      temp_file: /opt/nagios/var/nagios.tmp
      temp_path: /tmp
      event_broker_options: -1
      log_rotation_method: d
      log_archive_path: /opt/nagios/var/archives
      use_syslog: 1
      log_service_retries: 1
      log_host_retries: 1
      log_event_handlers: 1
      log_initial_states: 0
      log_current_states: 1
      log_external_commands: 1
      log_passive_checks: 1
      service_inter_check_delay_method: s
      max_service_check_spread: 30
      service_interleave_factor: s
      host_inter_check_delay_method: s
      max_host_check_spread: 30
      max_concurrent_checks: 0
      check_result_reaper_frequency: 10
      max_check_result_reaper_time: 30
      check_result_path: /opt/nagios/var/spool/checkresults
      max_check_result_file_age: 3600
      cached_host_check_horizon: 15
      cached_service_check_horizon: 15
      enable_predictive_host_dependency_checks: 1
      enable_predictive_service_dependency_checks: 1
      soft_state_dependencies: 0
      auto_reschedule_checks: 0
      auto_rescheduling_interval: 30
      auto_rescheduling_window: 180
      service_check_timeout: 60
      host_check_timeout: 30
      event_handler_timeout: 30
      notification_timeout: 30
      ocsp_timeout: 5
      perfdata_timeout: 5
      retain_state_information: 1
      state_retention_file: /opt/nagios/var/retention.dat
      retention_update_interval: 60
      use_retained_program_state: 1
      use_retained_scheduling_info: 1
      retained_host_attribute_mask: 0
      retained_service_attribute_mask: 0
      retained_process_host_attribute_mask: 0
      retained_process_service_attribute_mask: 0
      retained_contact_host_attribute_mask: 0
      retained_contact_service_attribute_mask: 0
      interval_length: 60
      check_for_updates: 1
      bare_update_check: 0
      use_aggressive_host_checking: 0
      execute_service_checks: 1
      accept_passive_service_checks: 1
      execute_host_checks: 1
      accept_passive_host_checks: 1
      enable_notifications: 1
      enable_event_handlers: 1
      process_performance_data: 0
      obsess_over_services: 0
      obsess_over_hosts: 0
      translate_passive_host_checks: 0
      passive_host_checks_are_soft: 0
      check_for_orphaned_services: 1
      check_for_orphaned_hosts: 1
      check_service_freshness: 1
      service_freshness_check_interval: 60
      check_host_freshness: 0
      host_freshness_check_interval: 60
      additional_freshness_latency: 15
      enable_flap_detection: 1
      low_service_flap_threshold: 5.0
      high_service_flap_threshold: 20.0
      low_host_flap_threshold: 5.0
      high_host_flap_threshold: 20.0
      date_format: us
      use_regexp_matching: 0
      use_true_regexp_matching: 0
      daemon_dumps_core: 0
      use_large_installation_tweaks: 0
      enable_environment_macros: 0
      debug_level: 0
      debug_verbosity: 1
      debug_file: /opt/nagios/var/nagios.debug
      max_debug_file_size: 1000000
      allow_empty_hostgroup_assignment: 1