# openstack-helm-infra/nagios/values_overrides/openstack-objects.yaml

# Values override that supplies Nagios object definitions, grouped per
# monitored component under conf.nagios.objects. Each group carries a
# `template` literal block whose text is Nagios `define service { ... }`
# stanzas. Everything inside a `|` block is part of the string value, so YAML
# comments must stay outside of it.
# NOTE(review): how the chart renders these templates into Nagios config files
# is not visible in this file - confirm against the chart templates.
conf:
  nagios:
    objects:
      # MariaDB / Galera service checks.
      # Every stanza uses the Nagios `command!arg1!arg2!arg3` form of
      # check_command: the command is `check_prom_alert`, followed by an alert
      # name, the message used for the CRITICAL state and the message used for
      # the OK state. All checks target the `prometheus-hosts` hostgroup and
      # inherit from the `generic-service` template.
      # NOTE(review): `check_prom_alert`, `prometheus-hosts` and
      # `generic-service` are defined elsewhere; the alert names presumably
      # have to match Prometheus alerting rule names exactly, and `{instance}`
      # is presumably a label placeholder filled in by the check plugin -
      # verify both against the plugin and the rule definitions.
      mariadb:
        template: |
          define service {
            check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_MariaDB
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
            hostgroup_name prometheus-hosts
            service_description Mariadb_table-lock-waits-high
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-ready
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-synchronized
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
            hostgroup_name prometheus-hosts
            service_description Mariadb_innodb-replication-lag
            use generic-service
          }
      # RabbitMQ service checks.
      # Every stanza uses the Nagios `command!arg1!arg2!arg3` form of
      # check_command: `check_prom_alert`, then an alert name, the CRITICAL
      # message and the OK message. All checks target the `prometheus-hosts`
      # hostgroup and inherit from the `generic-service` template.
      # NOTE(review): the alert name `rabbitmq_network_pratitions_detected`
      # looks misspelled ("pratitions"), but it presumably has to match the
      # Prometheus alerting rule name character for character, so it is left
      # untouched here - rename it only together with the rule.
      # NOTE(review): `{instance}` is presumably a label placeholder filled in
      # by the check plugin - confirm against the plugin.
      rabbitmq:
        template: |
          define service {
            check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_network-partitions-exist
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_up
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_file-descriptor-usage
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-disk-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-memory-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_high-availability
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_message-return-percent
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_consumer-utilization
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_rabbitmq-queue-health
            use generic-service
          }
      # OpenStack API, agent/service and quota checks.
      # Every stanza uses the Nagios `command!arg1!arg2!arg3` form of
      # check_command: `check_prom_alert`, then an alert name, the CRITICAL
      # message and the OK message. All checks target the `prometheus-hosts`
      # hostgroup. The API, service and quota checks inherit from
      # `notifying_service` and set `check_interval 60`; the exporter
      # availability check at the end inherits from `generic-service` and
      # keeps that template's interval.
      # Each service_description appears exactly once: Nagios treats two
      # definitions with the same description on the same host as duplicates.
      # NOTE(review): `openstack_nova_scheduler_down` uses a different prefix
      # from its `os_nova_*` siblings; alert names presumably have to match
      # the Prometheus alerting rule names exactly, so it is left untouched -
      # confirm against the rule definitions before renaming.
      # NOTE(review): `{url}` is presumably a label placeholder filled in by
      # the check plugin - confirm against the plugin.
      openstack:
        template: |
          define service {
            check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_glance
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_nova
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_keystone
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_neutron
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-metadata-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-openvswitch-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-dhcp-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-l3-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_swift
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_heat
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_cinder
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_cinder-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-compute
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-conductor
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-consoleauth
            use notifying_service
          }

          define service {
            check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_VCPU-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_RAM-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_Disk-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_Openstack
            use generic-service
          }