# Nagios service object definitions, grouped by the component they monitor.
# Each key under conf.nagios.objects carries a `template` literal block that
# holds plain Nagios `define service { ... }` stanzas.
#
# Every check_command in this file has the shape
#   check_prom_alert!<alert name>!<CRITICAL message>!<OK message>
# where '!' separates the Nagios command name from its arguments.
#
# NOTE(review): the first argument presumably has to match an alert rule name
# defined outside this file. `rabbitmq_network_pratitions_detected` and
# `openstack_nova_scheduler_down` look misspelled / inconsistent with their
# neighbours, but they are left untouched here -- confirm against the alert
# rules before renaming them.
# NOTE(review): the {instance} and {url} placeholders are presumably filled in
# by check_prom_alert -- confirm.
#
# Keep each service_description unique within the hostgroup.
conf:
  nagios:
    objects:
      # MariaDB / Galera checks.
      mariadb:
        template: |
          define service {
            check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_MariaDB
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
            hostgroup_name prometheus-hosts
            service_description Mariadb_table-lock-waits-high
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-ready
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-synchronized
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
            hostgroup_name prometheus-hosts
            service_description Mariadb_innodb-replication-lag
            use generic-service
          }
      # RabbitMQ checks.
      rabbitmq:
        template: |
          define service {
            check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_network-partitions-exist
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_up
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_file-descriptor-usage
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-disk-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-memory-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_high-availability
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_message-return-percent
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_consumer-utilization
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_rabbitmq-queue-health
            use generic-service
          }
      # OpenStack API, agent/service and quota checks. Apart from the exporter
      # check at the end, these use the notifying_service template with an
      # explicit check_interval.
      openstack:
        template: |
          define service {
            check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_glance
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_nova
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_keystone
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_neutron
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-metadata-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-openvswitch-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-dhcp-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-l3-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_swift
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_heat
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_cinder
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_cinder-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-compute
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-conductor
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-consoleauth
            use notifying_service
          }

          define service {
            check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_VCPU-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_RAM-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_Disk-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_Openstack
            use generic-service
          }