openstack-ansible/rpc_deployment/playbooks/monitoring/maas_local.yml
Kevin Carter 72b4fe097d Merge pull request #153 from git-harry/cdm
Add CDM checks and alarms
2014-09-29 21:33:07 -05:00

349 lines
16 KiB
YAML

---
# Copyright 2014, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- hosts: cinder_api
vars:
check_name: cinder_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'cinder_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["cinder_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: cinder_scheduler
vars:
check_name: cinder_scheduler_check
check_details: file=cinder_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'cinder_scheduler_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["cinder-scheduler_status"] != 1) { return new AlarmStatus(CRITICAL, "cinder-scheduler down"); }' }
user: root
roles:
- maas_local
- hosts: cinder_volume
vars:
check_name: cinder_volume_check
check_details: file=cinder_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'cinder_volume_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["cinder-volume_status"] != 1) { return new AlarmStatus(CRITICAL, "cinder-volume down"); }' }
user: root
roles:
- maas_local
- hosts: glance_api
vars:
check_name: glance_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'glance_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["glance_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: glance_registry
vars:
check_name: glance_registry_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'glance_registry_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["glance_registry_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: heat_api
vars:
check_name: heat_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'heat_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["heat_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: heat_api_cfn
vars:
check_name: heat_cfn_api_check
check_details: file=service_api_local_check.py,args=heat_cfn,args={{ ansible_ssh_host }},args=8000
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'heat_cfn_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["heat_cfn_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: heat_api_cloudwatch
vars:
check_name: heat_cw_api_check
check_details: file=service_api_local_check.py,args=heat_cw,args={{ ansible_ssh_host }},args=8003
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'heat_cw_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["heat_cw_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: horizon
vars:
check_name: horizon_local_check
check_details: file=horizon_check.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'horizon_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["horizon_local_status"] != 1) { return new AlarmStatus(CRITICAL, "Horizon unavailable"); }' }
user: root
roles:
- maas_local
- hosts: keystone
vars:
check_name: keystone_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'keystone_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["keystone_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: neutron_server
vars:
check_name: neutron_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: neutron_dhcp_agent
vars:
check_name: neutron_dhcp_agent_check
check_details: file=neutron_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_dhcp_agent_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron-dhcp-agent_status"] != 1) { return new AlarmStatus(CRITICAL, "neutron-dhcp-agent down"); }' }
user: root
roles:
- maas_local
- hosts: neutron_l3_agent
vars:
check_name: neutron_l3_agent_check
check_details: file=neutron_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_l3_agent_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron-l3-agent_status"] != 1) { return new AlarmStatus(CRITICAL, "neutron-l3-agent down"); }' }
user: root
roles:
- maas_local
- hosts: neutron_linuxbridge_agent
vars:
check_name: neutron_linuxbridge_agent_check
check_details: file=neutron_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_linuxbridge_agent_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron-linuxbridge-agent_status"] != 1) { return new AlarmStatus(CRITICAL, "neutron-linuxbridge-agent down"); }' }
user: root
roles:
- maas_local
- hosts: neutron_metadata_agent
vars:
check_name: neutron_metadata_agent_check
check_details: file=neutron_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_metadata_agent_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron-metadata-agent_status"] != 1) { return new AlarmStatus(CRITICAL, "neutron-metadata-agent down"); }' }
user: root
roles:
- maas_local
- hosts: neutron_l3_agent
vars:
check_name: neutron_metering_agent_check
check_details: file=neutron_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'neutron_metering_agent_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["neutron-metering-agent_status"] != 1) { return new AlarmStatus(CRITICAL, "neutron-metering-agent down"); }' }
- hosts: nova_api_metadata
vars:
check_name: nova_api_metadata_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_api_metadata_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova_api_metadata_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API (metadata) unavailable"); }' }
user: root
roles:
- maas_local
- hosts: nova_api_os_compute
vars:
check_name: nova_api_local_check
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API (os-compute) unavailable"); }' }
user: root
roles:
- maas_local
- hosts: nova_compute
vars:
check_name: nova_compute_check
check_details: file=nova_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_compute_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova-compute_status"] != 1) { return new AlarmStatus(CRITICAL, "nova-compute down"); }' }
user: root
roles:
- maas_local
- hosts: nova_cert
vars:
check_name: nova_cert_check
check_details: file=nova_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_cert_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova-cert_status"] != 1) { return new AlarmStatus(CRITICAL, "nova-cert down"); }' }
user: root
roles:
- maas_local
- hosts: nova_conductor
vars:
check_name: nova_conductor_check
check_details: file=nova_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_conductor_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova-conductor_status"] != 1) { return new AlarmStatus(CRITICAL, "nova-conductor down"); }' }
user: root
roles:
- maas_local
- hosts: nova_scheduler
vars:
check_name: nova_scheduler_check
check_details: file=nova_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_scheduler_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova-scheduler_status"] != 1) { return new AlarmStatus(CRITICAL, "nova-scheduler down"); }' }
user: root
roles:
- maas_local
- hosts: nova_spice_console
vars:
check_name: nova_consoleauth_check
check_details: file=nova_service_check.py,args=--host,args={{ ansible_hostname }},args={{ internal_vip_address }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_consoleauth_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova-consoleauth_status"] != 1) { return new AlarmStatus(CRITICAL, "nova-consoleauth down"); }' }
user: root
roles:
- maas_local
- hosts: nova_spice_console
vars:
check_name: nova_spice_console_check
check_details: file=service_api_local_check.py,args=nova_spice,args={{ ansible_ssh_host }},args=6082
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'nova_spice_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["nova_spice_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "API unavailable"); }' }
user: root
roles:
- maas_local
- hosts: rabbit
vars:
check_name: rabbitmq_status
check_details: file={{ check_name }}.py,args=-H,args={{ ansible_ssh_host }},args=-n,args={{ inventory_hostname.split('.')[0] }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'disk_free_alarm', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_free_alarm"] != 1) { return new AlarmStatus(CRITICAL, "disk_free_alarm triggered"); }' }
- { 'name': 'mem_alarm', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["mem_alarm"] != 1) { return new AlarmStatus(CRITICAL, "mem_alarm triggered"); }' }
user: root
roles:
- maas_local
- hosts: galera
vars:
check_name: galera_check
check_details: file={{ check_name }}.py,args=-H,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
user: root
roles:
- maas_local
- hosts: memcached
vars:
check_name: memcached_status
check_details: file={{ check_name }}.py,args={{ ansible_ssh_host }}
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'memcache_api_local_status', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["memcache_api_local_status"] != 1) { return new AlarmStatus(CRITICAL, "memcached unavailable"); }' }
user: root
roles:
- maas_local
- hosts: hosts
vars:
check_name: disk_utilisation
check_details: file={{ check_name }}.py
check_period: "{{ maas_check_period }}"
check_timeout: "{{ maas_check_timeout }}"
alarms:
- { 'name': 'percentage_disk_utilisation_sda', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sda"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sda >= 90%"); }' }
- { 'name': 'percentage_disk_utilisation_sdb', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sdb"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sdb >= 90%"); }' }
- { 'name': 'percentage_disk_utilisation_sdc', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sdc"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sdc >= 90%"); }' }
user: root
roles:
- maas_local