From badbea58f2c8c7a3c19e3401beae830d2e06e70a Mon Sep 17 00:00:00 2001
From: git-harry
Date: Thu, 18 Sep 2014 11:05:40 +0100
Subject: [PATCH] Add CDM checks and alarms

Add a maas_cdm role, and a playbook applying it to the "hosts" group,
that creates the agent.cpu, agent.filesystem and agent.memory checks
with one WARNING alarm each. Add a play to maas_local.yml that runs the
maas_local role for the disk_utilisation check with alarms for sda, sdb
and sdc.

---
 .../playbooks/monitoring/maas_cdm.yml        | 19 ++++++
 .../playbooks/monitoring/maas_local.yml      | 15 +++++
 rpc_deployment/roles/maas_cdm/tasks/cdm.yml  | 58 +++++++++++++++++++
 rpc_deployment/roles/maas_cdm/tasks/main.yml | 50 ++++++++++++++++
 4 files changed, 142 insertions(+)
 create mode 100644 rpc_deployment/playbooks/monitoring/maas_cdm.yml
 create mode 100644 rpc_deployment/roles/maas_cdm/tasks/cdm.yml
 create mode 100644 rpc_deployment/roles/maas_cdm/tasks/main.yml

diff --git a/rpc_deployment/playbooks/monitoring/maas_cdm.yml b/rpc_deployment/playbooks/monitoring/maas_cdm.yml
new file mode 100644
index 0000000000..3d01b30cd8
--- /dev/null
+++ b/rpc_deployment/playbooks/monitoring/maas_cdm.yml
@@ -0,0 +1,19 @@
+---
+# Copyright 2014, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Apply the maas_cdm role to every host in the "hosts" group.
+- hosts: hosts
+  roles:
+    - maas_cdm
diff --git a/rpc_deployment/playbooks/monitoring/maas_local.yml b/rpc_deployment/playbooks/monitoring/maas_local.yml
index dece09b80d..51c6a96393 100644
--- a/rpc_deployment/playbooks/monitoring/maas_local.yml
+++ b/rpc_deployment/playbooks/monitoring/maas_local.yml
@@ -323,3 +323,18 @@
   user: root
   roles:
     - maas_local
+
+# Warn when disk utilisation for sda, sdb or sdc reaches 90%.
+- hosts: hosts
+  vars:
+    check_name: disk_utilisation
+    check_details: file={{ check_name }}.py
+    check_period: "{{ maas_check_period }}"
+    check_timeout: "{{ maas_check_timeout }}"
+    alarms:
+      - { 'name': 'percentage_disk_utilisation_sda', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sda"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sda >= 90%"); }' }
+      - { 'name': 'percentage_disk_utilisation_sdb', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sdb"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sdb >= 90%"); }' }
+      - { 'name': 'percentage_disk_utilisation_sdc', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["disk_utilisation_sdc"] >= 90.0) { return new AlarmStatus(WARNING, "Disk utilisation for sdc >= 90%"); }' }
+  user: root
+  roles:
+    - maas_local
diff --git a/rpc_deployment/roles/maas_cdm/tasks/cdm.yml b/rpc_deployment/roles/maas_cdm/tasks/cdm.yml
new file mode 100644
index 0000000000..25b846b62b
--- /dev/null
+++ b/rpc_deployment/roles/maas_cdm/tasks/cdm.yml
@@ -0,0 +1,58 @@
+---
+# Copyright 2014, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Ensure that a monitoring check, and any alarms defined for it, exist on the
+# entity whose label matches this host.
+#
+# Variables read here: check_name, check_details, check_period, check_timeout,
+# agent_type, maas_notification_plan and alarms (a list of mappings with 'name'
+# and 'criteria' keys). maas_fqdn_extension defaults to an empty string.
+#
+# Each grep pattern is rendered as one Jinja2 expression and quoted as a whole.
+# Applying the quote filter to single variables inside a double-quoted shell
+# string leaves literal single quotes in the pattern whenever a value contains
+# a space (e.g. the alarm name 'Memory used'), so the lookup can never match.
+# grep -F makes the label a fixed string rather than a regular expression.
+
+- name: Get entity ID for physical_host
+  shell: raxmon-entities-list | grep -F {{ ('label=' ~ inventory_hostname ~ (maas_fqdn_extension|default('')) ~ ' ')|quote }} | sed -e 's/^.* id=\(.*\) label=.*$/\1/g'
+  register: entity_id
+
+- name: Validate if check exists
+  shell: raxmon-checks-list --entity-id {{ entity_id.stdout|quote }} | grep -F {{ ('label=' ~ check_name ~ '--' ~ inventory_hostname)|quote }}
+  register: check_exists
+  ignore_errors: True
+
+- name: Create check if it does not exist
+  command: raxmon-checks-create --entity-id {{ entity_id.stdout }} --type {{ agent_type }} --label {{ check_name }}--{{ inventory_hostname }} --details {{ check_details }} --period {{ check_period }} --timeout {{ check_timeout }}
+  when: check_exists|failed
+
+- name: Get check ID for newly created check
+  shell: raxmon-checks-list --entity-id {{ entity_id.stdout|quote }} | grep -F {{ ('label=' ~ check_name ~ '--' ~ inventory_hostname)|quote }} | sed -e 's/^.* id=\(.*\) label=.*$/\1/g'
+  register: check_id
+
+- name: Validate if alarm exists
+  shell: raxmon-alarms-list --entity-id {{ entity_id.stdout|quote }} | grep -F {{ ('label=' ~ item.name ~ '--' ~ inventory_hostname)|quote }}
+  register: alarm_exists
+  ignore_errors: True
+  when: alarms is defined
+  with_items: alarms
+
+- name: Create alarm if it does not exist
+  shell: raxmon-alarms-create --entity-id {{ entity_id.stdout|quote }} --check-id {{ check_id.stdout|quote }} --notification-plan {{ maas_notification_plan|quote }} --label {{ item[1].name|quote }}--{{ inventory_hostname|quote }} --criteria {{ item[1].criteria|quote }}
+  when: item[0]|failed and alarms is defined
+  with_together:
+    - alarm_exists.results
+    - alarms
diff --git a/rpc_deployment/roles/maas_cdm/tasks/main.yml b/rpc_deployment/roles/maas_cdm/tasks/main.yml
new file mode 100644
index 0000000000..ba6d875354
--- /dev/null
+++ b/rpc_deployment/roles/maas_cdm/tasks/main.yml
@@ -0,0 +1,50 @@
+---
+# Copyright 2014, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Include cdm.yml once each for the cpu, filesystem (target=/) and memory
+# checks; every check carries a single WARNING alarm.
+
+- include: cdm.yml
+  vars:
+    check_name: cpu
+    check_details: "{}"
+    check_period: "{{ maas_check_period }}"
+    check_timeout: "{{ maas_check_timeout }}"
+    agent_type: "agent.cpu"
+    alarms:
+      - { 'name': 'idle_percent_average', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (metric["idle_percent_average"] <= 10.0) { return new AlarmStatus(WARNING, "CPU time spent idle has dropped to <= 10%"); }' }
+  user: root
+
+- include: cdm.yml
+  vars:
+    check_name: filesystem
+    check_details: "target=/"
+    check_period: "{{ maas_check_period }}"
+    check_timeout: "{{ maas_check_timeout }}"
+    agent_type: "agent.filesystem"
+    alarms:
+      - { 'name': 'Disk space used on /', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (percentage(metric["used"], metric["total"]) >= 95.0) { return new AlarmStatus(WARNING, "Root filesystem is >= 95% full."); }' }
+  user: root
+
+- include: cdm.yml
+  vars:
+    check_name: memory
+    check_details: "{}"
+    check_period: "{{ maas_check_period }}"
+    check_timeout: "{{ maas_check_timeout }}"
+    agent_type: "agent.memory"
+    alarms:
+      - { 'name': 'Memory used', 'criteria': ':set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (percentage(metric["actual_used"], metric["total"]) >= 95.0) { return new AlarmStatus(WARNING, "Memory is 95%+ in use."); }' }
+  user: root