2014-05-01 12:12:07 -06:00

157 lines
5.9 KiB
Python

#!/usr/bin/env python
#
"""smoke
Runs a smoke test of the jahmon installation on mini-mon by ensuring metrics are flowing and creating a new
notification, alarm and that the Threshold Engine changes the state of the alarm.
This requires the mon CLI and must be run on one of the mini-mon VMs. Tested running on kafka VM.
Get it by following the instructions on https://wiki.hpcloud.net/display/iaas/Monitoring+CLI.
If you want to see the notification, you must install postfix on the kafka VM, configure it to be local, and
modify /etc/mon/notification.yaml to use localhost for the email server, then restart
TODO:
1. Add check of alarm history when that works
2. Add check of notification history when that is implemented
3. Add check of mail getting to root when postfix is added to mini-mon. This script will have to run on the kafka VM
"""
from __future__ import print_function
import json
import sys
import os
import subprocess
import time
# Environment read by the mon CLI. The shell equivalent would be:
#   export OS_AUTH_TOKEN=82510970543135
#   export OS_NO_CLIENT_AUTH=1
#   export MON_API_URL=http://192.168.10.4:8080/v2.0/
# These are set unconditionally, so any values already present in the
# caller's environment are overwritten.
os.environ.update({
    "OS_AUTH_TOKEN": "82510970543135",
    "OS_NO_CLIENT_AUTH": "1",
    "MON_API_URL": "http://192.168.10.4:8080/v2.0/",
})
def get_alarm_state(alarm_id):
    """Return the current state of the alarm identified by *alarm_id*.

    Runs ``mon --json alarm-show <alarm_id>`` and returns the ``state``
    field of the decoded JSON response.
    """
    show_command = ["mon", "--json", "alarm-show", alarm_id]
    alarm = json.loads(run_mon_cli(show_command))
    return alarm['state']
def create_alarm(name, expression, notification_method_id, description=None):
    """Create an alarm through the mon CLI and return its id.

    The same notification method is attached to the alarm, ok and
    undetermined actions.

    name -- name of the new alarm
    expression -- alarm expression passed to the CLI as-is
    notification_method_id -- id of the notification method to attach
    description -- optional description; the ``--description`` option is
                   only passed when this is truthy

    Returns the ``id`` field of the JSON document printed by the CLI.
    """
    command = ["mon", "alarm-create"]
    if description:
        command += ["--description", description]
    # Every state transition notifies the same method.
    for action_flag in ("--alarm-actions", "--ok-actions", "--undetermined-actions"):
        command += [action_flag, notification_method_id]
    command += [name, expression]

    print("Creating alarm")
    created = json.loads(run_mon_cli(command))
    return created['id']
def get_metrics(name, dimensions):
    """Fetch the measurements of metric *name* through the mon CLI.

    name -- metric name
    dimensions -- mapping of dimension name to value; passed to the CLI as a
                  single comma-separated ``key=value`` list

    Returns the decoded JSON output of ``mon --json measurement-list``.
    """
    print("Getting metrics for " + name)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    dimensions_arg = ",".join(
        "{0}={1}".format(key, value) for key, value in dimensions.items()
    )
    # NOTE(review): the trailing "00" is presumably the start-time positional
    # argument of measurement-list -- confirm against the CLI usage.
    stdout = run_mon_cli(
        ["mon", "--json", "measurement-list", "--dimensions", dimensions_arg, name, "00"]
    )
    return json.loads(stdout)
def run_mon_cli(args):
    """Run a command and return what it wrote to stdout.

    args -- the full argument vector, program name first

    Returns the captured standard output of the command.

    If the command exits with a non-zero status, or cannot be started at all
    (for example the executable is not installed or not on PATH), the error
    is printed to stderr and the script exits with status 1.
    """
    try:
        return subprocess.check_output(args)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary, which would
        # otherwise surface as an unhandled traceback.
        print(e, file=sys.stderr)
        sys.exit(1)
def create_notification(notification_name, notification_email_addr):
    """Create an EMAIL notification method through the mon CLI and return its id.

    notification_name -- name of the new notification method
    notification_email_addr -- address the notification is sent to

    Returns the ``id`` field of the JSON document printed by the CLI.
    """
    print("Creating notification")
    command = ["mon", "notification-create", notification_name, "EMAIL", notification_email_addr]
    created = json.loads(run_mon_cli(command))
    return created['id']
def find_id_for_name(object_json, name):
    """Return the id of the first object whose name equals *name*.

    object_json -- iterable of mappings, each with 'name' and 'id' keys;
                   ``None`` (e.g. a decoded JSON ``null``) is treated as empty
    name -- the name to look for

    Returns the matching object's ``'id'`` value, or ``None`` when nothing
    matches.
    """
    for obj in object_json or ():
        if name == obj['name']:
            return obj['id']
    return None
def cleanup(notification_name, alarm_name):
    """Delete the alarm and notification left over from a previous run, if any.

    For each resource the full list is fetched through the mon CLI, searched
    by name, and the first match (if there is one) is deleted.
    """
    # The alarm is handled before the notification, in that order.
    leftovers = (
        ("alarm-list", "alarm-delete", alarm_name),
        ("notification-list", "notification-delete", notification_name),
    )
    for list_command, delete_command, wanted_name in leftovers:
        listing = json.loads(run_mon_cli(["mon", "--json", list_command]))
        stale_id = find_id_for_name(listing, wanted_name)
        if stale_id:
            run_mon_cli(["mon", delete_command, stale_id])
def _count_measurements(metric_json):
    """Return the number of measurements in the first metric, or 0 if there is none."""
    if not metric_json:
        return 0
    return len(metric_json[0]['measurements'])


def main():
    """Run the smoke test end to end.

    Steps:
      1. Remove the alarm/notification left over from a previous run.
      2. Record how many measurements exist for the test metric.
      3. Create a notification method and an alarm that uses it.
      4. Check the alarm starts as UNDETERMINED, then poll until it leaves
         that state and verify it ended up in ALARM.
      5. Verify that more measurements exist than at the start.

    Returns 0 on success. Any failed check prints a message to stderr and
    exits the process with status 1.
    """
    notification_name = "Jahmon Smoke Test"
    notification_email_addr = "root@kafka"
    alarm_name = "high cpu and load"
    metric_name = "cpu_user_perc"
    metric_dimensions = {"hostname": "thresh"}
    max_wait_seconds = 250    # how long to poll for the alarm state change
    min_metric_seconds = 30   # minimum time to allow for a new metric to arrive

    cleanup(notification_name, alarm_name)

    # Query how many metrics there are for the Alarm
    metric_json = get_metrics(metric_name, metric_dimensions)
    initial_num_metrics = _count_measurements(metric_json)

    # Create Notification through CLI
    notification_method_id = create_notification(notification_name, notification_email_addr)

    # Create Alarm through CLI
    alarm_id = create_alarm(
        alarm_name,
        "max(cpu_user_perc{hostname=thresh}) > 1 and max(load_avg_1_min{hostname=thresh}) > 6",
        notification_method_id,
        "CPU Utilization exceeds 1% and Load exeeds 6 per measurement period",
    )
    state = get_alarm_state(alarm_id)

    # Ensure it is created in the right state
    if state != 'UNDETERMINED':
        print("Wrong initial alarm state, expected UNDETERMINED but was " + state, file=sys.stderr)
        sys.exit(1)

    # Wait for it to change state, polling once per second
    print("Waiting for alarm to change state")
    change_time = 0
    for x in range(0, max_wait_seconds):
        time.sleep(1)
        state = get_alarm_state(alarm_id)
        if state != 'UNDETERMINED':
            print("Alarm state changed in " + str(x) + " seconds")
            change_time = x
            break

    # Also reached when the poll loop timed out with the alarm still UNDETERMINED.
    if state != 'ALARM':
        print("Wrong final state, expected ALARM but was " + state, file=sys.stderr)
        sys.exit(1)
    print("Final state of alarm was " + state)

    # If the alarm changes state too fast, then there isn't time for the new metric to arrive.
    # Unlikely, but it has been seen
    if change_time < min_metric_seconds:
        time.sleep(min_metric_seconds - change_time)
        change_time = min_metric_seconds

    metric_json = get_metrics(metric_name, metric_dimensions)
    final_num_metrics = _count_measurements(metric_json)
    if final_num_metrics <= initial_num_metrics:
        print("No new metrics received", file=sys.stderr)
        sys.exit(1)
    print("Received " + str(final_num_metrics - initial_num_metrics) + " metrics in " + str(change_time) + " seconds")
    return 0
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())