Merge branch 'master' of github.com:hpcloud-mon/mon-vagrant

This commit is contained in:
Tim Kuhlman 2014-05-13 15:19:47 -06:00
commit 25e15e67fc
3 changed files with 377 additions and 91 deletions

98
tests/measurement_test.py Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
#
"""measurements
"""
from __future__ import print_function
import sys
import time
import pytz
from datetime import datetime
from monclient import client
import monclient.exc as exc
mon_client = None
def call_mon_api(method, fields):
    """Invoke a mon client API method and hand back its response.

    Args:
        method: callable to invoke (a bound method of the mon client).
        fields: dict expanded into keyword arguments for ``method``.

    Returns:
        Whatever ``method`` returns.

    If ``method`` raises ``exc.HTTPException`` the error's code and message
    are printed and the process exits with status 1.
    """
    try:
        return method(**fields)
    except exc.HTTPException as http_error:
        print(http_error.code)
        print(http_error.message)
        sys.exit(1)
def create_timestamp(seconds):
    """Render an epoch time as a UTC timestamp string.

    Args:
        seconds: seconds since the epoch, as returned by ``time.time()``.

    Returns:
        The time formatted with ``%Y-%m-%dT%H:%M:%S%z`` after being
        localized to UTC via pytz.
    """
    naive_utc = datetime.utcfromtimestamp(seconds)
    aware_utc = pytz.utc.localize(naive_utc)
    return aware_utc.strftime("%Y-%m-%dT%H:%M:%S%z")
def main():
    """Send measurements for one metric and verify they can be read back.

    Command line: ``<script> metric_name count``.

    The script creates a mon client against a hard-coded endpoint and token,
    posts ``count`` measurements whose values are 0 .. count-1, then polls
    ``metrics.list_measurements`` (up to 30 times, one second apart) until
    all of them are visible. Finally it checks the value (index 2) of every
    returned measurement.

    Returns:
        0 when every measurement arrived with the expected value, 1 on a
        usage error, when measurements are missing, or when any value does
        not match.
    """
    # Both positional arguments are required; validate them before touching
    # the API so a bad invocation fails with a usage message, not a traceback.
    if len(sys.argv) < 3:
        print('usage: %s metric_name count' % sys.argv[0], file=sys.stderr)
        return 1
    metric_name = sys.argv[1]
    try:
        num_metrics_to_send = int(sys.argv[2])
    except ValueError:
        print('usage: %s metric_name count' % sys.argv[0], file=sys.stderr)
        return 1

    api_version = '2_0'
    endpoint = 'http://192.168.10.4:8080/v2.0'
    kwargs = {
        'token': '82510970543135'
    }
    mon_client = client.Client(api_version, endpoint, **kwargs)

    metric_start_time = time.time()
    dimensions = {'Test_Send': 'Number_1'}  # Should be arg
    start_time = time.time()
    fields = {'name': metric_name}
    fields['dimensions'] = dimensions
    for val in range(0, num_metrics_to_send):
        fields['value'] = str(val)
        fields['timestamp'] = time.time()
        call_mon_api(mon_client.metrics.create, fields)
    print("Took %d seconds to send %d measurements" %
          ((time.time() - start_time), num_metrics_to_send))
    metric_end_time = time.time()
    # API requires end time to be greater than start time
    if (metric_end_time - metric_start_time) < 1:
        metric_end_time = metric_start_time + 1
    start_timestamp = create_timestamp(metric_start_time)
    end_timestamp = create_timestamp(metric_end_time)

    fields = {'name': metric_name}
    fields['dimensions'] = dimensions
    fields['start_time'] = start_timestamp
    fields['end_time'] = end_timestamp

    # Pre-bind so the checks after the loop are safe even if the API never
    # returns a result for this metric.
    measurements = []
    for i in range(0, 30):
        result = call_mon_api(mon_client.metrics.list_measurements, fields)
        if len(result) > 0:
            measurements = result[0]['measurements']
            if len(measurements) >= num_metrics_to_send:
                break
            print('Found %d of %d metrics so far' %
                  (len(measurements), num_metrics_to_send))
        time.sleep(1)

    if len(result) == 0:
        print('Did not receive any metrics in %d seconds' % i,
              file=sys.stderr)
        return 1
    if len(measurements) != num_metrics_to_send:
        print('Expected %d measurements but found %d' %
              (num_metrics_to_send, len(measurements)), file=sys.stderr)
        return 1
    print('Took %d seconds for metrics to fully arrive' % i)

    # The check expects the values to count down from count-1; presumably the
    # API returns measurements newest first -- TODO confirm.
    expected = num_metrics_to_send - 1
    result = 0
    for index in range(0, num_metrics_to_send):
        value = measurements[index]
        if value[2] != expected:
            print('Expected %d but found %d for %d' %
                  (expected, value[2], index), file=sys.stderr)
            # Report the mismatch through the exit status as well.
            result = 1
        expected = expected - 1
    return result
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,118 @@
#!/usr/bin/env python
#
"""Notification Engine Test
Cycle the state of an Alarm the given number of times
"""
from __future__ import print_function
import sys
import time
import json
import subprocess
from monclient import client
import monclient.exc as exc
mon_client = None
def call_mon_api(method, fields):
    """Call ``method`` with ``fields`` as keyword arguments.

    Args:
        method: callable to invoke (a bound method of the mon client).
        fields: dict of keyword arguments passed to ``method``.

    Returns:
        The response returned by ``method``.

    An ``exc.HTTPException`` raised by the call is reported by printing its
    code and message, after which the process exits with status 1.
    """
    try:
        return method(**fields)
    except exc.HTTPException as api_error:
        print(api_error.code)
        print(api_error.message)
        sys.exit(1)
def find_alarm_id():
    """Return the id of the first alarm the API lists.

    Returns:
        The ``'id'`` of the first listed alarm, or None (after printing a
        hint to stderr) when the alarm list is empty.
    """
    alarms = call_mon_api(mon_client.alarms.list, {})
    if len(alarms) > 0:
        return alarms[0]['id']
    print('No existing alarms, create one and rerun test', file=sys.stderr)
    return None
def get_alarm_state(alarm_id):
    """Fetch the alarm with the given id and return its ``'state'`` field."""
    query = {'alarm_id': alarm_id}
    return call_mon_api(mon_client.alarms.get, query)['state']
def find_notifications(alarm_id):
    """Collect the states of all mailed notifications that mention an alarm.

    Runs ``sudo grep <alarm_id> /var/mail/root`` and parses every matching
    line as JSON.

    Args:
        alarm_id: id of the alarm to search the mailbox for.

    Returns:
        A list with the ``'state'`` value of each matching line, in mailbox
        order; an empty list when no line matches.

    Any grep failure other than "no match" is printed to stderr and the
    process exits with status 1.
    """
    args = ['sudo', 'grep', alarm_id, '/var/mail/root']
    try:
        stdout = subprocess.check_output(args)
    except subprocess.CalledProcessError as e:
        # grep exits with status 1 when no line was selected. That simply
        # means no notification has been delivered yet, which is a normal
        # situation both before the test starts and while polling.
        if e.returncode == 1:
            return []
        print(e, file=sys.stderr)
        sys.exit(1)
    return [json.loads(line)['state'] for line in stdout.splitlines()]
def main():
    """Cycle an alarm's state and verify a notification is mailed per change.

    Command line: ``<script> count [alarm-id]``. When no alarm id is given
    the first alarm listed by the API is used.

    The alarm is toggled between 'OK' and 'ALARM' ``count * 2`` times, and
    every patch is read back to confirm the state took effect. The script
    then polls the mailbox (up to 30 times, one second apart) until that
    many new notifications for the alarm have appeared.

    Returns:
        0 when all expected notifications arrived, 1 on a usage error, when
        no alarm is available, when a state change did not take effect, or
        when notifications are missing.
    """
    global mon_client

    # Validate the command line before creating the client so a bad
    # invocation yields a usage message rather than a traceback.
    if len(sys.argv) == 1:
        print('usage: %s count [alarm-id]' % sys.argv[0], file=sys.stderr)
        return 1
    try:
        num_cycles = int(sys.argv[1])
    except ValueError:
        print('usage: %s count [alarm-id]' % sys.argv[0], file=sys.stderr)
        return 1

    api_version = '2_0'
    endpoint = 'http://192.168.10.4:8080/v2.0'
    kwargs = {
        'token': '82510970543135'
    }
    mon_client = client.Client(api_version, endpoint, **kwargs)

    if len(sys.argv) > 2:
        alarm_id = sys.argv[2]
    else:
        alarm_id = find_alarm_id()
        if alarm_id is None:
            return 1

    start_time = time.time()
    initial_state = get_alarm_state(alarm_id)
    state = initial_state
    fields = {'alarm_id': alarm_id}
    # Notifications already in the mailbox must not be counted as ours.
    existing_notifications = find_notifications(alarm_id)
    notifications_sent = num_cycles * 2
    for _ in range(0, notifications_sent):
        if state == 'OK':
            state = 'ALARM'
        else:
            state = 'OK'
        fields['state'] = state
        call_mon_api(mon_client.alarms.patch, fields)
        new_state = get_alarm_state(alarm_id)
        if new_state != state:
            print('Expected new state %s but found %s' %
                  (state, new_state), file=sys.stderr)
            return 1
    print("Took %d seconds to send %d alarm state changes" %
          ((time.time() - start_time), num_cycles * 2))

    for i in range(0, 30):
        all_notifications = find_notifications(alarm_id)
        notifications_found = (len(all_notifications) -
                               len(existing_notifications))
        if notifications_found >= notifications_sent:
            break
        print('Found %d of %d expected notifications so far' %
              (notifications_found, notifications_sent))
        time.sleep(1)

    if notifications_found < notifications_sent:
        print('Expected %d notifications but found %d' %
              (notifications_sent, notifications_found), file=sys.stderr)
        return 1
    print('Took %d seconds for notifications to fully arrive' % i)
    return 0
# Run as a script: exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -1,16 +1,20 @@
#!/usr/bin/env python
#
"""smoke
Runs a smoke test of the jahmon installation on mini-mon by ensuring metrics are flowing and creating a new
notification, alarm and that the Threshold Engine changes the state of the alarm.
This requires the mon CLI and must be run on one of the mini-mon VMs. Tested running on kafka VM.
Get it by following the instructions on https://wiki.hpcloud.net/display/iaas/Monitoring+CLI.
If you want to see the notification, you must install postfix on the kakfa VM, configure it to be local, and
modify /etc/mon/notification.yaml to use localhost for the email server, then restart
Runs a smoke test of the jahmon installation on mini-mon by ensuring
metrics are flowing and creating a new notification, alarm and that the
Threshold Engine changes the state of the alarm. This requires the mon
CLI and must be run on one of the mini-mon VMs. Tested running on kafka VM.
Get it by following the instructions on
https://wiki.hpcloud.net/display/iaas/Monitoring+CLI.
If you want to see the notification, you must install postfix on the kafka
VM, configure it to be local, and modify /etc/mon/notification.yaml to use
localhost for the email server, then restart
TODO:
1. Add check of notification history when that is implemented
2. Add check of mail getting to root when postfix is added mini-mon. This script will have to run on the kafka VM
2. Add check of mail getting to root when postfix is added mini-mon.
This script will have to run on the kafka VM
"""
from __future__ import print_function
@ -25,93 +29,121 @@ import time
# export OS_NO_CLIENT_AUTH=1
# export MON_API_URL=http://192.168.10.4:8080/v2.0/
os.environ["OS_AUTH_TOKEN"] = "82510970543135"
os.environ["OS_NO_CLIENT_AUTH"] = "1"
os.environ["MON_API_URL"] = "http://192.168.10.4:8080/v2.0/"
os.environ['OS_AUTH_TOKEN'] = '82510970543135'
os.environ['OS_NO_CLIENT_AUTH'] = '1'
os.environ['MON_API_URL'] = 'http://192.168.10.4:8080/v2.0/'
def change_alarm_state(alarm_id, new_state):
    """Patch an alarm to ``new_state`` through the mon CLI and verify it.

    Args:
        alarm_id: id of the alarm to patch.
        new_state: state to set on the alarm.

    Returns:
        1 when the state reported by the CLI after the patch differs from
        ``new_state``; None otherwise.
    """
    print('Changing Alarm state to %s' % new_state)
    result_json = run_mon_cli(['alarm-patch', '--state', new_state, alarm_id])
    if result_json['state'] != new_state:
        # The requested state is the expected one; the CLI result is what
        # was actually observed.
        print('Alarm patch failed, expected state of %s but was %s' %
              (new_state, result_json['state']), file=sys.stderr)
        return 1
def get_alarm_state(alarm_id):
stdout = run_mon_cli(["mon", "--json", "alarm-show", alarm_id])
response_json = json.loads(stdout)
return response_json['state']
result_json = run_mon_cli(['alarm-show', alarm_id])
return result_json['state']
def check_alarm_history(alarm_id):
def check_alarm_history(alarm_id, states):
transitions = len(states) - 1
print('Checking Alarm History')
# Make take a little bit of time for Alarm history to flow all the way through
for x in range(0, 10):
stdout = run_mon_cli(["mon", "--json", "alarm-history", alarm_id])
response_json = json.loads(stdout)
if len(response_json) > 0:
# May take some time for Alarm history to flow all the way through
for _ in range(0, 10):
result_json = run_mon_cli(['alarm-history', alarm_id])
if len(result_json) >= transitions:
break
time.sleep(4)
result = True
if not check_expected(1, len(response_json), 'number of history entries'):
if not check_expected(transitions, len(result_json),
'number of history entries'):
return False
alarm_json = response_json[0]
if not check_expected('UNDETERMINED', alarm_json['old_state'], 'old_state'):
result_json.sort(key=lambda x: x['timestamp'])
for i in range(0, transitions):
old_state = states[i]
new_state = states[i+1]
alarm_json = result_json[i]
if not check_expected(old_state, alarm_json['old_state'], 'old_state'):
result = False
if not check_expected('ALARM', alarm_json['new_state'], 'new_state'):
if not check_expected(new_state, alarm_json['new_state'], 'new_state'):
result = False
if not check_expected(alarm_id, alarm_json['alarm_id'], 'alarm_id'):
result = False
if result:
print("Alarm History is OK")
print('Alarm History is OK')
return result
def check_expected(expected, actual, what):
if (expected == actual):
return True
print("Incorrect value for alarm history " + what + " expected '" + str(expected) + "' but was '" + str(actual) + "'", file=sys.stderr)
print("Incorrect value for alarm history %s expected '%s' but was '%s'" %
(what, str(expected), str(actual)), file=sys.stderr)
return False
def create_alarm(name, expression, notification_method_id, description=None):
args = ["mon", "alarm-create"]
args = ['alarm-create']
if (description):
args.append("--description")
args.append('--description')
args.append(description)
args.append("--alarm-actions")
args.append('--alarm-actions')
args.append(notification_method_id)
args.append("--ok-actions")
args.append('--ok-actions')
args.append(notification_method_id)
args.append("--undetermined-actions")
args.append('--undetermined-actions')
args.append(notification_method_id)
args.append(name)
args.append(expression)
print("Creating alarm")
stdout = run_mon_cli(args)
response_json = json.loads(stdout)
print('Creating alarm')
result_json = run_mon_cli(args)
# Parse out id
alarm_id = response_json['id']
alarm_id = result_json['id']
return alarm_id
def get_metrics(name, dimensions):
print("Getting metrics for " + name + str(dimensions))
dimensions_arg = ""
for key, value in dimensions.iteritems():
if dimensions_arg != "":
dimensions_arg = dimensions_arg + ","
dimensions_arg = dimensions_arg + key + "=" + value
stdout = run_mon_cli(["mon", "--json", "measurement-list", "--dimensions", dimensions_arg, name, "00"])
return json.loads(stdout)
def run_mon_cli(args):
def get_metrics(name, dimensions):
print('Getting metrics for %s ' % (name + str(dimensions)))
dimensions_arg = ''
for key, value in dimensions.iteritems():
if dimensions_arg != '':
dimensions_arg = dimensions_arg + ','
dimensions_arg = dimensions_arg + key + '=' + value
return run_mon_cli(['measurement-list', '--dimensions',
dimensions_arg, name, '00'])
def run_mon_cli(args, useJson=True):
if useJson:
args.insert(0, '--json')
args.insert(0, 'mon')
try:
stdout = subprocess.check_output(args)
if useJson:
return json.loads(stdout)
else:
return stdout
except subprocess.CalledProcessError as e:
print(e, file=sys.stderr)
sys.exit(1)
def create_notification(notification_name, notification_email_addr):
print("Creating notification")
stdout = run_mon_cli(["mon", "notification-create", notification_name, "EMAIL", notification_email_addr])
response_json = json.loads(stdout)
print('Creating notification')
result_json = run_mon_cli(['notification-create', notification_name,
'EMAIL', notification_email_addr])
# Parse out id
notification_method_id = response_json['id']
notification_method_id = result_json['id']
return notification_method_id
def find_id_for_name(object_json, name):
for obj in object_json:
this_name = obj['name']
@ -119,71 +151,109 @@ def find_id_for_name(object_json, name):
return obj['id']
return None
def cleanup(notification_name, alarm_name):
# Delete our alarm if it already exists
alarm_json = json.loads(run_mon_cli(["mon", "--json", "alarm-list"]))
alarm_json = run_mon_cli(['alarm-list'])
alarm_id = find_id_for_name(alarm_json, alarm_name)
if alarm_id:
run_mon_cli(["mon", "alarm-delete", alarm_id])
run_mon_cli(['alarm-delete', alarm_id], useJson=False)
# Delete our notification if it already exists
notification_json = json.loads(run_mon_cli(["mon", "--json", "notification-list"]))
notification_json = run_mon_cli(['notification-list'])
notification_id = find_id_for_name(notification_json, notification_name)
if notification_id:
run_mon_cli(["mon", "notification-delete", notification_id])
run_mon_cli(['notification-delete', notification_id], useJson=False)
def wait_for_alarm_state_change(alarm_id, old_state):
    """Poll an alarm once per second until its state differs from ``old_state``.

    Args:
        alarm_id: id of the alarm to watch.
        old_state: the state the alarm is expected to leave.

    Returns:
        The new state as soon as one is observed.

    If the state has not changed after 250 polls, an error is printed to
    stderr and the process exits with status 1.
    """
    max_wait_seconds = 250
    print('Waiting for alarm to change state from %s' % old_state)
    # Count from 1: each poll is preceded by a one second sleep, so the
    # n-th poll happens roughly n seconds after the wait began.
    for elapsed in range(1, max_wait_seconds + 1):
        time.sleep(1)
        state = get_alarm_state(alarm_id)
        if state != old_state:
            print('Alarm state changed to %s in %d seconds' %
                  (state, elapsed))
            return state
    print('State never changed from %s in %d seconds' %
          (old_state, max_wait_seconds), file=sys.stderr)
    sys.exit(1)
def main():
notification_name = "Jahmon Smoke Test"
notification_email_addr = "root@kafka"
alarm_name = "high cpu and load"
metric_name = "cpu_system_perc"
metric_dimensions = {"hostname":"thresh"}
notification_name = 'Jahmon Smoke Test'
notification_email_addr = 'root@kafka'
alarm_name = 'high cpu and load'
metric_name = 'cpu_system_perc'
metric_dimensions = {'hostname': 'thresh'}
cleanup(notification_name, alarm_name)
# Query how many metrics there are for the Alarm
metric_json = get_metrics(metric_name, metric_dimensions)
if len(metric_json) == 0:
print("No measurements received for metric " + metric_name + str(metric_dimensions), file=sys.stderr)
sys.exit(1)
print('No measurements received for metric %s ' %
(metric_name + str(metric_dimensions)), file=sys.stderr)
return 1
start_time = time.time()
initial_num_metrics = len(metric_json[0]['measurements'])
# Create Notification through CLI
notification_method_id = create_notification(notification_name, notification_email_addr)
notification_method_id = create_notification(notification_name,
notification_email_addr)
# Create Alarm through CLI
alarm_id = create_alarm(alarm_name, "max(cpu_system_perc) > 1 and max(load_avg_1_min{hostname=thresh}) > 3", notification_method_id, "System CPU Utilization exceeds 1% and Load exeeds 3 per measurement period")
expression = 'max(cpu_system_perc) > 1 and ' + \
'max(load_avg_1_min{hostname=thresh}) > 1'
description = 'System CPU Utilization exceeds 1% and ' + \
'Load exceeds 3 per measurement period'
alarm_id = create_alarm(alarm_name, expression, notification_method_id,
description)
state = get_alarm_state(alarm_id)
# Ensure it is created in the right state
if state != 'UNDETERMINED':
print("Wrong initial alarm state, expected UNDETERMINED but was " + state)
sys.exit(1)
# Wait for it to
print("Waiting for alarm to change state")
change_time = 0
for x in range(0, 250):
time.sleep(1)
state = get_alarm_state(alarm_id)
if state != 'UNDETERMINED':
print("Alarm state changed in " + str(x) + " seconds")
change_time = x
break
print('Wrong initial alarm state, expected UNDETERMINED but is %s' %
state)
return 1
state = wait_for_alarm_state_change(alarm_id, 'UNDETERMINED')
if state != 'ALARM':
print("Wrong final state, expected ALARM but was " + state, file=sys.stderr)
sys.exit(1)
print("Final state of alarm was " + state)
# If the alarm changes state too fast, then there isn't time for the new metric to arrive.
# Unlikely, but it has been seen
print('Wrong final state, expected ALARM but was %s' % state,
file=sys.stderr)
return 1
state_changes = ['UNDETERMINED', 'ALARM']
new_state = 'OK'
state_changes.append(new_state)
change_alarm_state(alarm_id, new_state)
# There is a bug in the API which allows this to work. Soon that
# will be fixed and this will fail
if len(sys.argv) > 1:
final_state = 'ALARM'
state_changes.append(final_state)
state = wait_for_alarm_state_change(alarm_id, new_state)
if state != final_state:
print('Wrong final state, expected %s but was %s' %
(final_state, state), file=sys.stderr)
return 1
# If the alarm changes state too fast, then there isn't time for the new
# metric to arrive. Unlikely, but it has been seen
change_time = time.time() - start_time
if change_time < 30:
time.sleep(30 - change_time)
change_time = 30
metric_json = get_metrics(metric_name, metric_dimensions)
final_num_metrics = len(metric_json[0]['measurements'])
if final_num_metrics <= initial_num_metrics:
print("No new metrics received", file=sys.stderr)
sys.exit(1)
print("Received " + str(final_num_metrics - initial_num_metrics) + " metrics in " + str(change_time) + " seconds")
if not check_alarm_history(alarm_id):
sys.exit(1)
print('No new metrics received', file=sys.stderr)
return 1
print('Received %d metrics in %d seconds' %
((final_num_metrics - initial_num_metrics), change_time))
if not check_alarm_history(alarm_id, state_changes):
return 1
return 0