Merge branch 'master' of github.com:hpcloud-mon/mon-vagrant

This commit is contained in:
Tim Kuhlman 2014-05-13 15:19:47 -06:00
commit 25e15e67fc
3 changed files with 377 additions and 91 deletions

98
tests/measurement_test.py Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
#
"""measurements
"""
from __future__ import print_function
import sys
import time
import pytz
from datetime import datetime
from monclient import client
import monclient.exc as exc
mon_client = None
def call_mon_api(method, fields):
    """Invoke a monclient API method and abort the script on an HTTP error.

    Args:
        method: monclient API callable to invoke (for example
            ``mon_client.metrics.create``).
        fields: Dict of keyword arguments expanded into the call.

    Returns:
        Whatever ``method`` returns.

    Raises:
        SystemExit: with status 1 when ``method`` raises
            ``exc.HTTPException``.
    """
    try:
        return method(**fields)
    except exc.HTTPException as he:
        # Diagnostics belong on stderr, like the other failure messages of
        # this script, so they do not get mixed into the progress output.
        print(he.code, file=sys.stderr)
        print(he.message, file=sys.stderr)
        sys.exit(1)
def create_timestamp(seconds):
    """Format an epoch time as a UTC timestamp string with a numeric offset.

    Args:
        seconds: Seconds since the Unix epoch.

    Returns:
        A string shaped like ``2014-05-13T21:19:47+0000``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S%z"
    naive_utc = datetime.utcfromtimestamp(seconds)
    # Attach the UTC zone so that %z renders an explicit offset.
    aware_utc = pytz.utc.localize(naive_utc)
    return aware_utc.strftime(timestamp_format)
def main():
    """Send ``count`` measurements for a metric and verify they are stored.

    Command line: ``metric_name count``. Measurements with the values
    0..count-1 are posted to the API, then the measurement list is polled
    (up to 30 times, one second apart) until all of them are returned, and
    finally the returned values are compared with what was sent.

    Returns:
        0 when every measurement arrived with the expected value; 1 on a
        usage error, on missing measurements or on a value mismatch.
    """
    # Both positional arguments are required and count must be an integer.
    if len(sys.argv) < 3:
        print('usage: %s metric_name count' % sys.argv[0], file=sys.stderr)
        return 1
    metric_name = sys.argv[1]
    try:
        num_metrics_to_send = int(sys.argv[2])
    except ValueError:
        print('usage: %s metric_name count' % sys.argv[0], file=sys.stderr)
        return 1

    api_version = '2_0'
    endpoint = 'http://192.168.10.4:8080/v2.0'
    kwargs = {
        'token': '82510970543135'
    }
    mon_client = client.Client(api_version, endpoint, **kwargs)

    metric_start_time = time.time()
    dimensions = {'Test_Send': 'Number_1'}  # Should be arg
    start_time = time.time()
    fields = {'name': metric_name, 'dimensions': dimensions}
    for val in range(0, num_metrics_to_send):
        fields['value'] = str(val)
        fields['timestamp'] = time.time()
        call_mon_api(mon_client.metrics.create, fields)

    print("Took %d seconds to send %d measurements" %
          ((time.time() - start_time), num_metrics_to_send))
    metric_end_time = time.time()
    # API requires end time to be greater than start time
    if (metric_end_time - metric_start_time) < 1:
        metric_end_time = metric_start_time + 1

    fields = {
        'name': metric_name,
        'dimensions': dimensions,
        'start_time': create_timestamp(metric_start_time),
        'end_time': create_timestamp(metric_end_time),
    }

    # Poll until everything that was sent is visible through the API.
    result = []
    measurements = []
    for i in range(0, 30):
        result = call_mon_api(mon_client.metrics.list_measurements, fields)
        if len(result) > 0:
            measurements = result[0]['measurements']
            if len(measurements) >= num_metrics_to_send:
                break
            print('Found %d of %d metrics so far' %
                  (len(measurements), num_metrics_to_send))
        time.sleep(1)

    if len(result) == 0:
        print('Did not receive any metrics in %d seconds' % i,
              file=sys.stderr)
        return 1

    if len(measurements) != num_metrics_to_send:
        print('Expected %d measurements but found %d' %
              (num_metrics_to_send, len(measurements)), file=sys.stderr)
        return 1
    print('Took %d seconds for metrics to fully arrive' % i)

    # The check expects the list in reverse send order: the first entry
    # carries the value count-1 and the last entry carries 0.
    result = 0
    for index, value in enumerate(measurements):
        expected = num_metrics_to_send - 1 - index
        if value[2] != expected:
            print('Expected %d but found %d for %d' %
                  (expected, value[2], index), file=sys.stderr)
            # A wrong value must fail the test, not just be reported.
            result = 1
    return result
if __name__ == "__main__":
sys.exit(main())

View File

@ -0,0 +1,118 @@
#!/usr/bin/env python
#
"""Notification Engine Test
Cycle the state of an Alarm the given number of times
"""
from __future__ import print_function
import sys
import time
import json
import subprocess
from monclient import client
import monclient.exc as exc
mon_client = None
def call_mon_api(method, fields):
    """Call a monclient API method, exiting the script if it reports an error.

    Args:
        method: monclient API callable to invoke (for example
            ``mon_client.alarms.patch``).
        fields: Dict whose items are passed to ``method`` as keyword
            arguments.

    Returns:
        The response returned by ``method``.

    Raises:
        SystemExit: with status 1 when ``method`` raises
            ``exc.HTTPException``.
    """
    try:
        response = method(**fields)
    except exc.HTTPException as http_error:
        # Report the failure on stderr, where the other error messages of
        # this script go, instead of interleaving it with progress output.
        print(http_error.code, file=sys.stderr)
        print(http_error.message, file=sys.stderr)
        sys.exit(1)
    return response
def find_alarm_id():
    """Return the id of the first alarm the API lists.

    Returns:
        The ``id`` of the first listed alarm, or None (after printing a hint
        to stderr) when no alarm exists.
    """
    alarms = call_mon_api(mon_client.alarms.list, {})
    if len(alarms) > 0:
        return alarms[0]['id']
    print('No existing alarms, create one and rerun test', file=sys.stderr)
    return None
def get_alarm_state(alarm_id):
    """Fetch the alarm with the given id and return its ``state`` field."""
    query = {'alarm_id': alarm_id}
    alarm = call_mon_api(mon_client.alarms.get, query)
    return alarm['state']
def find_notifications(alarm_id):
    """Return the states of all notifications mailed to root for an alarm.

    Greps /var/mail/root (through sudo) for lines containing ``alarm_id``
    and parses every matching line as a JSON document.

    Args:
        alarm_id: Alarm id string to search for.

    Returns:
        List with the ``state`` value of each matching line, in file order;
        an empty list when no line matches.

    Raises:
        SystemExit: with status 1 when the grep command fails for a reason
            other than "no match".
    """
    args = ['sudo', 'grep', alarm_id, '/var/mail/root']
    try:
        stdout = subprocess.check_output(args)
    except subprocess.CalledProcessError as e:
        # grep exits with status 1 when nothing matched. That is the normal
        # situation for an alarm that has not sent a notification yet, so it
        # is an empty result rather than a failure.
        # NOTE(review): sudo can also exit with 1 for its own failures; in
        # that case the caller simply never sees any notification.
        if e.returncode == 1:
            return []
        print(e, file=sys.stderr)
        sys.exit(1)
    return [json.loads(line)['state'] for line in stdout.splitlines()]
def main():
    """Toggle an alarm between OK and ALARM and wait for the notifications.

    Command line: ``count [alarm-id]``. Each cycle is two state changes, so
    ``count * 2`` patches are sent. When no alarm id is given the first alarm
    returned by the API is used. After patching, the root mailbox is polled
    (up to 30 times, one second apart) until the expected number of new
    notifications has shown up.

    Returns:
        0 on success; 1 on a usage error, when no alarm is available, when a
        patch does not take effect or when notifications are missing.
    """
    global mon_client

    if len(sys.argv) == 1:
        print('usage: %s count [alarm-id]' % sys.argv[0], file=sys.stderr)
        return 1

    mon_client = client.Client('2_0', 'http://192.168.10.4:8080/v2.0',
                               token='82510970543135')

    num_cycles = int(sys.argv[1])

    alarm_id = sys.argv[2] if len(sys.argv) > 2 else find_alarm_id()
    if alarm_id is None:
        return 1

    start_time = time.time()
    state = get_alarm_state(alarm_id)

    patch_fields = {'alarm_id': alarm_id}
    # Notifications already in the mailbox are not counted as new ones.
    baseline = len(find_notifications(alarm_id))
    notifications_sent = num_cycles * 2
    for _ in range(notifications_sent):
        state = 'ALARM' if state == 'OK' else 'OK'
        patch_fields['state'] = state
        call_mon_api(mon_client.alarms.patch, patch_fields)
        reported = get_alarm_state(alarm_id)
        if reported != state:
            print('Expected new state %s but found %s' %
                  (state, reported), file=sys.stderr)
            return 1

    elapsed = time.time() - start_time
    print("Took %d seconds to send %d alarm state changes" %
          (elapsed, notifications_sent))

    for i in range(30):
        notifications_found = len(find_notifications(alarm_id)) - baseline
        if notifications_found >= notifications_sent:
            break
        print('Found %d of %d expected notifications so far' %
              (notifications_found, notifications_sent))
        time.sleep(1)

    if notifications_found < notifications_sent:
        print('Expected %d notifications but found %d' %
              (notifications_sent, notifications_found), file=sys.stderr)
        return 1

    print('Took %d seconds for notifications to fully arrive' % i)
    return 0
if __name__ == "__main__":
sys.exit(main())

View File

@ -1,16 +1,20 @@
#!/usr/bin/env python #!/usr/bin/env python
# #
"""smoke """smoke
Runs a smoke test of the jahmon installation on mini-mon by ensuring metrics are flowing and creating a new Runs a smoke test of the jahmon installation on mini-mon by ensuring
notification, alarm and that the Threshold Engine changes the state of the alarm. metrics are flowing and creating a new notification, alarm and that the
This requires the mon CLI and must be run on one of the mini-mon VMs. Tested running on kafka VM. Threshold Engine changes the state of the alarm. This requires the mon
Get it by following the instructions on https://wiki.hpcloud.net/display/iaas/Monitoring+CLI. CLI and must be run on one of the mini-mon VMs. Tested running on kafka VM.
If you want to see the notification, you must install postfix on the kafka VM, configure it to be local, and Get it by following the instructions on
modify /etc/mon/notification.yaml to use localhost for the email server, then restart https://wiki.hpcloud.net/display/iaas/Monitoring+CLI.
If you want to see the notification, you must install postfix on the kafka
VM, configure it to be local, and modify /etc/mon/notification.yaml to use
localhost for the email server, then restart
TODO: TODO:
1. Add check of notification history when that is implemented 1. Add check of notification history when that is implemented
2. Add check of mail getting to root when postfix is added mini-mon. This script will have to run on the kafka VM 2. Add check of mail getting to root when postfix is added mini-mon.
This script will have to run on the kafka VM
""" """
from __future__ import print_function from __future__ import print_function
@ -25,92 +29,120 @@ import time
# export OS_NO_CLIENT_AUTH=1 # export OS_NO_CLIENT_AUTH=1
# export MON_API_URL=http://192.168.10.4:8080/v2.0/ # export MON_API_URL=http://192.168.10.4:8080/v2.0/
os.environ["OS_AUTH_TOKEN"] = "82510970543135" os.environ['OS_AUTH_TOKEN'] = '82510970543135'
os.environ["OS_NO_CLIENT_AUTH"] = "1" os.environ['OS_NO_CLIENT_AUTH'] = '1'
os.environ["MON_API_URL"] = "http://192.168.10.4:8080/v2.0/" os.environ['MON_API_URL'] = 'http://192.168.10.4:8080/v2.0/'
def change_alarm_state(alarm_id, new_state):
    """Patch an alarm to a new state through the mon CLI and check the reply.

    Args:
        alarm_id: Id of the alarm to patch.
        new_state: State to set on the alarm.

    Returns:
        1 when the state reported back by the CLI differs from
        ``new_state``; None otherwise.
    """
    print('Changing Alarm state to %s' % new_state)
    result_json = run_mon_cli(['alarm-patch', '--state', new_state, alarm_id])
    actual_state = result_json['state']
    if actual_state != new_state:
        # The requested state is the expectation; the CLI reply is the actual.
        print('Alarm patch failed, expected state of %s but was %s' %
              (new_state, actual_state), file=sys.stderr)
        return 1
def get_alarm_state(alarm_id): def get_alarm_state(alarm_id):
stdout = run_mon_cli(["mon", "--json", "alarm-show", alarm_id]) result_json = run_mon_cli(['alarm-show', alarm_id])
response_json = json.loads(stdout) return result_json['state']
return response_json['state']
def check_alarm_history(alarm_id):
def check_alarm_history(alarm_id, states):
transitions = len(states) - 1
print('Checking Alarm History') print('Checking Alarm History')
# Make take a little bit of time for Alarm history to flow all the way through # May take some time for Alarm history to flow all the way through
for x in range(0, 10): for _ in range(0, 10):
stdout = run_mon_cli(["mon", "--json", "alarm-history", alarm_id]) result_json = run_mon_cli(['alarm-history', alarm_id])
response_json = json.loads(stdout) if len(result_json) >= transitions:
if len(response_json) > 0:
break break
time.sleep(4) time.sleep(4)
result = True result = True
if not check_expected(1, len(response_json), 'number of history entries'): if not check_expected(transitions, len(result_json),
'number of history entries'):
return False return False
alarm_json = response_json[0] result_json.sort(key=lambda x: x['timestamp'])
if not check_expected('UNDETERMINED', alarm_json['old_state'], 'old_state'): for i in range(0, transitions):
result = False old_state = states[i]
if not check_expected('ALARM', alarm_json['new_state'], 'new_state'): new_state = states[i+1]
result = False alarm_json = result_json[i]
if not check_expected(alarm_id, alarm_json['alarm_id'], 'alarm_id'): if not check_expected(old_state, alarm_json['old_state'], 'old_state'):
result = False result = False
if not check_expected(new_state, alarm_json['new_state'], 'new_state'):
result = False
if not check_expected(alarm_id, alarm_json['alarm_id'], 'alarm_id'):
result = False
if result: if result:
print("Alarm History is OK") print('Alarm History is OK')
return result return result
def check_expected(expected, actual, what): def check_expected(expected, actual, what):
if (expected == actual): if (expected == actual):
return True return True
print("Incorrect value for alarm history " + what + " expected '" + str(expected) + "' but was '" + str(actual) + "'", file=sys.stderr) print("Incorrect value for alarm history %s expected '%s' but was '%s'" %
(what, str(expected), str(actual)), file=sys.stderr)
return False return False
def create_alarm(name, expression, notification_method_id, description=None): def create_alarm(name, expression, notification_method_id, description=None):
args = ["mon", "alarm-create"] args = ['alarm-create']
if (description): if (description):
args.append("--description") args.append('--description')
args.append(description) args.append(description)
args.append("--alarm-actions") args.append('--alarm-actions')
args.append(notification_method_id) args.append(notification_method_id)
args.append("--ok-actions") args.append('--ok-actions')
args.append(notification_method_id) args.append(notification_method_id)
args.append("--undetermined-actions") args.append('--undetermined-actions')
args.append(notification_method_id) args.append(notification_method_id)
args.append(name) args.append(name)
args.append(expression) args.append(expression)
print("Creating alarm") print('Creating alarm')
stdout = run_mon_cli(args) result_json = run_mon_cli(args)
response_json = json.loads(stdout)
# Parse out id # Parse out id
alarm_id = response_json['id'] alarm_id = result_json['id']
return alarm_id return alarm_id
def get_metrics(name, dimensions): def get_metrics(name, dimensions):
print("Getting metrics for " + name + str(dimensions)) print('Getting metrics for %s ' % (name + str(dimensions)))
dimensions_arg = "" dimensions_arg = ''
for key, value in dimensions.iteritems(): for key, value in dimensions.iteritems():
if dimensions_arg != "": if dimensions_arg != '':
dimensions_arg = dimensions_arg + "," dimensions_arg = dimensions_arg + ','
dimensions_arg = dimensions_arg + key + "=" + value dimensions_arg = dimensions_arg + key + '=' + value
stdout = run_mon_cli(["mon", "--json", "measurement-list", "--dimensions", dimensions_arg, name, "00"]) return run_mon_cli(['measurement-list', '--dimensions',
return json.loads(stdout) dimensions_arg, name, '00'])
def run_mon_cli(args):
def run_mon_cli(args, useJson=True):
if useJson:
args.insert(0, '--json')
args.insert(0, 'mon')
try: try:
stdout = subprocess.check_output(args) stdout = subprocess.check_output(args)
return stdout if useJson:
return json.loads(stdout)
else:
return stdout
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(e, file=sys.stderr) print(e, file=sys.stderr)
sys.exit(1) sys.exit(1)
def create_notification(notification_name, notification_email_addr): def create_notification(notification_name, notification_email_addr):
print("Creating notification") print('Creating notification')
stdout = run_mon_cli(["mon", "notification-create", notification_name, "EMAIL", notification_email_addr]) result_json = run_mon_cli(['notification-create', notification_name,
response_json = json.loads(stdout) 'EMAIL', notification_email_addr])
# Parse out id # Parse out id
notification_method_id = response_json['id'] notification_method_id = result_json['id']
return notification_method_id return notification_method_id
def find_id_for_name(object_json, name): def find_id_for_name(object_json, name):
for obj in object_json: for obj in object_json:
@ -119,74 +151,112 @@ def find_id_for_name(object_json, name):
return obj['id'] return obj['id']
return None return None
def cleanup(notification_name, alarm_name): def cleanup(notification_name, alarm_name):
# Delete our alarm if it already exists # Delete our alarm if it already exists
alarm_json = json.loads(run_mon_cli(["mon", "--json", "alarm-list"])) alarm_json = run_mon_cli(['alarm-list'])
alarm_id = find_id_for_name(alarm_json, alarm_name) alarm_id = find_id_for_name(alarm_json, alarm_name)
if alarm_id: if alarm_id:
run_mon_cli(["mon", "alarm-delete", alarm_id]) run_mon_cli(['alarm-delete', alarm_id], useJson=False)
# Delete our notification if it already exists # Delete our notification if it already exists
notification_json = json.loads(run_mon_cli(["mon", "--json", "notification-list"])) notification_json = run_mon_cli(['notification-list'])
notification_id = find_id_for_name(notification_json, notification_name) notification_id = find_id_for_name(notification_json, notification_name)
if notification_id: if notification_id:
run_mon_cli(["mon", "notification-delete", notification_id]) run_mon_cli(['notification-delete', notification_id], useJson=False)
def wait_for_alarm_state_change(alarm_id, old_state):
    """Poll an alarm once per second until its state differs from old_state.

    Args:
        alarm_id: Id of the alarm to watch.
        old_state: State the alarm is expected to leave.

    Returns:
        The first state observed that is different from ``old_state``.

    Raises:
        SystemExit: with status 1 when the state is still ``old_state``
            after 250 polls.
    """
    print('Waiting for alarm to change state from %s' % old_state)
    max_polls = 250
    polls_done = 0
    while polls_done < max_polls:
        time.sleep(1)
        current_state = get_alarm_state(alarm_id)
        if current_state != old_state:
            print('Alarm state changed to %s in %d seconds' %
                  (current_state, polls_done))
            return current_state
        polls_done += 1

    # Report the index of the last poll, as the progress message above does.
    print('State never changed from %s in %d seconds' %
          (old_state, max_polls - 1), file=sys.stderr)
    sys.exit(1)
def main(): def main():
notification_name = "Jahmon Smoke Test" notification_name = 'Jahmon Smoke Test'
notification_email_addr = "root@kafka" notification_email_addr = 'root@kafka'
alarm_name = "high cpu and load" alarm_name = 'high cpu and load'
metric_name = "cpu_system_perc" metric_name = 'cpu_system_perc'
metric_dimensions = {"hostname":"thresh"} metric_dimensions = {'hostname': 'thresh'}
cleanup(notification_name, alarm_name) cleanup(notification_name, alarm_name)
# Query how many metrics there are for the Alarm # Query how many metrics there are for the Alarm
metric_json = get_metrics(metric_name, metric_dimensions) metric_json = get_metrics(metric_name, metric_dimensions)
if len(metric_json) == 0: if len(metric_json) == 0:
print("No measurements received for metric " + metric_name + str(metric_dimensions), file=sys.stderr) print('No measurements received for metric %s ' %
sys.exit(1) (metric_name + str(metric_dimensions)), file=sys.stderr)
return 1
start_time = time.time()
initial_num_metrics = len(metric_json[0]['measurements']) initial_num_metrics = len(metric_json[0]['measurements'])
# Create Notification through CLI # Create Notification through CLI
notification_method_id = create_notification(notification_name, notification_email_addr) notification_method_id = create_notification(notification_name,
notification_email_addr)
# Create Alarm through CLI # Create Alarm through CLI
alarm_id = create_alarm(alarm_name, "max(cpu_system_perc) > 1 and max(load_avg_1_min{hostname=thresh}) > 3", notification_method_id, "System CPU Utilization exceeds 1% and Load exeeds 3 per measurement period") expression = 'max(cpu_system_perc) > 1 and ' + \
'max(load_avg_1_min{hostname=thresh}) > 1'
description = 'System CPU Utilization exceeds 1% and ' + \
'Load exceeds 3 per measurement period'
alarm_id = create_alarm(alarm_name, expression, notification_method_id,
description)
state = get_alarm_state(alarm_id) state = get_alarm_state(alarm_id)
# Ensure it is created in the right state # Ensure it is created in the right state
if state != 'UNDETERMINED': if state != 'UNDETERMINED':
print("Wrong initial alarm state, expected UNDETERMINED but was " + state) print('Wrong initial alarm state, expected UNDETERMINED but is %s' %
sys.exit(1) state)
# Wait for it to return 1
print("Waiting for alarm to change state")
change_time = 0 state = wait_for_alarm_state_change(alarm_id, 'UNDETERMINED')
for x in range(0, 250):
time.sleep(1)
state = get_alarm_state(alarm_id)
if state != 'UNDETERMINED':
print("Alarm state changed in " + str(x) + " seconds")
change_time = x
break
if state != 'ALARM': if state != 'ALARM':
print("Wrong final state, expected ALARM but was " + state, file=sys.stderr) print('Wrong final state, expected ALARM but was %s' % state,
sys.exit(1) file=sys.stderr)
print("Final state of alarm was " + state) return 1
# If the alarm changes state too fast, then there isn't time for the new metric to arrive.
# Unlikely, but it has been seen state_changes = ['UNDETERMINED', 'ALARM']
new_state = 'OK'
state_changes.append(new_state)
change_alarm_state(alarm_id, new_state)
# There is a bug in the API which allows this to work. Soon that
# will be fixed and this will fail
if len(sys.argv) > 1:
final_state = 'ALARM'
state_changes.append(final_state)
state = wait_for_alarm_state_change(alarm_id, new_state)
if state != final_state:
print('Wrong final state, expected %s but was %s' %
(final_state, state), file=sys.stderr)
return 1
# If the alarm changes state too fast, then there isn't time for the new
# metric to arrive. Unlikely, but it has been seen
change_time = time.time() - start_time
if change_time < 30: if change_time < 30:
time.sleep(30 - change_time) time.sleep(30 - change_time)
change_time = 30 change_time = 30
metric_json = get_metrics(metric_name, metric_dimensions) metric_json = get_metrics(metric_name, metric_dimensions)
final_num_metrics = len(metric_json[0]['measurements']) final_num_metrics = len(metric_json[0]['measurements'])
if final_num_metrics <= initial_num_metrics: if final_num_metrics <= initial_num_metrics:
print("No new metrics received", file=sys.stderr) print('No new metrics received', file=sys.stderr)
sys.exit(1) return 1
print("Received " + str(final_num_metrics - initial_num_metrics) + " metrics in " + str(change_time) + " seconds") print('Received %d metrics in %d seconds' %
if not check_alarm_history(alarm_id): ((final_num_metrics - initial_num_metrics), change_time))
sys.exit(1) if not check_alarm_history(alarm_id, state_changes):
return 1
return 0 return 0
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(main()) sys.exit(main())