
Fixed the notification_cycleTest.py to work on new single node Mini-mon and not re-use random alarms as they weren't always configured properly. Removed dead code from notification_crud.py.
242 lines
8.5 KiB
Python
Executable File
242 lines
8.5 KiB
Python
Executable File
#!/usr/bin/env python
|
|
#
|
|
"""smoke
|
|
Runs a smoke test of the monitoring installation on mini-mon by ensuring
|
|
metrics are flowing, creating a new notification and alarm, and checking that the
|
|
Threshold Engine changes the state of the alarm. This requires the mon
|
|
CLI and must be run on either the mini-mon VM for the single VM mode or
|
|
on the kafka VM in the multi VM mode.
|
|
Get it by following the instructions on
|
|
https://wiki.hpcloud.net/display/iaas/Monitoring+CLI.
|
|
|
|
TODO:
|
|
1. Add check of notification history when that is implemented
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
import sys
|
|
import os
|
|
import subprocess
|
|
import time
|
|
import cli_wrapper
|
|
from notification import find_notifications
|
|
|
|
# export OS_AUTH_TOKEN=82510970543135
|
|
# export OS_NO_CLIENT_AUTH=1
|
|
# export MON_API_URL=http://192.168.10.4:8080/v2.0/
|
|
|
|
|
|
def check_alarm_history(alarm_id, states):
    """Verify the alarm's recorded history matches the expected states.

    `states` is the ordered list of states the alarm went through, so it
    implies ``len(states) - 1`` transitions.  The history is fetched with
    the ``alarm-history`` CLI command, retrying up to 10 times with a
    4 second pause, because history entries may lag behind the state
    changes.  Entries are ordered by their ``timestamp`` and each one is
    compared with the expected old state, new state and alarm id.

    Returns True when everything matches, False otherwise (mismatches are
    reported by check_expected).
    """
    expected_count = len(states) - 1
    print('Checking Alarm History')
    # May take some time for Alarm history to flow all the way through
    for _attempt in range(10):
        history = cli_wrapper.run_mon_cli(['alarm-history', alarm_id])
        if len(history) >= expected_count:
            break
        time.sleep(4)

    # Too many or too few entries makes a pairwise comparison meaningless.
    if not check_expected(expected_count, len(history),
                          'number of history entries'):
        return False

    ordered = sorted(history, key=lambda entry: entry['timestamp'])
    all_match = True
    for before, after, entry in zip(states, states[1:], ordered):
        # Every field is checked even after a failure so that all
        # mismatches get reported, not just the first one.
        for wanted, field in ((before, 'old_state'),
                              (after, 'new_state'),
                              (alarm_id, 'alarm_id')):
            if not check_expected(wanted, entry[field], field):
                all_match = False

    if all_match:
        print('Alarm History is OK')
    return all_match
|
|
|
|
|
|
def check_expected(expected, actual, what):
    """Compare an alarm history value with its expected value.

    Returns True when `expected` equals `actual`.  Otherwise a message
    naming the field (`what`) and both values is written to stderr and
    False is returned.
    """
    if not expected == actual:
        message = ("Incorrect value for alarm history %s expected '%s' but was '%s'" %
                   (what, str(expected), str(actual)))
        print(message, file=sys.stderr)
        return False
    return True
|
|
|
|
|
|
def get_metrics(name, dimensions):
    """Fetch the measurements of a metric through the mon CLI.

    `name` is the metric name and `dimensions` a dict of dimension name to
    value (both strings); it is passed to the CLI's ``--dimensions``
    option as ``key=value`` pairs joined by commas.

    Returns whatever ``cli_wrapper.run_mon_cli`` returns for the
    ``measurement-list`` command.
    """
    print('Getting metrics for %s ' % (name + str(dimensions)))
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    dimensions_arg = ','.join(key + '=' + value
                              for key, value in dimensions.items())
    # NOTE(review): '00' is presumably the start time, i.e. "everything
    # from the beginning" - confirm against the CLI's measurement-list help.
    return cli_wrapper.run_mon_cli(['measurement-list', '--dimensions',
                                    dimensions_arg, name, '00'])
|
|
|
|
|
|
def cleanup(notification_name, alarm_name):
    """Delete the named alarm and notification if they already exist.

    The alarm is removed before the notification.
    NOTE(review): presumably because the alarm references the
    notification - confirm before reordering.
    """
    cli_wrapper.delete_alarm_if_exists(alarm_name)
    cli_wrapper.delete_notification_if_exists(notification_name)
|
|
|
|
|
|
def wait_for_alarm_state_change(alarm_id, old_state):
    """Poll an alarm once a second until it leaves `old_state`.

    Gives up after 250 polls.  Returns the new state, or None (after
    writing a message to stderr) if the state never changed.
    """
    max_wait_seconds = 250
    print('Waiting for alarm to change state from %s' % old_state)
    # Count from 1 so the reported time includes the sleep that precedes
    # each poll; a 0-based index under-reports by one second.
    for waited in range(1, max_wait_seconds + 1):
        time.sleep(1)
        state = cli_wrapper.get_alarm_state(alarm_id)
        if state != old_state:
            print('Alarm state changed to %s in %d seconds' % (state, waited))
            return state
    print('State never changed from %s in %d seconds' %
          (old_state, max_wait_seconds), file=sys.stderr)
    return None
|
|
|
|
|
|
def check_notifications(alarm_id, state_changes):
    """Verify one notification was received per alarm state change.

    `state_changes` is the ordered list of states the alarm changed to.
    The check is skipped (and reported as passing) when
    /etc/mon/notification.yaml is absent, i.e. the Notification Engine is
    not installed on this VM.

    Returns True when the notifications found for `alarm_id` match
    `state_changes` in number and order, False otherwise.
    """
    if not os.path.isfile('/etc/mon/notification.yaml'):
        print('Notification Engine not installed on this VM,' +
              ' skipping Notifications test',
              file=sys.stderr)
        return True

    notifications = find_notifications(alarm_id, "root")
    if len(notifications) != len(state_changes):
        print('Expected %d notifications but only found %d' %
              (len(state_changes), len(notifications)), file=sys.stderr)
        return False

    pairs = zip(state_changes, notifications)
    for position, (expected, actual) in enumerate(pairs, 1):
        if actual != expected:
            # Both values are formatted with %s: `actual` is compared with
            # state names, and %d would raise TypeError on a non-number
            # instead of reporting the mismatch.
            print('Expected %s but found %s for state change %d' %
                  (expected, actual, position), file=sys.stderr)
            return False

    print('Received email notifications as expected')
    return True
|
|
|
|
|
|
def count_metrics(metric_name, metric_dimensions):
    """Return how many measurements exist for the given metric.

    Looks the metric up with get_metrics and counts the ``measurements``
    of the first entry in the result.  When the result is empty a message
    is written to stderr and None is returned.
    """
    found = get_metrics(metric_name, metric_dimensions)
    if len(found) > 0:
        return len(found[0]['measurements'])

    print('No measurements received for metric %s ' %
          (metric_name + str(metric_dimensions)), file=sys.stderr)
    return None
|
|
|
|
|
|
def ensure_at_least(desired, actual):
    """Sleep for the shortfall when `actual` seconds is below `desired`.

    Does nothing when `actual` already meets or exceeds `desired`.
    """
    if desired > actual:
        time.sleep(desired - actual)
|
|
|
|
|
|
def main():
    """Run the smoke test end to end.

    Steps: pick hosts for single or multi VM mode, point the mon CLI at
    the API through environment variables, remove leftovers of earlier
    runs, create a notification and an alarm, then check that the alarm
    goes UNDETERMINED -> ALARM, that new measurements arrived, and that
    alarm history and notifications match the observed states.

    Returns 0 on success and 1 on the first failed check, suitable for
    passing to sys.exit().
    """
    # Determine if we are running on multiple VMs or just the one
    if os.path.isfile('/etc/mon/mon-api-config.yml'):
        api_host = 'localhost'
        # universal_newlines=True yields text on both Python 2 and 3, so
        # the host name can be concatenated into the alarm expression.
        metric_host = subprocess.check_output(
            ['hostname', '-f'], universal_newlines=True).strip()
        mail_host = 'localhost'
    else:
        api_host = '192.168.10.4'
        metric_host = 'thresh'
        mail_host = 'kafka'

    # These need to be set because we are invoking the CLI as a process
    os.environ['OS_AUTH_TOKEN'] = '82510970543135'
    os.environ['OS_NO_CLIENT_AUTH'] = '1'
    os.environ['MON_API_URL'] = 'http://' + api_host + ':8080/v2.0/'

    notification_name = 'Jahmon Smoke Test'
    notification_email_addr = 'root@' + mail_host
    alarm_name = 'high cpu and load'
    metric_name = 'load_avg_1_min'
    metric_dimensions = {'hostname': metric_host}
    cleanup(notification_name, alarm_name)

    # Query how many metrics there are for the Alarm
    initial_num_metrics = count_metrics(metric_name, metric_dimensions)
    if initial_num_metrics is None:
        return 1

    start_time = time.time()

    # Create Notification through CLI
    notification_id = cli_wrapper.create_notification(notification_name,
                                                      notification_email_addr)
    # Create Alarm through CLI
    # NOTE(review): both thresholds are 0, presumably so the alarm trips
    # quickly; the description text below quotes different numbers.
    expression = 'max(cpu_system_perc) > 0 and ' + \
                 'max(load_avg_1_min{hostname=' + metric_host + '}) > 0'
    description = 'System CPU Utilization exceeds 1% and ' + \
                  'Load exceeds 3 per measurement period'
    alarm_id = cli_wrapper.create_alarm(alarm_name, expression,
                                        description=description,
                                        ok_notif_id=notification_id,
                                        alarm_notif_id=notification_id,
                                        undetermined_notif_id=notification_id)
    state = cli_wrapper.get_alarm_state(alarm_id)
    # Ensure it is created in the right state
    initial_state = 'UNDETERMINED'
    if state != initial_state:
        # Reported on stderr like every other failure in this script.
        print('Wrong initial alarm state, expected %s but is %s' %
              (initial_state, state), file=sys.stderr)
        return 1
    states = [initial_state]

    state = wait_for_alarm_state_change(alarm_id, initial_state)
    if state is None:
        return 1

    if state != 'ALARM':
        print('Wrong final state, expected ALARM but was %s' % state,
              file=sys.stderr)
        return 1
    states.append(state)

    new_state = 'OK'
    states.append(new_state)
    cli_wrapper.change_alarm_state(alarm_id, new_state)
    # There is a bug in the API which allows this to work. Soon that
    # will be fixed and this will fail
    if len(sys.argv) > 1:
        # final_state only exists on this path, so the wait and the
        # verification that use it belong inside this branch.
        final_state = 'ALARM'
        states.append(final_state)

        state = wait_for_alarm_state_change(alarm_id, new_state)
        if state is None:
            return 1

        if state != final_state:
            print('Wrong final state, expected %s but was %s' %
                  (final_state, state), file=sys.stderr)
            return 1

    # If the alarm changes state too fast, then there isn't time for the new
    # metric to arrive. Unlikely, but it has been seen
    # Argument order is (desired, actual): wait until 35 seconds have passed.
    ensure_at_least(35, time.time() - start_time)
    change_time = time.time() - start_time

    final_num_metrics = count_metrics(metric_name, metric_dimensions)
    if final_num_metrics is None:
        # count_metrics has already reported the problem on stderr.
        return 1
    if final_num_metrics <= initial_num_metrics:
        print('No new metrics received in %d seconds' % change_time,
              file=sys.stderr)
        return 1
    print('Received %d metrics in %d seconds' %
          ((final_num_metrics - initial_num_metrics), change_time))

    if not check_alarm_history(alarm_id, states):
        return 1

    # Notifications are only sent out for the changes, so omit the first state
    if not check_notifications(alarm_id, states[1:]):
        return 1

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value (0 = pass, 1 = fail) as the exit status.
    exit_status = main()
    sys.exit(exit_status)
|