#!/usr/bin/env python # """smoke Runs a smoke test of the monitoring installation on mini-mon by ensuring metrics are flowing and creating a new notification, alarm and that the Threshold Engine changes the state of the alarm. This requires the mon CLI and must be run on either the mini-mon VM for the single VM mode or on the kafka VM in the multi VM mode. Get it by following the instructions on https://wiki.hpcloud.net/display/iaas/Monitoring+CLI. TODO: 1. Add check of notification history when that is implemented """ from __future__ import print_function import sys import os import subprocess import time import cli_wrapper from notification import find_notifications # export OS_AUTH_TOKEN=82510970543135 # export OS_NO_CLIENT_AUTH=1 # export MON_API_URL=http://192.168.10.4:8080/v2.0/ def check_alarm_history(alarm_id, states): transitions = len(states) - 1 print('Checking Alarm History') # May take some time for Alarm history to flow all the way through for _ in range(0, 10): result_json = cli_wrapper.run_mon_cli(['alarm-history', alarm_id]) if len(result_json) >= transitions: break time.sleep(4) result = True if not check_expected(transitions, len(result_json), 'number of history entries'): return False result_json.sort(key=lambda x: x['timestamp']) for i in range(0, transitions): old_state = states[i] new_state = states[i+1] alarm_json = result_json[i] if not check_expected(old_state, alarm_json['old_state'], 'old_state'): result = False if not check_expected(new_state, alarm_json['new_state'], 'new_state'): result = False if not check_expected(alarm_id, alarm_json['alarm_id'], 'alarm_id'): result = False if result: print('Alarm History is OK') return result def check_expected(expected, actual, what): if (expected == actual): return True print("Incorrect value for alarm history %s expected '%s' but was '%s'" % (what, str(expected), str(actual)), file=sys.stderr) return False def get_metrics(name, dimensions): print('Getting metrics for %s ' % (name + str(dimensions))) dimensions_arg = '' for key, value in dimensions.iteritems(): if dimensions_arg != '': dimensions_arg = dimensions_arg + ',' dimensions_arg = dimensions_arg + key + '=' + value return cli_wrapper.run_mon_cli(['measurement-list', '--dimensions', dimensions_arg, name, '00']) def cleanup(notification_name, alarm_name): cli_wrapper.delete_alarm_if_exists(alarm_name) cli_wrapper.delete_notification_if_exists(notification_name) def wait_for_alarm_state_change(alarm_id, old_state): # Wait for it to change state print('Waiting for alarm to change state from %s' % old_state) for x in range(0, 250): time.sleep(1) state = cli_wrapper.get_alarm_state(alarm_id) if state != old_state: print('Alarm state changed to %s in %d seconds' % (state, x)) return state print('State never changed from %s in %d seconds' % (old_state, x), file=sys.stderr) return None def check_notifications(alarm_id, state_changes): if not os.path.isfile('/etc/mon/notification.yaml'): print('Notification Engine not installed on this VM,' + ' skipping Notifications test', file=sys.stderr) return True notifications = find_notifications(alarm_id, "root") if len(notifications) != len(state_changes): print('Expected %d notifications but only found %d' % (len(state_changes), len(notifications)), file=sys.stderr) return False index = 0 for expected in state_changes: actual = notifications[index] if actual != expected: print('Expected %s but found %d for state change %d' % (expected, actual, index+1), file=sys.stderr) return False index = index + 1 print('Received email notifications as expected') return True def count_metrics(metric_name, metric_dimensions): # Query how many metrics there are for the Alarm metric_json = get_metrics(metric_name, metric_dimensions) if len(metric_json) == 0: print('No measurements received for metric %s ' % (metric_name + str(metric_dimensions)), file=sys.stderr) return None return len(metric_json[0]['measurements']) def ensure_at_least(desired, actual): if actual < desired: time.sleep(desired - actual) def main(): # Determine if we are running on mutiple VMs or just the one if os.path.isfile('/etc/mon/mon-api-config.yml'): api_host = 'localhost' metric_host = subprocess.check_output(['hostname', '-f']).strip() mail_host = 'localhost' else: api_host = '192.168.10.4' metric_host = 'thresh' mail_host = 'kafka' # These need to be set because we are invoking the CLI as a process os.environ['OS_AUTH_TOKEN'] = '82510970543135' os.environ['OS_NO_CLIENT_AUTH'] = '1' os.environ['MON_API_URL'] = 'http://' + api_host + ':8080/v2.0/' notification_name = 'Jahmon Smoke Test' notification_email_addr = 'root@' + mail_host alarm_name = 'high cpu and load' metric_name = 'load_avg_1_min' metric_dimensions = {'hostname': metric_host} cleanup(notification_name, alarm_name) # Query how many metrics there are for the Alarm initial_num_metrics = count_metrics(metric_name, metric_dimensions) if initial_num_metrics is None: return 1 start_time = time.time() # Create Notification through CLI notification_id = cli_wrapper.create_notification(notification_name, notification_email_addr) # Create Alarm through CLI expression = 'max(cpu_system_perc) > 0 and ' + \ 'max(load_avg_1_min{hostname=' + metric_host + '}) > 0' description = 'System CPU Utilization exceeds 1% and ' + \ 'Load exceeds 3 per measurement period' alarm_id = cli_wrapper.create_alarm(alarm_name, expression, description=description, ok_notif_id=notification_id, alarm_notif_id=notification_id, undetermined_notif_id=notification_id) state = cli_wrapper.get_alarm_state(alarm_id) # Ensure it is created in the right state initial_state = 'UNDETERMINED' states = [] if state != initial_state: print('Wrong initial alarm state, expected %s but is %s' % (initial_state, state)) return 1 states.append(initial_state) state = wait_for_alarm_state_change(alarm_id, initial_state) if state is None: return 1 if state != 'ALARM': print('Wrong final state, expected ALARM but was %s' % state, file=sys.stderr) return 1 states.append(state) new_state = 'OK' states.append(new_state) cli_wrapper.change_alarm_state(alarm_id, new_state) # There is a bug in the API which allows this to work. Soon that # will be fixed and this will fail if len(sys.argv) > 1: final_state = 'ALARM' states.append(final_state) state = wait_for_alarm_state_change(alarm_id, new_state) if state is None: return 1 if state != final_state: print('Wrong final state, expected %s but was %s' % (final_state, state), file=sys.stderr) return 1 # If the alarm changes state too fast, then there isn't time for the new # metric to arrive. Unlikely, but it has been seen ensure_at_least(time.time() - start_time, 35) change_time = time.time() - start_time final_num_metrics = count_metrics(metric_name, metric_dimensions) if final_num_metrics <= initial_num_metrics: print('No new metrics received in %d seconds' % change_time, file=sys.stderr) return 1 print('Received %d metrics in %d seconds' % ((final_num_metrics - initial_num_metrics), change_time)) if not check_alarm_history(alarm_id, states): return 1 # Notifications are only sent out for the changes, so omit the first state if not check_notifications(alarm_id, states[1:]): return 1 return 0 if __name__ == "__main__": sys.exit(main())