diff --git a/etc/pip-requires.txt b/etc/pip-requires.txt index 2676d61..c88953c 100644 --- a/etc/pip-requires.txt +++ b/etc/pip-requires.txt @@ -3,3 +3,4 @@ MySQL-python>=1.2.3 eventlet>=0.9.17 kombu>=2.4.7 librabbitmq>=1.0.0 +argparse diff --git a/migrations/004_usage.sql b/migrations/004_usage.sql new file mode 100644 index 0000000..118b365 --- /dev/null +++ b/migrations/004_usage.sql @@ -0,0 +1,86 @@ +BEGIN; +CREATE TABLE `stacktach_deployment` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `name` varchar(50) NOT NULL +) +; +CREATE TABLE `stacktach_rawdata` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `deployment_id` integer NOT NULL, + `tenant` varchar(50), + `json` longtext NOT NULL, + `routing_key` varchar(50), + `state` varchar(20), + `old_state` varchar(20), + `old_task` varchar(30), + `task` varchar(30), + `image_type` integer, + `when` numeric(20, 6) NOT NULL, + `publisher` varchar(100), + `event` varchar(50), + `service` varchar(50), + `host` varchar(100), + `instance` varchar(50), + `request_id` varchar(50) +) +; +ALTER TABLE `stacktach_rawdata` ADD CONSTRAINT `deployment_id_refs_id_362370d` FOREIGN KEY (`deployment_id`) REFERENCES `stacktach_deployment` (`id`); +CREATE TABLE `stacktach_lifecycle` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `instance` varchar(50), + `last_state` varchar(50), + `last_task_state` varchar(50), + `last_raw_id` integer +) +; +ALTER TABLE `stacktach_lifecycle` ADD CONSTRAINT `last_raw_id_refs_id_2a04e82d` FOREIGN KEY (`last_raw_id`) REFERENCES `stacktach_rawdata` (`id`); +CREATE TABLE `stacktach_instanceusage` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `instance` varchar(50), + `launched_at` numeric(20, 6), + `deleted_at` numeric(20, 6), + `request_id` varchar(50), + `instance_type_id` varchar(50) +) +; +CREATE TABLE `stacktach_instanceexists` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `instance` varchar(50), + `launched_at` numeric(20, 6), + `deleted_at` numeric(20, 
6), + `message_id` varchar(50), + `instance_type_id` varchar(50), + `status` varchar(50) NOT NULL, + `raw_id` integer, + `usage_id` integer +) +; +ALTER TABLE `stacktach_instanceexists` ADD CONSTRAINT `usage_id_refs_id_3b13299b` FOREIGN KEY (`usage_id`) REFERENCES `stacktach_instanceusage` (`id`); +ALTER TABLE `stacktach_instanceexists` ADD CONSTRAINT `raw_id_refs_id_65c72953` FOREIGN KEY (`raw_id`) REFERENCES `stacktach_rawdata` (`id`); +CREATE TABLE `stacktach_timing` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `name` varchar(50) NOT NULL, + `lifecycle_id` integer NOT NULL, + `start_raw_id` integer, + `end_raw_id` integer, + `start_when` numeric(20, 6), + `end_when` numeric(20, 6), + `diff` numeric(20, 6) +) +; +ALTER TABLE `stacktach_timing` ADD CONSTRAINT `start_raw_id_refs_id_3cd201fc` FOREIGN KEY (`start_raw_id`) REFERENCES `stacktach_rawdata` (`id`); +ALTER TABLE `stacktach_timing` ADD CONSTRAINT `end_raw_id_refs_id_3cd201fc` FOREIGN KEY (`end_raw_id`) REFERENCES `stacktach_rawdata` (`id`); +ALTER TABLE `stacktach_timing` ADD CONSTRAINT `lifecycle_id_refs_id_4255ead8` FOREIGN KEY (`lifecycle_id`) REFERENCES `stacktach_lifecycle` (`id`); +CREATE TABLE `stacktach_requesttracker` ( + `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, + `request_id` varchar(50) NOT NULL, + `lifecycle_id` integer NOT NULL, + `last_timing_id` integer, + `start` numeric(20, 6) NOT NULL, + `duration` numeric(20, 6) NOT NULL, + `completed` bool NOT NULL +) +; +ALTER TABLE `stacktach_requesttracker` ADD CONSTRAINT `last_timing_id_refs_id_f7d8336` FOREIGN KEY (`last_timing_id`) REFERENCES `stacktach_timing` (`id`); +ALTER TABLE `stacktach_requesttracker` ADD CONSTRAINT `lifecycle_id_refs_id_e457729` FOREIGN KEY (`lifecycle_id`) REFERENCES `stacktach_lifecycle` (`id`); +COMMIT; diff --git a/migrations/005_reports.txt b/migrations/005_reports.txt new file mode 100644 index 0000000..e10220f --- /dev/null +++ b/migrations/005_reports.txt @@ -0,0 +1,3 @@ +do +python manage.py 
syncdb +to add the JsonReport table diff --git a/reports/pretty.py b/reports/pretty.py index d177867..f3227e5 100644 --- a/reports/pretty.py +++ b/reports/pretty.py @@ -1,3 +1,4 @@ +import argparse import datetime import json import sys @@ -12,155 +13,293 @@ from stacktach import image_type from stacktach import models -if __name__ != '__main__': - sys.exit(1) +def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, + store=False, region=None): + if not yesterday: + yesterday = datetime.datetime.utcnow().date() - \ + datetime.timedelta(days=1) -yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1) -if len(sys.argv) == 2: - try: - t = time.strptime(sys.argv[1], "%Y-%m-%d") - yesterday = datetime.datetime(*t[:6]) - except Exception, e: - print e - print "Usage: python requests.py YYYY-MM-DD (the end date)" + rstart = datetime.datetime(year=yesterday.year, month=yesterday.month, + day=yesterday.day, hour=start_hour) + rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59) + + dstart = dt.dt_to_decimal(rstart) + dend = dt.dt_to_decimal(rend) + + codes = {} + + cells = [] + regions = [] + if region: + region = region.upper() + deployments = models.Deployment.objects.all() + for deployment in deployments: + name = deployment.name.upper() + if not region or region in name: + regions.append(deployment.id) + cells.append(deployment.name) + + if not len(regions): + print "No regions found for '%s'" % region sys.exit(1) -percentile = 90 -hours = 24 + # Get all the instances that have changed in the last N hours ... 
+ updates = models.RawData.objects.filter(event='compute.instance.update', + when__gt=dstart, when__lte=dend, + deployment__in=regions)\ + .values('instance').distinct() -start = datetime.datetime(year=yesterday.year, month=yesterday.month, - day=yesterday.day) -end = start + datetime.timedelta(hours=hours-1, minutes=59, seconds=59) + expiry = 60 * 60 # 1 hour + cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot'] -print "Generating report for %s to %s" % (start, end) + failures = {} # { key : {failure_type: count} } + durations = {} + attempts = {} -dstart = dt.dt_to_decimal(start) -dend = dt.dt_to_decimal(end) + for uuid_dict in updates: + uuid = uuid_dict['instance'] -codes = {} - -# Get all the instances that have changed in the last N hours ... -updates = models.RawData.objects.filter(event='compute.instance.update', - when__gt=dstart, when__lte=dend)\ - .values('instance').distinct() - -expiry = 60 * 60 # 1 hour -cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot'] - -failures = {} -durations = {} -attempts = {} - -for uuid_dict in updates: - uuid = uuid_dict['instance'] - - # All the unique Request ID's for this instance during that timespan. - reqs = models.RawData.objects.filter(instance=uuid, - when__gt=dstart, when__lte=dend) \ - .values('request_id').distinct() + # All the unique Request ID's for this instance during that timespan. 
+ reqs = models.RawData.objects.filter(instance=uuid, + when__gt=dstart, when__lte=dend) \ + .values('request_id').distinct() - for req_dict in reqs: - report = False - req = req_dict['request_id'] - raws = models.RawData.objects.filter(request_id=req)\ - .exclude(event='compute.instance.exists')\ - .order_by('when') + for req_dict in reqs: + req = req_dict['request_id'] + raws = models.RawData.objects.filter(request_id=req)\ + .exclude(event='compute.instance.exists')\ + .order_by('when') - start = None - err = None + start = None + err = None + failure_type = None - operation = "aux" - image_type_num = 0 + operation = "aux" + image_type_num = 0 + + for raw in raws: + if not start: + start = raw.when + + if 'error' in raw.routing_key: + err = raw + failure_type = 'http' + + if raw.old_state != 'error' and raw.state == 'error': + failure_type = 'state' + + if raw.old_state == 'error' and \ + (not raw.state in ['deleted', 'error']): + failure_type = None + + for cmd in cmds: + if cmd in raw.event: + operation = cmd + break + + if raw.image_type: + image_type_num |= raw.image_type + + image = "?" + if image_type.isset(image_type_num, image_type.BASE_IMAGE): + image = "base" + if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE): + image = "snap" - for raw in raws: if not start: - start = raw.when - if 'error' in raw.routing_key: - err = raw - report = True + continue - for cmd in cmds: - if cmd in raw.event: - operation = cmd - break + end = raw.when + diff = end - start - if raw.image_type: - image_type_num |= raw.image_type + if diff > 3600: + failure_type = '> 60' - image = "?" - if image_type.isset(image_type_num, image_type.BASE_IMAGE): - image = "base" - if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE): - image = "snap" + key = (operation, image) - if not start: - continue + # Track durations for all attempts, good and bad ... 
+ _durations = durations.get(key, []) + _durations.append(diff) + durations[key] = _durations - end = raw.when - diff = end - start + attempts[key] = attempts.get(key, 0) + 1 - if diff > 3600: - report = True + if failure_type: + if err: + queue, body = json.loads(err.json) + payload = body['payload'] + exc = payload.get('exception') + if exc: + code = int(exc.get('kwargs', {}).get('code', 0)) + if code >= 400 and code < 500: + failure_type = "4xx" + if code >= 500 and code < 600: + failure_type = "5xx" + breakdown = failures.get(key, {}) + breakdown[failure_type] = breakdown.get(failure_type, 0) + 1 + failures[key] = breakdown - key = (operation, image) + # Summarize the results ... + report = [] + pct = (float(100 - percentile) / 2.0) / 100.0 + details = {'percentile': percentile, 'pct': pct, 'hours': hours, + 'start': float(dstart), 'end': float(dend), 'region': region, + 'cells': cells} + report.append(details) - # Track durations for all attempts, good and bad ... - _durations = durations.get(key, []) - _durations.append(diff) - durations[key] = _durations + failure_types = ["4xx", "5xx", "> 60", "state"] + cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile, + "Requests"] + for failure_type in failure_types: + cols.append("%s" % failure_type) + cols.append("%% %s" % failure_type) + report.append(cols) - attempts[key] = attempts.get(key, 0) + 1 + total = 0 + failure_totals = {} + for key, count in attempts.iteritems(): + total += count + operation, image = key - if report: - failures[key] = failures.get(key, 0) + 1 + breakdown = failures.get(key, {}) + this_failure_pair = [] + for failure_type in failure_types: + # Failure counts for this attempt. + # Sum for grand totals. + failure_count = breakdown.get(failure_type, 0) + failure_totals[failure_type] = \ + failure_totals.get(failure_type, 0) + failure_count -# Print the results ... 
-cols = ["Operation", "Image", "Min*", "Max*", "Avg*", - "Requests", "# Fail", "Fail %"] -p = prettytable.PrettyTable(cols) -for c in cols[2:]: - p.align[c] = 'r' -p.sortby = cols[0] + # Failure percentage for this attempt. + percentage = float(failure_count) / float(count) + this_failure_pair.append((failure_count, percentage)) -pct = (float(100 - percentile) / 2.0) / 100.0 -print "* Using %d-th percentile for results (+/-%.1f%% cut)" % \ - (percentile, pct * 100.0) -total = 0 -failure_total = 0 -for key, count in attempts.iteritems(): - total += count - operation, image = key + # N-th % of durations ... + _values = durations[key] + _values.sort() + _min = 99999999 + _max = 0 + _total = 0.0 + for value in _values: + _min = min(_min, value) + _max = max(_max, value) + _total += float(value) + _num = len(_values) + _avg = float(_total) / float(_num) + half = _num / 2 + _median = _values[half] + _percentile_index = int((float(percentile) / 100.0) * float(_num)) + _percentile = _values[_percentile_index] - failure_count = failures.get(key, 0) - failure_total += failure_count - failure_percentage = float(failure_count) / float(count) - _failure_percentage = "%.1f%%" % (failure_percentage * 100.0) + _fmin = dt.sec_to_str(_min) + _fmax = dt.sec_to_str(_max) + _favg = dt.sec_to_str(_avg) + _fmedian = dt.sec_to_str(_median) + _fpercentile = dt.sec_to_str(_percentile) - # N-th % of durations ... 
- _values = durations[key] - _values.sort() - _outliers = int(float(len(_values)) * pct) - if _outliers > 0: - before = len(_values) - _values = _values[_outliers:-_outliers] - print "culling %d -> %d" % (before, len(_values)) - _min = 99999999 - _max = 0 - _total = 0.0 - for value in _values: - _min = min(_min, value) - _max = max(_max, value) - _total += float(value) - _avg = float(_total) / float(len(_values)) - _fmin = dt.sec_to_str(_min) - _fmax = dt.sec_to_str(_max) - _favg = dt.sec_to_str(_avg) + row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count] + for failure_count, failure_percentage in this_failure_pair: + row.append(failure_count) + row.append(failure_percentage) + report.append(row) - p.add_row([operation, image, _fmin, _fmax, _favg, count, - failure_count, _failure_percentage]) -print p + details['total'] = total + failure_grand_total = 0 + for failure_type in failure_types: + failure_total = failure_totals.get(failure_type, 0) + failure_grand_total += failure_total + details["%s failure count" % failure_type] = failure_total + failure_percentage = (float(failure_total)/float(total)) * 100.0 + details["%s failure percentage" % failure_type] = failure_percentage -print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \ - (total, failure_total, - (float(failure_total)/float(total)) * 100.0) + details['failure_grand_total'] = failure_grand_total + details['failure_grand_rate'] = (float(failure_grand_total)/float(total)) * 100.0 + return (rstart, rend, report) + + +def valid_date(date): + try: + t = time.strptime(date, "%Y-%m-%d") + return datetime.datetime(*t[:6]) + except Exception, e: + raise argparse.ArgumentTypeError( + "'%s' is not in YYYY-MM-DD format." % date) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('StackTach Nova Usage Summary Report') + parser.add_argument('--utcdate', + help='Report start date YYYY-MM-DD. 
Default yesterday midnight.', + type=valid_date, default=None) + parser.add_argument('--region', + help='Report Region. Default is all regions.', default=None) + parser.add_argument('--hours', + help='Report span in hours. Default: 24', default=24, + type=int) + parser.add_argument('--start_hour', + help='Starting hour 0-23. Default: 0', default=0, + type=int) + parser.add_argument('--percentile', + help='Percentile for timings. Default: 97', default=97, + type=int) + parser.add_argument('--store', + help='Store report in database. Default: False', + default=False, action="store_true") + parser.add_argument('--silent', + help="Do not show summary report. Default: False", + default=False, action="store_true") + args = parser.parse_args() + + yesterday = args.utcdate + percentile = args.percentile + hours = args.hours + start_hour = args.start_hour + store_report = args.store + region = args.region + + start, end, raw_report = make_report(yesterday, start_hour, hours, + percentile, store_report, region) + details = raw_report[0] + pct = details['pct'] + + region_name = "all" + if region: + region_name = region + + if store_report: + values = {'json': json.dumps(raw_report), + 'created': dt.dt_to_decimal(datetime.datetime.utcnow()), + 'period_start': start, + 'period_end': end, + 'version': 3, + 'name': 'summary for region: %s' % region_name} + report = models.JsonReport(**values) + report.save() + print "Report stored (id=%d)" % report.id + + if args.silent: + sys.exit(1) + + print "'%s' Report for %s to %s" % (region_name, start, end) + + cols = raw_report[1] + + # Print the results ... 
+ p = prettytable.PrettyTable(cols) + for c in cols[2:]: + p.align[c] = 'r' + p.sortby = cols[0] + + for row in raw_report[2:]: + frow = row[:] + for col in [8, 10, 12, 14]: + frow[col] = "%.1f%%" % (row[col] * 100.0) + p.add_row(frow) + print p + + total = details['total'] + failure_total = details['failure_grand_total'] + failure_rate = details['failure_grand_rate'] + print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \ + (total, failure_total, failure_rate) diff --git a/settings.py b/settings.py index 7233fcf..01da733 100644 --- a/settings.py +++ b/settings.py @@ -46,7 +46,7 @@ DATABASES = { # timezone as the operating system. # If running in a Windows environment this must be set to the same as your # system time zone. -TIME_ZONE = 'America/Chicago' +TIME_ZONE = None # Language code for this installation. All choices can be found here: # http://www.i18nguy.com/unicode/language-identifiers.html diff --git a/stacktach/models.py b/stacktach/models.py index 222a933..46f4597 100644 --- a/stacktach/models.py +++ b/stacktach/models.py @@ -20,6 +20,9 @@ from django.db import models class Deployment(models.Model): name = models.CharField(max_length=50) + def __repr__(self): + return self.name + class RawData(models.Model): deployment = models.ForeignKey(Deployment) @@ -154,3 +157,14 @@ class RequestTracker(models.Model): # Not used ... but soon hopefully. completed = models.BooleanField(default=False, db_index=True) + + +class JsonReport(models.Model): + """Stores cron-job reports in raw json format for extraction + via stacky/rest. 
All DateTimes are UTC.""" + period_start = models.DateTimeField(db_index=True) + period_end = models.DateTimeField(db_index=True) + created = models.DecimalField(max_digits=20, decimal_places=6, db_index=True) + name = models.CharField(max_length=50, db_index=True) + version = models.IntegerField(default=1) + json = models.TextField() diff --git a/stacktach/stacky_server.py b/stacktach/stacky_server.py index 994729c..5543aa3 100644 --- a/stacktach/stacky_server.py +++ b/stacktach/stacky_server.py @@ -4,6 +4,7 @@ import json from django.db.models import Q from django.http import HttpResponse +from django.shortcuts import get_object_or_404 import datetime_to_decimal as dt import models @@ -375,3 +376,30 @@ def do_list_usage_exists(request): exist.status]) return rsp(results) + + +def do_jsonreports(request): + yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1) + now = datetime.datetime.utcnow() + yesterday = dt.dt_to_decimal(yesterday) + now = dt.dt_to_decimal(now) + _from = request.GET.get('created_from', yesterday) + _to = request.GET.get('created_to', now) + reports = models.JsonReport.objects.filter(created__gte=_from, + created__lte=_to) + results = [] + results.append(['Id', 'Start', 'End', 'Created', 'Name', 'Version']) + for report in reports: + results.append([report.id, + float(dt.dt_to_decimal(report.period_start)), + float(dt.dt_to_decimal(report.period_end)), + float(report.created), + report.name, + report.version]) + return rsp(results) + + +def do_jsonreport(request, report_id): + report_id = int(report_id) + report = get_object_or_404(models.JsonReport, pk=report_id) + return rsp(report.json) diff --git a/stacktach/urls.py b/stacktach/urls.py index bfa6678..240e1ee 100644 --- a/stacktach/urls.py +++ b/stacktach/urls.py @@ -12,6 +12,9 @@ urlpatterns = patterns('', url(r'stacky/timings/uuid/$', 'stacktach.stacky_server.do_timings_uuid'), url(r'stacky/summary/$', 'stacktach.stacky_server.do_summary'), url(r'stacky/request/$', 
'stacktach.stacky_server.do_request'), + url(r'stacky/reports/$', 'stacktach.stacky_server.do_jsonreports'), + url(r'stacky/report/(?P<report_id>\d+)/$', + 'stacktach.stacky_server.do_jsonreport'), url(r'stacky/show/(?P\d+)/$', 'stacktach.stacky_server.do_show'), url(r'stacky/watch/(?P\d+)/$', diff --git a/tests/unit/test_dbapi.py b/tests/unit/test_dbapi.py index 3f993d3..3b5df59 100644 --- a/tests/unit/test_dbapi.py +++ b/tests/unit/test_dbapi.py @@ -9,7 +9,7 @@ import utils from utils import INSTANCE_ID_1 -class StacktachRawParsingTestCase(unittest.TestCase): +class DBAPITestCase(unittest.TestCase): def setUp(self):
        self.mox = mox.Mox()