Merge branch 'master' of git://github.com/rackspace/stacktach into limit_offet_args

Author: Andrew Melton  2013-02-25 14:01:19 -05:00
Commit: 924f320128
9 changed files with 399 additions and 125 deletions

@@ -3,3 +3,4 @@ MySQL-python>=1.2.3
eventlet>=0.9.17
kombu>=2.4.7
librabbitmq>=1.0.0
argparse

migrations/004_usage.sql (new file)

@@ -0,0 +1,86 @@
BEGIN;
CREATE TABLE `stacktach_deployment` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`name` varchar(50) NOT NULL
)
;
CREATE TABLE `stacktach_rawdata` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`deployment_id` integer NOT NULL,
`tenant` varchar(50),
`json` longtext NOT NULL,
`routing_key` varchar(50),
`state` varchar(20),
`old_state` varchar(20),
`old_task` varchar(30),
`task` varchar(30),
`image_type` integer,
`when` numeric(20, 6) NOT NULL,
`publisher` varchar(100),
`event` varchar(50),
`service` varchar(50),
`host` varchar(100),
`instance` varchar(50),
`request_id` varchar(50)
)
;
ALTER TABLE `stacktach_rawdata` ADD CONSTRAINT `deployment_id_refs_id_362370d` FOREIGN KEY (`deployment_id`) REFERENCES `stacktach_deployment` (`id`);
CREATE TABLE `stacktach_lifecycle` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`instance` varchar(50),
`last_state` varchar(50),
`last_task_state` varchar(50),
`last_raw_id` integer
)
;
ALTER TABLE `stacktach_lifecycle` ADD CONSTRAINT `last_raw_id_refs_id_2a04e82d` FOREIGN KEY (`last_raw_id`) REFERENCES `stacktach_rawdata` (`id`);
CREATE TABLE `stacktach_instanceusage` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`instance` varchar(50),
`launched_at` numeric(20, 6),
`deleted_at` numeric(20, 6),
`request_id` varchar(50),
`instance_type_id` varchar(50)
)
;
CREATE TABLE `stacktach_instanceexists` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`instance` varchar(50),
`launched_at` numeric(20, 6),
`deleted_at` numeric(20, 6),
`message_id` varchar(50),
`instance_type_id` varchar(50),
`status` varchar(50) NOT NULL,
`raw_id` integer,
`usage_id` integer
)
;
ALTER TABLE `stacktach_instanceexists` ADD CONSTRAINT `usage_id_refs_id_3b13299b` FOREIGN KEY (`usage_id`) REFERENCES `stacktach_instanceusage` (`id`);
ALTER TABLE `stacktach_instanceexists` ADD CONSTRAINT `raw_id_refs_id_65c72953` FOREIGN KEY (`raw_id`) REFERENCES `stacktach_rawdata` (`id`);
CREATE TABLE `stacktach_timing` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`name` varchar(50) NOT NULL,
`lifecycle_id` integer NOT NULL,
`start_raw_id` integer,
`end_raw_id` integer,
`start_when` numeric(20, 6),
`end_when` numeric(20, 6),
`diff` numeric(20, 6)
)
;
ALTER TABLE `stacktach_timing` ADD CONSTRAINT `start_raw_id_refs_id_3cd201fc` FOREIGN KEY (`start_raw_id`) REFERENCES `stacktach_rawdata` (`id`);
ALTER TABLE `stacktach_timing` ADD CONSTRAINT `end_raw_id_refs_id_3cd201fc` FOREIGN KEY (`end_raw_id`) REFERENCES `stacktach_rawdata` (`id`);
ALTER TABLE `stacktach_timing` ADD CONSTRAINT `lifecycle_id_refs_id_4255ead8` FOREIGN KEY (`lifecycle_id`) REFERENCES `stacktach_lifecycle` (`id`);
CREATE TABLE `stacktach_requesttracker` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
`request_id` varchar(50) NOT NULL,
`lifecycle_id` integer NOT NULL,
`last_timing_id` integer,
`start` numeric(20, 6) NOT NULL,
`duration` numeric(20, 6) NOT NULL,
`completed` bool NOT NULL
)
;
ALTER TABLE `stacktach_requesttracker` ADD CONSTRAINT `last_timing_id_refs_id_f7d8336` FOREIGN KEY (`last_timing_id`) REFERENCES `stacktach_timing` (`id`);
ALTER TABLE `stacktach_requesttracker` ADD CONSTRAINT `lifecycle_id_refs_id_e457729` FOREIGN KEY (`lifecycle_id`) REFERENCES `stacktach_lifecycle` (`id`);
COMMIT;
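For anyone applying this migration by hand, a minimal sketch using MySQL-python (already pinned in the requirements file above) follows; the connection parameters are placeholders for your own deployment, and the naive split on ';' is safe here only because this particular migration contains no semicolons inside string literals:

import MySQLdb

# Placeholder credentials; substitute your deployment's settings.
conn = MySQLdb.connect(host="localhost", user="stacktach",
                       passwd="secret", db="stacktach")
cursor = conn.cursor()
with open("migrations/004_usage.sql") as f:
    # Execute each statement individually; BEGIN/COMMIT pass through as-is.
    for statement in f.read().split(";"):
        statement = statement.strip()
        if statement:
            cursor.execute(statement)
conn.commit()
cursor.close()
conn.close()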

@@ -0,0 +1,3 @@
To add the JsonReport table, run:
python manage.py syncdb

@@ -1,3 +1,4 @@
import argparse
import datetime
import json
import sys
@@ -12,155 +13,293 @@ from stacktach import image_type
from stacktach import models
if __name__ != '__main__':
sys.exit(1)
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
store=False, region=None):
if not yesterday:
yesterday = datetime.datetime.utcnow().date() - \
datetime.timedelta(days=1)
yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
if len(sys.argv) == 2:
try:
t = time.strptime(sys.argv[1], "%Y-%m-%d")
yesterday = datetime.datetime(*t[:6])
except Exception, e:
print e
print "Usage: python requests.py YYYY-MM-DD (the end date)"
rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
day=yesterday.day, hour=start_hour)
rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
dstart = dt.dt_to_decimal(rstart)
dend = dt.dt_to_decimal(rend)
codes = {}
cells = []
regions = []
if region:
region = region.upper()
deployments = models.Deployment.objects.all()
for deployment in deployments:
name = deployment.name.upper()
if not region or region in name:
regions.append(deployment.id)
cells.append(deployment.name)
if not len(regions):
print "No regions found for '%s'" % region
sys.exit(1)
percentile = 90
hours = 24
# Get all the instances that have changed in the last N hours ...
updates = models.RawData.objects.filter(event='compute.instance.update',
when__gt=dstart, when__lte=dend,
deployment__in=regions)\
.values('instance').distinct()
start = datetime.datetime(year=yesterday.year, month=yesterday.month,
day=yesterday.day)
end = start + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
expiry = 60 * 60 # 1 hour
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
print "Generating report for %s to %s" % (start, end)
failures = {} # { key : {failure_type: count} }
durations = {}
attempts = {}
dstart = dt.dt_to_decimal(start)
dend = dt.dt_to_decimal(end)
for uuid_dict in updates:
uuid = uuid_dict['instance']
codes = {}
# Get all the instances that have changed in the last N hours ...
updates = models.RawData.objects.filter(event='compute.instance.update',
when__gt=dstart, when__lte=dend)\
.values('instance').distinct()
expiry = 60 * 60 # 1 hour
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
failures = {}
durations = {}
attempts = {}
for uuid_dict in updates:
uuid = uuid_dict['instance']
# All the unique Request ID's for this instance during that timespan.
reqs = models.RawData.objects.filter(instance=uuid,
when__gt=dstart, when__lte=dend) \
.values('request_id').distinct()
# All the unique Request ID's for this instance during that timespan.
reqs = models.RawData.objects.filter(instance=uuid,
when__gt=dstart, when__lte=dend) \
.values('request_id').distinct()
for req_dict in reqs:
report = False
req = req_dict['request_id']
raws = models.RawData.objects.filter(request_id=req)\
.exclude(event='compute.instance.exists')\
.order_by('when')
for req_dict in reqs:
req = req_dict['request_id']
raws = models.RawData.objects.filter(request_id=req)\
.exclude(event='compute.instance.exists')\
.order_by('when')
start = None
err = None
start = None
err = None
failure_type = None
operation = "aux"
image_type_num = 0
operation = "aux"
image_type_num = 0
for raw in raws:
if not start:
start = raw.when
if 'error' in raw.routing_key:
err = raw
failure_type = 'http'
if raw.old_state != 'error' and raw.state == 'error':
failure_type = 'state'
if raw.old_state == 'error' and \
(not raw.state in ['deleted', 'error']):
failure_type = None
for cmd in cmds:
if cmd in raw.event:
operation = cmd
break
if raw.image_type:
image_type_num |= raw.image_type
image = "?"
if image_type.isset(image_type_num, image_type.BASE_IMAGE):
image = "base"
if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
image = "snap"
for raw in raws:
if not start:
start = raw.when
if 'error' in raw.routing_key:
err = raw
report = True
continue
for cmd in cmds:
if cmd in raw.event:
operation = cmd
break
end = raw.when
diff = end - start
if raw.image_type:
image_type_num |= raw.image_type
if diff > 3600:
failure_type = '> 60'
image = "?"
if image_type.isset(image_type_num, image_type.BASE_IMAGE):
image = "base"
if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
image = "snap"
key = (operation, image)
if not start:
continue
# Track durations for all attempts, good and bad ...
_durations = durations.get(key, [])
_durations.append(diff)
durations[key] = _durations
end = raw.when
diff = end - start
attempts[key] = attempts.get(key, 0) + 1
if diff > 3600:
report = True
if failure_type:
if err:
queue, body = json.loads(err.json)
payload = body['payload']
exc = payload.get('exception')
if exc:
code = int(exc.get('kwargs', {}).get('code', 0))
if code >= 400 and code < 500:
failure_type = "4xx"
if code >= 500 and code < 600:
failure_type = "5xx"
breakdown = failures.get(key, {})
breakdown[failure_type] = breakdown.get(failure_type, 0) + 1
failures[key] = breakdown
key = (operation, image)
# Summarize the results ...
report = []
pct = (float(100 - percentile) / 2.0) / 100.0
details = {'percentile': percentile, 'pct': pct, 'hours': hours,
'start': float(dstart), 'end': float(dend), 'region': region,
'cells': cells}
report.append(details)
# Track durations for all attempts, good and bad ...
_durations = durations.get(key, [])
_durations.append(diff)
durations[key] = _durations
failure_types = ["4xx", "5xx", "> 60", "state"]
cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile,
"Requests"]
for failure_type in failure_types:
cols.append("%s" % failure_type)
cols.append("%% %s" % failure_type)
report.append(cols)
attempts[key] = attempts.get(key, 0) + 1
total = 0
failure_totals = {}
for key, count in attempts.iteritems():
total += count
operation, image = key
if report:
failures[key] = failures.get(key, 0) + 1
breakdown = failures.get(key, {})
this_failure_pair = []
for failure_type in failure_types:
# Failure counts for this attempt.
# Sum for grand totals.
failure_count = breakdown.get(failure_type, 0)
failure_totals[failure_type] = \
failure_totals.get(failure_type, 0) + failure_count
# Print the results ...
cols = ["Operation", "Image", "Min*", "Max*", "Avg*",
"Requests", "# Fail", "Fail %"]
p = prettytable.PrettyTable(cols)
for c in cols[2:]:
p.align[c] = 'r'
p.sortby = cols[0]
# Failure percentage for this attempt.
percentage = float(failure_count) / float(count)
this_failure_pair.append((failure_count, percentage))
pct = (float(100 - percentile) / 2.0) / 100.0
print "* Using %d-th percentile for results (+/-%.1f%% cut)" % \
(percentile, pct * 100.0)
total = 0
failure_total = 0
for key, count in attempts.iteritems():
total += count
operation, image = key
# N-th % of durations ...
_values = durations[key]
_values.sort()
_min = 99999999
_max = 0
_total = 0.0
for value in _values:
_min = min(_min, value)
_max = max(_max, value)
_total += float(value)
_num = len(_values)
_avg = float(_total) / float(_num)
half = _num / 2
_median = _values[half]
_percentile_index = int((float(percentile) / 100.0) * float(_num))
_percentile = _values[_percentile_index]
failure_count = failures.get(key, 0)
failure_total += failure_count
failure_percentage = float(failure_count) / float(count)
_failure_percentage = "%.1f%%" % (failure_percentage * 100.0)
_fmin = dt.sec_to_str(_min)
_fmax = dt.sec_to_str(_max)
_favg = dt.sec_to_str(_avg)
_fmedian = dt.sec_to_str(_median)
_fpercentile = dt.sec_to_str(_percentile)
# N-th % of durations ...
_values = durations[key]
_values.sort()
_outliers = int(float(len(_values)) * pct)
if _outliers > 0:
before = len(_values)
_values = _values[_outliers:-_outliers]
print "culling %d -> %d" % (before, len(_values))
_min = 99999999
_max = 0
_total = 0.0
for value in _values:
_min = min(_min, value)
_max = max(_max, value)
_total += float(value)
_avg = float(_total) / float(len(_values))
_fmin = dt.sec_to_str(_min)
_fmax = dt.sec_to_str(_max)
_favg = dt.sec_to_str(_avg)
row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count]
for failure_count, failure_percentage in this_failure_pair:
row.append(failure_count)
row.append(failure_percentage)
report.append(row)
p.add_row([operation, image, _fmin, _fmax, _favg, count,
failure_count, _failure_percentage])
print p
details['total'] = total
failure_grand_total = 0
for failure_type in failure_types:
failure_total = failure_totals.get(failure_type, 0)
failure_grand_total += failure_total
details["%s failure count" % failure_type] = failure_total
failure_percentage = (float(failure_total)/float(total)) * 100.0
details["%s failure percentage" % failure_type] = failure_percentage
print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \
(total, failure_total,
(float(failure_total)/float(total)) * 100.0)
details['failure_grand_total'] = failure_grand_total
details['failure_grand_rate'] = (float(failure_grand_total)/float(total)) * 100.0
return (rstart, rend, report)
def valid_date(date):
try:
t = time.strptime(date, "%Y-%m-%d")
return datetime.datetime(*t[:6])
except Exception, e:
raise argparse.ArgumentTypeError(
"'%s' is not in YYYY-MM-DD format." % date)
if __name__ == '__main__':
parser = argparse.ArgumentParser('StackTach Nova Usage Summary Report')
parser.add_argument('--utcdate',
help='Report start date YYYY-MM-DD. Default yesterday midnight.',
type=valid_date, default=None)
parser.add_argument('--region',
help='Report Region. Default is all regions.', default=None)
parser.add_argument('--hours',
help='Report span in hours. Default: 24', default=24,
type=int)
parser.add_argument('--start_hour',
help='Starting hour 0-23. Default: 0', default=0,
type=int)
parser.add_argument('--percentile',
help='Percentile for timings. Default: 97', default=97,
type=int)
parser.add_argument('--store',
help='Store report in database. Default: False',
default=False, action="store_true")
parser.add_argument('--silent',
help="Do not show summary report. Default: False",
default=False, action="store_true")
args = parser.parse_args()
yesterday = args.utcdate
percentile = args.percentile
hours = args.hours
start_hour = args.start_hour
store_report = args.store
region = args.region
start, end, raw_report = make_report(yesterday, start_hour, hours,
percentile, store_report, region)
details = raw_report[0]
pct = details['pct']
region_name = "all"
if region:
region_name = region
if store_report:
values = {'json': json.dumps(raw_report),
'created': dt.dt_to_decimal(datetime.datetime.utcnow()),
'period_start': start,
'period_end': end,
'version': 3,
'name': 'summary for region: %s' % region_name}
report = models.JsonReport(**values)
report.save()
print "Report stored (id=%d)" % report.id
if args.silent:
sys.exit(1)
print "'%s' Report for %s to %s" % (region_name, start, end)
cols = raw_report[1]
# Print the results ...
p = prettytable.PrettyTable(cols)
for c in cols[2:]:
p.align[c] = 'r'
p.sortby = cols[0]
for row in raw_report[2:]:
frow = row[:]
for col in [8, 10, 12, 14]:
frow[col] = "%.1f%%" % (row[col] * 100.0)
p.add_row(frow)
print p
total = details['total']
failure_total = details['failure_grand_total']
failure_rate = details['failure_grand_rate']
print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \
(total, failure_total, failure_rate)
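Since the stored report is just the JSON dump of the [details, columns, row, ...] structure built above, it can be unpacked again in a few lines. A minimal sketch, assuming at least one JsonReport row already exists:

import json
from stacktach import models

# Fetch the most recently stored summary report.
report = models.JsonReport.objects.latest('created')
data = json.loads(report.json)
details, cols, rows = data[0], data[1], data[2:]
print "Report '%s': %d requests over %s hours" % (
    report.name, details['total'], details['hours'])
for row in rows:
    # Column headers and row values line up one-to-one.
    print dict(zip(cols, row))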

@@ -46,7 +46,7 @@ DATABASES = {
# timezone as the operating system.
# If running in a Windows environment this must be set to the same as your
# system time zone.
TIME_ZONE = 'America/Chicago'
TIME_ZONE = None
# Language code for this installation. All choices can be found here:
# http://www.i18nguy.com/unicode/language-identifiers.html

@@ -20,6 +20,9 @@ from django.db import models
class Deployment(models.Model):
name = models.CharField(max_length=50)
def __repr__(self):
return self.name
class RawData(models.Model):
deployment = models.ForeignKey(Deployment)
@@ -154,3 +157,14 @@ class RequestTracker(models.Model):
# Not used ... but soon hopefully.
completed = models.BooleanField(default=False, db_index=True)
class JsonReport(models.Model):
"""Stores cron-job reports in raw json format for extraction
via stacky/rest. All DateTimes are UTC."""
period_start = models.DateTimeField(db_index=True)
period_end = models.DateTimeField(db_index=True)
created = models.DecimalField(max_digits=20, decimal_places=6, db_index=True)
name = models.CharField(max_length=50, db_index=True)
version = models.IntegerField(default=1)
json = models.TextField()

@@ -4,6 +4,7 @@ import json
from django.db.models import Q
from django.http import HttpResponse
from django.shortcuts import get_object_or_404
import datetime_to_decimal as dt
import models
@@ -375,3 +376,30 @@ def do_list_usage_exists(request):
exist.status])
return rsp(results)
def do_jsonreports(request):
yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
now = datetime.datetime.utcnow()
yesterday = dt.dt_to_decimal(yesterday)
now = dt.dt_to_decimal(now)
_from = request.GET.get('created_from', yesterday)
_to = request.GET.get('created_to', now)
reports = models.JsonReport.objects.filter(created__gte=_from,
created__lte=_to)
results = []
results.append(['Id', 'Start', 'End', 'Created', 'Name', 'Version'])
for report in reports:
results.append([report.id,
float(dt.dt_to_decimal(report.period_start)),
float(dt.dt_to_decimal(report.period_end)),
float(report.created),
report.name,
report.version])
return rsp(results)
def do_jsonreport(request, report_id):
report_id = int(report_id)
report = get_object_or_404(models.JsonReport, pk=report_id)
return rsp(report.json)

@@ -12,6 +12,9 @@ urlpatterns = patterns('',
url(r'stacky/timings/uuid/$', 'stacktach.stacky_server.do_timings_uuid'),
url(r'stacky/summary/$', 'stacktach.stacky_server.do_summary'),
url(r'stacky/request/$', 'stacktach.stacky_server.do_request'),
url(r'stacky/reports/$', 'stacktach.stacky_server.do_jsonreports'),
url(r'stacky/report/(?P<report_id>\d+)/$',
'stacktach.stacky_server.do_jsonreport'),
url(r'stacky/show/(?P<event_id>\d+)/$',
'stacktach.stacky_server.do_show'),
url(r'stacky/watch/(?P<deployment_id>\d+)/$',
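With these routes in place, the new endpoints can be exercised from any HTTP client. A hedged sketch using urllib2, assuming a StackTach instance listening on localhost:8000 (host and port are placeholders) and that rsp() serializes the result rows as JSON:

import json
import urllib2

base = "http://localhost:8000/stacky"

# List available reports; the first row of the response is the header.
listing = json.loads(urllib2.urlopen(base + "/reports/").read())
for row in listing[1:]:
    report_id, name = row[0], row[4]
    body = urllib2.urlopen("%s/report/%d/" % (base, report_id)).read()
    print "report %d (%s): %d bytes" % (report_id, name, len(body))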

@@ -9,7 +9,7 @@ import utils
from utils import INSTANCE_ID_1
class StacktachRawParsingTestCase(unittest.TestCase):
class DBAPITestCase(unittest.TestCase):
def setUp(self):
self.mox = mox.Mox()