This commit is contained in:
Thomas Maddox 2013-06-21 13:08:03 -05:00
parent 8ecd7a4a81
commit 93a0723544
2 changed files with 42 additions and 53 deletions

View File

@ -17,6 +17,16 @@ if __name__ != '__main__':
# To mask unique identifiers for categorizing notifications # To mask unique identifiers for categorizing notifications
def mask_msg(text): def mask_msg(text):
# Needs order because of how precedence affects masking.
#
# Example: REQ_ID has a UUID in it, but the meaning is different
# in this context, so best to grab those first.
#
# LG_NUM usually represents a memory size; with the number of flavors
# this can create a lot of noise.
#
# The intent is to remove noise from unimportant subtleties
masking_regex = ( masking_regex = (
(1, 'REQ_ID', (1, 'REQ_ID',
r"req-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}" r"req-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"
@ -117,14 +127,21 @@ if __name__ == '__main__':
day=yesterday.day) day=yesterday.day)
end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59) end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)
deployments = {}
instance_map = {} # { uuid : [request_id, request_id, ...] } instance_map = {} # { uuid : [request_id, request_id, ...] }
exception_counts = {} # { exception_message : count } exception_counts = {} # { exception_message : count }
event_counts = {} # { event_name : count } event_counts = {} # { event_name : count }
metadata = {'report_format': 'json', tenant_issues = {}
'instances': instance_map, codes = {}
'exception_counts': exception_counts, metadata = {
'event_counts': event_counts 'report_format': 'json',
} 'instances': instance_map,
'exception_counts': exception_counts,
'event_counts': event_counts,
'tenant_issues': tenant_issues,
'codes': codes,
}
# Tell Stacky to format as JSON and set placeholders for various summaries # Tell Stacky to format as JSON and set placeholders for various summaries
report = [metadata] report = [metadata]
@ -132,8 +149,6 @@ if __name__ == '__main__':
dstart = dt.dt_to_decimal(start) dstart = dt.dt_to_decimal(start)
dend = dt.dt_to_decimal(end) dend = dt.dt_to_decimal(end)
codes = {}
deployments = {}
for deploy in models.Deployment.objects.all(): for deploy in models.Deployment.objects.all():
deployments[deploy.id] = deploy.name deployments[deploy.id] = deploy.name
@ -145,12 +160,6 @@ if __name__ == '__main__':
expiry = 60 * 60 # 1 hour expiry = 60 * 60 # 1 hour
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot'] cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
failures = {}
causes = {}
durations = {}
successes = {}
tenant_issues = {}
for uuid_dict in updates: for uuid_dict in updates:
uuid = uuid_dict['instance'] uuid = uuid_dict['instance']
@ -224,42 +233,24 @@ if __name__ == '__main__':
if not _start: if not _start:
continue continue
image = "?"
if image_type.isset(image_type_num, image_type.BASE_IMAGE):
image = "base"
if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
image = "snap"
_end = _when _end = _when
diff = _end - _start diff = _end - _start
if diff > 3600 and failure_type is None: if diff > 1800 and failure_type is None:
failure_type = ">60" failure_type = ">30"
key = (operation, image_type_num, cell) if failure_type:
key = (operation, image_type_num, cell)
# Track durations for all attempts, good and bad ...
duration_min, duration_max, duration_count, duration_total = \
durations.get(key, (9999999, 0, 0, 0))
duration_min = min(duration_min, diff)
duration_max = max(duration_max, diff)
duration_count += 1
duration_total += diff
durations[key] = (duration_min, duration_max, duration_count,
duration_total)
if not failure_type:
successes[key] = successes.get(key, 0) + 1
else:
failed_request = {} failed_request = {}
message = [] # For exception message masking message = [] # For exception message masking
req_list.append(req) req_list.append(req)
instance_map[uuid] = req_list instance_map[uuid] = req_list
failed_request['req'] = req failed_request['req'] = req
failed_request['uuid'] = uuid
failed_request['tenant'] = tenant
failed_request['duration'] = "%.2f minutes" % (diff/60) failed_request['duration'] = "%.2f minutes" % (diff/60)
failed_request['operation'] = operation failed_request['operation'] = operation
failed_request['platform'] = image_type.readable(image_type_num) failed_request['platform'] = image_type.readable(image_type_num)
failures[key] = failures.get(key, 0) + 1
tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1 tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1
if err_id: if err_id:
@ -296,12 +287,12 @@ if __name__ == '__main__':
codes[code] = codes.get(code, 0) + 1 codes[code] = codes.get(code, 0) + 1
failure_type = code failure_type = code
failed_request['failure_type'] = failure_type failed_request['failure_type'] = failure_type
raws = models.RawData.objects.filter(request_id=req)\ raws = models.RawData.objects.filter(request_id=req)\
.exclude(event='compute.instance.exists')\ .exclude(event='compute.instance.exists')\
.order_by('when') .order_by('when')
failed_request['details'] = [] failed_request['details'] = []
for raw in raws: for raw in raws:
failure_detail = {} failure_detail = {}
failure_detail['host'] = raw.host failure_detail['host'] = raw.host
@ -310,13 +301,11 @@ if __name__ == '__main__':
failure_detail['state'] = raw.state failure_detail['state'] = raw.state
failure_detail['old_task'] = raw.old_task failure_detail['old_task'] = raw.old_task
failure_detail['task'] = raw.task failure_detail['task'] = raw.task
failed_request['details'].append(failure_detail) failed_request['details'].append(failure_detail)
report.append(failed_request) report.append(failed_request)
cause_key = (key, failure_type)
causes[cause_key] = causes.get(cause_key, 0) + 1
# Assign values to store in DB # Assign values to store in DB
values = {'json': json.dumps(report), values = {'json': json.dumps(report),
'created': dt.dt_to_decimal(datetime.datetime.utcnow()), 'created': dt.dt_to_decimal(datetime.datetime.utcnow()),

View File

@ -16,17 +16,16 @@ from stacktach import models
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
store=False, region=None, too_long=1800): store=False, region=None, too_long=1800):
if not yesterday: if not yesterday:
yesterday = datetime.datetime.utcnow().date() - \ yesterday = datetime.datetime.utcnow().date() -\
datetime.timedelta(days=1) datetime.timedelta(days=1)
rstart = datetime.datetime(year=yesterday.year, month=yesterday.month, rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
day=yesterday.day, hour=start_hour) day=yesterday.day, hour=start_hour)
rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59) rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
dstart = dt.dt_to_decimal(rstart) dstart = dt.dt_to_decimal(rstart)
dend = dt.dt_to_decimal(rend) dend = dt.dt_to_decimal(rend)
codes = {}
too_long_col = '> %d' % (too_long / 60) too_long_col = '> %d' % (too_long / 60)
cells = [] cells = []
@ -87,11 +86,12 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
err = raw err = raw
failure_type = 'http' failure_type = 'http'
if raw.old_state != 'error' and raw.state == 'error': if failure_type != 'state' and raw.old_state != 'error'\
and raw.state == 'error':
failure_type = 'state' failure_type = 'state'
if raw.old_state == 'error' and \ if raw.old_state == 'error' and \
(not raw.state in ['deleted', 'error']): (not raw.state in ['deleted', 'error']):
failure_type = None failure_type = None
for cmd in cmds: for cmd in cmds:
@ -110,7 +110,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
image = "snap" image = "snap"
#Get os_type from image_type bit field #Get os_type from image_type bit field
os_type = "other" os_type = "?"
if image_type.isset(image_type_num, image_type.LINUX_IMAGE): if image_type.isset(image_type_num, image_type.LINUX_IMAGE):
os_type = "linux" os_type = "linux"
if image_type.isset(image_type_num, image_type.WINDOWS_IMAGE): if image_type.isset(image_type_num, image_type.WINDOWS_IMAGE):
@ -122,7 +122,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
end = raw.when end = raw.when
diff = end - start diff = end - start
if diff > too_long and failure_type == None: if diff > too_long and failure_type is None:
failure_type = too_long_col failure_type = too_long_col
key = (operation, image, os_type) key = (operation, image, os_type)
@ -158,7 +158,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
report.append(details) report.append(details)
failure_types = ["4xx", "5xx", too_long_col, "state"] failure_types = ["4xx", "5xx", too_long_col, "state"]
cols = ["Operation", "Image", "OS", "Min", "Max", "Med", "%d%%" % percentile, cols = ["Operation", "Image", "OS Type", "Min", "Max", "Med", "%d%%" % percentile,
"Requests"] "Requests"]
for failure_type in failure_types: for failure_type in failure_types:
cols.append("%s" % failure_type) cols.append("%s" % failure_type)
@ -178,7 +178,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
# Sum for grand totals. # Sum for grand totals.
failure_count = breakdown.get(failure_type, 0) failure_count = breakdown.get(failure_type, 0)
failure_totals[failure_type] = \ failure_totals[failure_type] = \
failure_totals.get(failure_type, 0) + failure_count failure_totals.get(failure_type, 0) + failure_count
# Failure percentage for this attempt. # Failure percentage for this attempt.
percentage = float(failure_count) / float(count) percentage = float(failure_count) / float(count)
@ -231,9 +231,9 @@ def valid_date(date):
try: try:
t = time.strptime(date, "%Y-%m-%d") t = time.strptime(date, "%Y-%m-%d")
return datetime.datetime(*t[:6]) return datetime.datetime(*t[:6])
except Exception, e: except Exception:
raise argparse.ArgumentTypeError( raise argparse.ArgumentTypeError(
"'%s' is not in YYYY-MM-DD format." % date) "'%s' is not in YYYY-MM-DD format." % date)
if __name__ == '__main__': if __name__ == '__main__':