324 lines
11 KiB
Python
324 lines
11 KiB
Python
import datetime
|
|
import json
|
|
import sys
|
|
import time
|
|
import os
|
|
import re
|
|
|
|
sys.path.append(os.environ.get('STACKTACH_INSTALL_DIR', '/stacktach'))
|
|
from stacktach import datetime_to_decimal as dt
|
|
from stacktach import image_type
|
|
from stacktach import models
|
|
|
|
|
|
# This file is a report script, not a library: when it is loaded under any
# module name other than '__main__' it terminates the interpreter with exit
# status 1 instead of continuing.
if __name__ != '__main__':
    raise SystemExit(1)
# To mask unique identifiers for categorizing notifications.
#
# Ordered (placeholder name, compiled pattern) pairs.  The order matters
# because precedence affects masking:
#
# Example: REQ_ID has a UUID in it, but the meaning is different
#          in this context, so best to grab those first.
#
# LG_NUM usually represents a memory size; with the number of flavors
# this can create a lot of noise.
#
# The intent is to remove noise from unimportant subtleties.
_MASKING_PATTERNS = tuple(
    (name, re.compile(pattern)) for name, pattern in (
        ('REQ_ID',
         r"req-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"),
        ('UUID',
         r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"),
        ('HOST_ADDRESS',
         r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"),
        ('LG_NUM',
         r"\b\d{3}\d+\b"),
    )
)


def mask_msg(text):
    """
    Replace unique identifiers in ``text`` with ``$NAME`` placeholders.

    ``text`` is converted with ``str()`` first, so non-string input
    (including ``None``) is accepted.  Request ids, UUIDs, dotted-quad
    addresses and numbers of four or more digits are substituted, in that
    order, by ``$REQ_ID``, ``$UUID``, ``$HOST_ADDRESS`` and ``$LG_NUM``.

    Returns the masked string.
    """
    masked = str(text)
    for name, pattern in _MASKING_PATTERNS:
        masked = pattern.sub("$%s" % name, masked)
    return masked
# Assemble message from exception object
try:
    _STRING_TYPES = (basestring,)  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3


def build_exc_msg(exc=None, separator=", "):
    """
    White-list exception components we're aware of, and leave a catch all;
    because of freeform exception objects from notifications.

    ``exc`` is the exception dict taken from a notification payload (or
    None).  ``separator`` joins the collected "key: value" fragments.

    Components are emitted in this order:
      * from ``exc['kwargs']``: ``value`` (text from the last "Traceback"
        onwards is replaced by ``$TRACEBACK``), then ``reason``, ``method``,
        ``topic``, ``exc_type``, ``actual``, ``code``, then ``expected``
        (first element when it is a list/tuple, otherwise the value itself);
      * ``exc['details']`` when it is a list, dict or string;
      * from ``exc``: ``message``, ``cmd``, ``stderr``, ``exit_code``,
        ``code``, ``description``, then ``stdout``.
    Missing or falsy components are skipped.  If nothing was collected,
    every key/value pair of ``exc`` is emitted instead.

    Returns None when ``exc`` is None, otherwise the joined string.
    """
    if exc is None:
        return exc

    message = []

    kwargs = exc.get('kwargs')
    if kwargs:
        value = kwargs.get('value')
        if value:
            # Only text can carry a traceback; anything else is reported
            # as-is instead of crashing on a missing .rfind().
            if isinstance(value, _STRING_TYPES):
                trcbk_index = value.rfind("Traceback")
                # rfind() returns -1 when absent, so index 0 is a real hit.
                if trcbk_index >= 0:
                    value = value[:trcbk_index] + "$TRACEBACK"
            # One-element tuple so a tuple value is not unpacked by "%".
            message.append("value: %s" % (value,))

        # kwargs: generic message components that don't require more filter
        for key in ('reason', 'method', 'topic', 'exc_type',
                    'actual', 'code'):
            if kwargs.get(key):
                message.append("%s: %s" % (key, kwargs[key]))
        # END generic message components in kwargs

        expected = kwargs.get('expected')
        if expected:
            # A sequence of acceptable values is summarised by its first
            # entry; a scalar is reported whole rather than indexed.
            if isinstance(expected, (list, tuple)):
                expected = expected[0]
            message.append("expected: %s" % (expected,))

    details = exc.get('details')
    if details:
        if isinstance(details, list):
            message.extend(str(item) for item in details)
        elif isinstance(details, dict):
            for k, v in details.items():
                message.append("%s: %s" % (k, v))
        elif isinstance(details, _STRING_TYPES):
            message.append(details)

    # exc: generic messages that don't require more filter
    for key in ('message', 'cmd', 'stderr', 'exit_code',
                'code', 'description'):
        if exc.get(key):
            message.append("%s: %s" % (key, exc[key]))

    if exc.get('stdout'):
        message.append("stdout: %s" % (exc['stdout'],))
    # END generic message components in exc

    # Catch all: nothing recognised, so report whatever the dict holds.
    if not message:
        for k, v in exc.items():
            message.append("%s: %s" % (k, v))

    return separator.join(message)
if __name__ == '__main__':
    # Builds the "Error detail report" for a single day and stores it as a
    # JsonReport row.
    #
    # Usage: python error_details.py [YYYY-MM-DD]
    # Without an argument the report covers yesterday (UTC).

    # Start report
    yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
    if len(sys.argv) == 2:
        try:
            t = time.strptime(sys.argv[1], "%Y-%m-%d")
            yesterday = datetime.datetime(*t[:6])
        except ValueError as e:
            # strptime() raises ValueError for a date that does not match
            # the format.  "as" and call-style print are valid on both
            # Python 2.6+ and Python 3.
            print(e)
            print("Usage: python error_details.py YYYY-MM-DD (the end date)")
            sys.exit(1)

    length = 24  # hours covered by the report

    # Reporting window: 00:00:00 through 23:59:59 of the chosen day.
    start = datetime.datetime(year=yesterday.year, month=yesterday.month,
                              day=yesterday.day)
    end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)

    deployments = {}  # { deployment id : deployment name }

    instance_map = {}  # { uuid : [request_id, request_id, ...] }
    exception_counts = {}  # { exception_message : count }
    event_counts = {}  # { event_name : count }
    tenant_issues = {}  # { tenant : failed request count }
    codes = {}  # { exception code : count }
    metadata = {
        'report_format': 'json',
        'instances': instance_map,
        'exception_counts': exception_counts,
        'event_counts': event_counts,
        'tenant_issues': tenant_issues,
        'codes': codes,
    }

    # Tell Stacky to format as JSON and set placeholders for various summaries
    report = [metadata]

    dstart = dt.dt_to_decimal(start)
    dend = dt.dt_to_decimal(end)

    for deploy in models.Deployment.objects.all():
        deployments[deploy.id] = deploy.name

    # Get all the instances that have changed in the reporting window ...
    updates = models.RawData.objects.filter(event='compute.instance.update',
                                            when__gt=dstart, when__lte=dend)\
                                    .values('instance').distinct()

    cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

    # exclude() ANDs the lookups passed in a single call, which would only
    # drop rows that are *both* instance-less and '.exists' events.  Chain
    # two calls so that either condition alone removes the row, matching
    # the per-request queries below.
    requests = models.RawData.objects.filter(when__gt=dstart, when__lte=dend)\
                                     .exclude(instance=None)\
                                     .exclude(event='compute.instance.exists')\
                                     .values('request_id', 'instance')\
                                     .distinct()

    # { uuid : [request_id, ...] } for every request seen in the window.
    inst_recs = {}
    for request in requests:
        inst_recs.setdefault(request['instance'], [])\
                 .append(request['request_id'])

    for uuid_dict in updates:
        uuid = uuid_dict['instance']

        req_list = []
        for req in inst_recs.get(uuid, []):
            raws = list(models.RawData.objects.filter(request_id=req)
                        .exclude(event='compute.instance.exists')
                        .values("id", "when", "routing_key", "old_state",
                                "state", "tenant", "event", "image_type",
                                "deployment")
                        .order_by('when'))

            _start = None
            _when = None

            err_id = None
            failure_type = None
            operation = "n/a"
            tenant = 0
            # NOTE(review): cell is resolved below but is not currently
            # written into the report.
            cell = "n/a"
            image_type_num = 0

            # Walk the request's events in time order, tracking the most
            # recent failure indication.
            for raw in raws:
                _when = raw['when']
                _routing_key = raw['routing_key']
                _old_state = raw['old_state']
                _state = raw['state']
                _tenant = raw['tenant']
                _event = raw['event']
                _image_type = raw['image_type']
                _name = raw['deployment']
                _id = raw['id']

                if not _start:
                    _start = _when

                # Notification arrived on an error routing key.
                if 'error' in _routing_key:
                    err_id = _id
                    failure_type = 'http'

                # Instance transitioned into the error state.
                if failure_type != 'state' and _old_state != 'error' and\
                        _state == 'error':
                    failure_type = 'state'
                    err_id = _id

                # Instance left the error state for something other than
                # deleted/error: forget the failure seen so far.
                if _old_state == 'error' and \
                        _state not in ('deleted', 'error'):
                    failure_type = None
                    err_id = None

                if _tenant:
                    tenant = _tenant

                for cmd in cmds:
                    if cmd in _event:
                        operation = cmd
                        cell = deployments.get(_name, "n/a")
                        break

                if _image_type:
                    image_type_num |= _image_type

            if not _start:
                continue

            _end = _when
            diff = _end - _start

            # No explicit failure, but the request spanned more than
            # 1800 seconds: report it as a ">30" (minutes) failure.
            if diff > 1800 and failure_type is None:
                failure_type = ">30"

            if failure_type:
                req_list.append(req)
                instance_map[uuid] = req_list
                failed_request = {
                    'req': req,
                    'uuid': uuid,
                    'tenant': tenant,
                    'duration': "%.2f minutes" % (diff/60),
                    'operation': operation,
                    'platform': image_type.readable(image_type_num),
                }
                tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1

                if err_id:
                    err = models.RawData.objects.get(id=err_id)
                    queue, body = json.loads(err.json)
                    payload = body['payload']

                    # Add error information to failed request report
                    failed_request['event_id'] = err.id
                    failed_request['tenant'] = err.tenant
                    failed_request['service'] = err.service
                    failed_request['host'] = err.host
                    failed_request['deployment'] = err.deployment.name
                    failed_request['event'] = err.event
                    failed_request['when'] = str(dt.dt_from_decimal(err.when))

                    # Track failed event counts
                    event_counts[err.event] = event_counts.get(err.event, 0) + 1

                    exc = payload.get('exception')
                    if exc:
                        # group the messages ...
                        failed_request['exception'] = exc

                        # assemble message from exception and generalize
                        message_str = mask_msg(build_exc_msg(exc))
                        # count exception messages
                        exception_counts[message_str] = exception_counts.get(
                            message_str, 0) + 1

                        # extract the code, if any ... ("or {}" tolerates
                        # a kwargs entry that is present but None)
                        code = (exc.get('kwargs') or {}).get('code')
                        if code:
                            codes[code] = codes.get(code, 0) + 1
                            failure_type = code

                # Recorded for every failed request, whether or not an
                # error event was found.
                failed_request['failure_type'] = failure_type

                # Full model rows this time: the detail list needs fields
                # that the values() query above did not select.
                raws = models.RawData.objects.filter(request_id=req)\
                                     .exclude(event='compute.instance.exists')\
                                     .order_by('when')

                failed_request['details'] = [
                    {
                        'host': raw.host,
                        'event': raw.event,
                        'old_state': raw.old_state,
                        'state': raw.state,
                        'old_task': raw.old_task,
                        'task': raw.task,
                    }
                    for raw in raws
                ]

                report.append(failed_request)

    # Assign values to store in DB
    values = {'json': json.dumps(report),
              'created': dt.dt_to_decimal(datetime.datetime.utcnow()),
              'period_start': start,
              'period_end': end,
              'version': 1,
              'name': 'Error detail report'}
    json_report = models.JsonReport(**values)
    json_report.save()