359 lines
13 KiB
Python
359 lines
13 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import sys
|
|
import time
|
|
import os
|
|
import prettytable
|
|
|
|
sys.path.append(os.environ.get('STACKTACH_INSTALL_DIR', '/stacktach'))
|
|
|
|
from stacktach import datetime_to_decimal as dt
|
|
from stacktach import image_type
|
|
from stacktach import models
|
|
|
|
|
|
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
|
|
store=False, region=None, too_long=1800):
|
|
if not yesterday:
|
|
yesterday = datetime.datetime.utcnow().date() -\
|
|
datetime.timedelta(days=1)
|
|
|
|
rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
|
|
day=yesterday.day, hour=start_hour)
|
|
rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
|
|
|
|
dstart = dt.dt_to_decimal(rstart)
|
|
dend = dt.dt_to_decimal(rend)
|
|
|
|
too_long_col = '> %d' % (too_long / 60)
|
|
|
|
cells = []
|
|
regions = []
|
|
if region:
|
|
region = region.upper()
|
|
deployments = models.Deployment.objects.all()
|
|
for deployment in deployments:
|
|
name = deployment.name.upper()
|
|
if not region or region in name:
|
|
regions.append(deployment.id)
|
|
cells.append(deployment.name)
|
|
|
|
if not len(regions):
|
|
print "No regions found for '%s'" % region
|
|
sys.exit(1)
|
|
|
|
# Get all the instances that have changed in the last N hours ...
|
|
updates = models.RawData.objects.filter(event='compute.instance.update',
|
|
when__gt=dstart, when__lte=dend,
|
|
deployment__in=regions)\
|
|
.values('instance').distinct()
|
|
|
|
expiry = 60 * 60 # 1 hour
|
|
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
|
|
|
|
requests = models.RawData.objects.filter(when__gt=dstart, when__lte=dend)\
|
|
.exclude(instance=None,
|
|
event='compute.instance.exists')\
|
|
.values('request_id', 'instance')\
|
|
.distinct()
|
|
inst_recs = {}
|
|
for request in requests:
|
|
uuid = request['instance']
|
|
request_id = request['request_id']
|
|
value = inst_recs.get(uuid, [])
|
|
value.append(request_id)
|
|
inst_recs[uuid] = value
|
|
|
|
failures = {} # { key : {failure_type: count} }
|
|
durations = {}
|
|
attempts = {}
|
|
|
|
for uuid_dict in updates:
|
|
uuid = uuid_dict['instance']
|
|
|
|
for req in inst_recs.get(uuid, []):
|
|
raws = models.RawData.objects.filter(request_id=req)\
|
|
.exclude(event='compute.instance.exists')\
|
|
.order_by('when')
|
|
|
|
start = None
|
|
err = None
|
|
failure_type = None
|
|
|
|
operation = "aux"
|
|
image_type_num = 0
|
|
|
|
for raw in raws:
|
|
if not start:
|
|
start = raw.when
|
|
|
|
if 'error' in raw.routing_key:
|
|
err = raw
|
|
failure_type = 'http'
|
|
|
|
if failure_type != 'state' and raw.old_state != 'error'\
|
|
and raw.state == 'error':
|
|
failure_type = 'state'
|
|
|
|
if raw.old_state == 'error' and \
|
|
(not raw.state in ['deleted', 'error']):
|
|
failure_type = None
|
|
|
|
for cmd in cmds:
|
|
if cmd in raw.event:
|
|
operation = cmd
|
|
break
|
|
|
|
if raw.image_type:
|
|
image_type_num |= raw.image_type
|
|
|
|
# Get image (base or snapshot) from image_type bit field
|
|
image = "?"
|
|
if image_type.isset(image_type_num, image_type.BASE_IMAGE):
|
|
image = "base"
|
|
if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
|
|
image = "snap"
|
|
|
|
#Get os_type from image_type bit field
|
|
os_type = "?"
|
|
if image_type.isset(image_type_num, image_type.LINUX_IMAGE):
|
|
os_type = "linux"
|
|
if image_type.isset(image_type_num, image_type.WINDOWS_IMAGE):
|
|
os_type = "windows"
|
|
|
|
if not start:
|
|
continue
|
|
|
|
end = raw.when
|
|
diff = end - start
|
|
|
|
if diff > too_long and failure_type is None:
|
|
failure_type = too_long_col
|
|
|
|
key = (operation, image, os_type)
|
|
|
|
# Track durations for all attempts, good and bad ...
|
|
_durations = durations.get(key, [])
|
|
_durations.append(diff)
|
|
durations[key] = _durations
|
|
|
|
attempts[key] = attempts.get(key, 0) + 1
|
|
|
|
if failure_type:
|
|
if err:
|
|
queue, body = json.loads(err.json)
|
|
payload = body['payload']
|
|
exc = payload.get('exception')
|
|
if exc:
|
|
code = int(exc.get('kwargs', {}).get('code', 0))
|
|
if code >= 400 and code < 500:
|
|
failure_type = "4xx"
|
|
if code >= 500 and code < 600:
|
|
failure_type = "5xx"
|
|
breakdown = failures.get(key, {})
|
|
breakdown[failure_type] = breakdown.get(failure_type, 0) + 1
|
|
failures[key] = breakdown
|
|
|
|
# Summarize the results ...
|
|
report = []
|
|
pct = (float(100 - percentile) / 2.0) / 100.0
|
|
details = {'percentile': percentile, 'pct': pct, 'hours': hours,
|
|
'start': float(dstart), 'end': float(dend), 'region': region,
|
|
'cells': cells}
|
|
report.append(details)
|
|
|
|
failure_types = ["4xx", "5xx", too_long_col, "state"]
|
|
cols = ["Operation", "Image", "OS Type", "Min", "Max", "Med", "%d%%" % percentile,
|
|
"Requests"]
|
|
for failure_type in failure_types:
|
|
cols.append("%s" % failure_type)
|
|
cols.append("%% %s" % failure_type)
|
|
report.append(cols)
|
|
|
|
total = 0
|
|
failure_totals = {}
|
|
for key, count in attempts.iteritems():
|
|
total += count
|
|
operation, image, os_type = key
|
|
|
|
breakdown = failures.get(key, {})
|
|
this_failure_pair = []
|
|
for failure_type in failure_types:
|
|
# Failure counts for this attempt.
|
|
# Sum for grand totals.
|
|
failure_count = breakdown.get(failure_type, 0)
|
|
failure_totals[failure_type] = \
|
|
failure_totals.get(failure_type, 0) + failure_count
|
|
|
|
# Failure percentage for this attempt.
|
|
percentage = float(failure_count) / float(count)
|
|
this_failure_pair.append((failure_count, percentage))
|
|
|
|
# N-th % of durations ...
|
|
_values = durations[key]
|
|
_values.sort()
|
|
_min = 99999999
|
|
_max = 0
|
|
_total = 0.0
|
|
for value in _values:
|
|
_min = min(_min, value)
|
|
_max = max(_max, value)
|
|
_total += float(value)
|
|
_num = len(_values)
|
|
_avg = float(_total) / float(_num)
|
|
half = _num / 2
|
|
_median = _values[half]
|
|
_percentile_index = int((float(percentile) / 100.0) * float(_num))
|
|
_percentile = _values[_percentile_index]
|
|
|
|
_fmin = dt.sec_to_str(_min)
|
|
_fmax = dt.sec_to_str(_max)
|
|
_favg = dt.sec_to_str(_avg)
|
|
_fmedian = dt.sec_to_str(_median)
|
|
_fpercentile = dt.sec_to_str(_percentile)
|
|
|
|
row = [operation, image, os_type, _fmin, _fmax, _fmedian, _fpercentile, count]
|
|
for failure_count, failure_percentage in this_failure_pair:
|
|
row.append(failure_count)
|
|
row.append(failure_percentage)
|
|
report.append(row)
|
|
|
|
details['total'] = total
|
|
failure_grand_total = 0
|
|
for failure_type in failure_types:
|
|
failure_total = failure_totals.get(failure_type, 0)
|
|
failure_grand_total += failure_total
|
|
details["%s failure count" % failure_type] = failure_total
|
|
failure_percentage = (float(failure_total)/float(total)) * 100.0
|
|
details["%s failure percentage" % failure_type] = failure_percentage
|
|
|
|
details['failure_grand_total'] = failure_grand_total
|
|
details['failure_grand_rate'] = (float(failure_grand_total)/float(total)) * 100.0
|
|
return (rstart, rend, report)
|
|
|
|
|
|
def valid_date(date):
|
|
try:
|
|
t = time.strptime(date, "%Y-%m-%d")
|
|
return datetime.datetime(*t[:6])
|
|
except Exception:
|
|
raise argparse.ArgumentTypeError(
|
|
"'%s' is not in YYYY-MM-DD format." % date)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = argparse.ArgumentParser('StackTach Nova Usage Summary Report')
|
|
parser.add_argument('--utcdate',
|
|
help='Report start date YYYY-MM-DD. Default yesterday midnight.',
|
|
type=valid_date, default=None)
|
|
parser.add_argument('--region',
|
|
help='Report Region. Default is all regions.', default=None)
|
|
parser.add_argument('--hours',
|
|
help='Report span in hours. Default: 24', default=24,
|
|
type=int)
|
|
parser.add_argument('--days_back',
|
|
help='Report start date. N days back from now. Default: 0', default=0,
|
|
type=int)
|
|
parser.add_argument('--hours_back',
|
|
help='Report start date. N hours back from now. Default: 0', default=0,
|
|
type=int)
|
|
parser.add_argument('--start_hour',
|
|
help='Starting hour 0-23. Default: 0', default=0,
|
|
type=int)
|
|
parser.add_argument('--percentile',
|
|
help='Percentile for timings. Default: 97', default=97,
|
|
type=int)
|
|
parser.add_argument('--too_long',
|
|
help='Seconds for an operation to fail. Default: 1800 (30min)', default=1800,
|
|
type=int)
|
|
parser.add_argument('--store',
|
|
help='Store report in database. Default: False',
|
|
default=False, action="store_true")
|
|
parser.add_argument('--silent',
|
|
help="Do not show summary report. Default: False",
|
|
default=False, action="store_true")
|
|
args = parser.parse_args()
|
|
|
|
yesterday = args.utcdate
|
|
days_back = args.days_back
|
|
hours_back = args.hours_back
|
|
percentile = args.percentile
|
|
hours = args.hours
|
|
start_hour = args.start_hour
|
|
store_report = args.store
|
|
region = args.region
|
|
too_long = args.too_long
|
|
|
|
if (not yesterday) and days_back > 0:
|
|
yesterday = datetime.datetime.utcnow().date() - \
|
|
datetime.timedelta(days=days_back)
|
|
if (not yesterday) and hours_back > 0:
|
|
yesterday = datetime.datetime.utcnow() - \
|
|
datetime.timedelta(hours=hours_back)
|
|
yesterday = yesterday.replace(minute=0, second=0, microsecond=0)
|
|
start_hour = yesterday.hour
|
|
|
|
start, end, raw_report = make_report(yesterday, start_hour, hours,
|
|
percentile, store_report, region,
|
|
too_long)
|
|
details = raw_report[0]
|
|
pct = details['pct']
|
|
|
|
region_name = "all"
|
|
if region:
|
|
region_name = region
|
|
|
|
if store_report:
|
|
values = {'json': json.dumps(raw_report),
|
|
'created': dt.dt_to_decimal(datetime.datetime.utcnow()),
|
|
'period_start': start,
|
|
'period_end': end,
|
|
'version': 4,
|
|
'name': 'summary for region: %s' % region_name}
|
|
report = models.JsonReport(**values)
|
|
report.save()
|
|
print "Report stored (id=%d)" % report.id
|
|
|
|
if args.silent:
|
|
sys.exit(1)
|
|
|
|
print "'%s' Report for %s to %s" % (region_name, start, end)
|
|
|
|
cols = raw_report[1]
|
|
|
|
# Print the results ...
|
|
p = prettytable.PrettyTable(cols)
|
|
for c in cols[2:]:
|
|
p.align[c] = 'r'
|
|
p.sortby = cols[0]
|
|
|
|
for row in raw_report[2:]:
|
|
frow = row[:]
|
|
for col in [9, 11, 13, 15]:
|
|
frow[col] = "%.1f%%" % (row[col] * 100.0)
|
|
p.add_row(frow)
|
|
print p
|
|
|
|
total = details['total']
|
|
failure_total = details['failure_grand_total']
|
|
failure_rate = details['failure_grand_rate']
|
|
print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \
|
|
(total, failure_total, failure_rate)
|