Merge pull request #14 from rackspace/kpi

KPI - Time from Request to Action Completion
This commit is contained in:
Sandy Walsh 2012-11-19 08:01:05 -08:00
commit bc12461afe
4 changed files with 84 additions and 54 deletions

View File

@ -57,6 +57,12 @@ class RawData(models.Model):
class Lifecycle(models.Model):
"""The Lifecycle table is the Master for a group of
Timing detail records. There is one Lifecycle row for
each instance seen in the event stream. The Timings
relate to the execution time for each .start/.end event
pair for this instance. These pairs are over the entire
lifespan of the instance, even across multiple api requests."""
instance = models.CharField(max_length=50, null=True,
blank=True, db_index=True)
last_state = models.CharField(max_length=50, null=True,
@ -67,12 +73,30 @@ class Lifecycle(models.Model):
class Timing(models.Model):
"""Each Timing record corresponds to a .start/.end event pair
for an instance. It tracks how long it took this operation
to execute."""
name = models.CharField(max_length=50, db_index=True)
lifecycle = models.ForeignKey(Lifecycle)
start_raw = models.ForeignKey(RawData, related_name='+', null=True)
end_raw = models.ForeignKey(RawData, related_name='+', null=True)
start_when = models.DecimalField(null=True, max_digits=20, decimal_places=6)
start_when = models.DecimalField(null=True, max_digits=20,
decimal_places=6)
end_when = models.DecimalField(null=True, max_digits=20, decimal_places=6)
diff = models.DecimalField(null=True, max_digits=20, decimal_places=6)
class RequestTracker(models.Model):
"""The RequestTracker table tracks the elapsed time of a user
request from the time it hits the API node to the time of the
final .end event (with the same Request ID)."""
request_id = models.CharField(max_length=50, db_index=True)
lifecycle = models.ForeignKey(Lifecycle)
last_timing = models.ForeignKey(Timing, null=True)
start = models.DecimalField(max_digits=20, decimal_places=6)
duration = models.DecimalField(max_digits=20, decimal_places=6)
# Not used ... but soon hopefully.
completed = models.BooleanField(default=False)

View File

@ -280,58 +280,19 @@ def do_watch(request, deployment_id):
def do_kpi(request):
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
events = models.RawData.objects.exclude(instance=None) \
.exclude(when__lt=yesterday) \
.filter(Q(event__endswith='.end') |
Q(event="compute.instance.update")) \
.only('event', 'host', 'request_id',
'instance', 'deployment') \
.order_by('when')
events = list(events)
instance_map = {} # { uuid: [(request_id, start_event, end_event), ...] }
for e in events:
if e.event == "compute.instance.update":
if "api" in e.host:
activities = instance_map.get(e.instance, [])
activities.append((e.request_id, e, None))
instance_map[e.instance] = activities
continue
if not e.event.endswith(".end"):
continue
activities = instance_map.get(e.instance)
if not activities:
# We missed the api start, skip it
continue
found = False
for index, a in enumerate(activities):
request_id, start_event, end_event = a
#if end_event is not None:
# continue
if request_id == e.request_id:
end_event = e
activities[index] = (request_id, start_event, e)
found = True
break
yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
yesterday = dt.dt_to_decimal(yesterday)
trackers = models.RequestTracker.objects.select_related() \
.exclude(last_timing=None) \
.exclude(start__lt=yesterday) \
.order_by('duration')
results = []
results.append(["Event", "Time", "UUID", "Deployment"])
for uuid, activities in instance_map.iteritems():
for request_id, start_event, end_event in activities:
if not end_event:
continue
event = end_event.event[:-len(".end")]
start = dt.dt_from_decimal(start_event.when)
end = dt.dt_from_decimal(end_event.when)
diff = end - start
results.append([event, sec_to_time(seconds_from_timedelta(
diff.days, diff.seconds, diff.microseconds)), uuid,
end_event.deployment.name])
for track in trackers:
end_event = track.last_timing.end_raw
event = end_event.event[:-len(".end")]
uuid = track.lifecycle.instance
results.append([event, sec_to_time(track.duration),
uuid, end_event.deployment.name])
return rsp(results)

View File

@ -74,11 +74,53 @@ HANDLERS = {'monitor.info':_monitor_message,
'':_compute_update_message}
def start_kpi_tracking(lifecycle, raw):
"""Start the clock for kpi timings when we see an instance.update
coming in from an api node."""
if raw.event != "compute.instance.update":
return
if "api" not in raw.host:
return
tracker = models.RequestTracker(request_id=raw.request_id,
start=raw.when,
lifecycle=lifecycle,
last_timing=None,
duration=str(0.0))
tracker.save()
def update_kpi(lifecycle, timing, raw):
"""Whenever we get a .end event, use the Timing object to
compute our current end-to-end duration.
Note: it may not be completely accurate if the operation is
still in-process, but we have no way of knowing it's still
in-process without mapping the original command with the
expected .end event (that's a whole other thing)
Until then, we'll take the lazy route and be aware of these
potential fence-post issues."""
trackers = models.RequestTracker.objects.\
filter(request_id=raw.request_id)
if len(trackers) == 0:
return
tracker = trackers[0]
tracker.last_timing = timing
tracker.duration = timing.end_when - tracker.start
tracker.save()
def aggregate(raw):
"""Roll up the raw event into a Lifecycle object
and a bunch of Timing objects.
We can use this for summarized timing reports.
Additionally, we can use this processing to give
us end-to-end user request timings for kpi reports.
"""
if not raw.instance:
@ -104,6 +146,8 @@ def aggregate(raw):
name = '.'.join(parts[:-1])
if not step in ['start', 'end']:
# Perhaps it's an operation initiated in the API?
start_kpi_tracking(lifecycle, raw)
return
# We are going to try to track every event pair that comes
@ -151,9 +195,10 @@ def aggregate(raw):
# We could have missed start so watch out ...
if timing.start_when:
timing.diff = timing.end_when - timing.start_when
# Looks like a valid pair ...
update_kpi(lifecycle, timing, raw)
timing.save()
def process_raw_data(deployment, args, json_args):
"""This is called directly by the worker to add the event to the db."""
db.reset_queries()

View File

@ -1,6 +1,6 @@
#!/bin/bash
WORKDIR=/srv/www/stacktach/django/stproject/
WORKDIR=/srv/www/stacktach/app
DAEMON=/usr/bin/python
ARGS=$WORKDIR/worker/start_workers.py
PIDFILE=/var/run/stacktach.pid