Merge pull request #14 from rackspace/kpi

KPI - Time from Request to Action Completion
2012-11-19 08:01:05 -08:00 · 2012-11-19 08:01:05 -08:00 · bc12461afe
commit bc12461afe
parent a56e6834db 4f36d540d0
4 changed files with 84 additions and 54 deletions
--- a/stacktach/models.py
+++ b/stacktach/models.py
@ -57,6 +57,12 @@ class RawData(models.Model):


 class Lifecycle(models.Model):
+    """The Lifecycle table is the Master for a group of
+    Timing detail records. There is one Lifecycle row for
+    each instance seen in the event stream. The Timings
+    relate to the execution time for each .start/.end event
+    pair for this instance. These pairs are over the entire
+    lifespan of the instance, even across multiple api requests."""
    instance = models.CharField(max_length=50, null=True,
                                blank=True, db_index=True)
    last_state = models.CharField(max_length=50, null=True,
@ -67,12 +73,30 @@ class Lifecycle(models.Model):


 class Timing(models.Model):
+    """Each Timing record corresponds to a .start/.end event pair
+    for an instance. It tracks how long it took this operation
+    to execute."""
    name = models.CharField(max_length=50, db_index=True)
    lifecycle = models.ForeignKey(Lifecycle)
    start_raw = models.ForeignKey(RawData, related_name='+', null=True)
    end_raw = models.ForeignKey(RawData, related_name='+', null=True)

-    start_when = models.DecimalField(null=True, max_digits=20, decimal_places=6)
+    start_when = models.DecimalField(null=True, max_digits=20,
+                                     decimal_places=6)
    end_when = models.DecimalField(null=True, max_digits=20, decimal_places=6)

    diff = models.DecimalField(null=True, max_digits=20, decimal_places=6)
+
+
+class RequestTracker(models.Model):
+    """The RequestTracker table tracks the elapsed time of a user
+    request from the time it hits the API node to the time of the
+    final .end event (with the same Request ID)."""
+    request_id = models.CharField(max_length=50, db_index=True)
+    lifecycle = models.ForeignKey(Lifecycle)
+    last_timing = models.ForeignKey(Timing, null=True)
+    start = models.DecimalField(max_digits=20, decimal_places=6)
+    duration = models.DecimalField(max_digits=20, decimal_places=6)
+
+    # Not used ... but soon hopefully.
+    completed = models.BooleanField(default=False)
--- a/stacktach/stacky_server.py
+++ b/stacktach/stacky_server.py
@ -280,58 +280,19 @@ def do_watch(request, deployment_id):


 def do_kpi(request):
-    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
-
-    events = models.RawData.objects.exclude(instance=None)  \
-                                   .exclude(when__lt=yesterday) \
-                                   .filter(Q(event__endswith='.end') |
-                                           Q(event="compute.instance.update")) \
-                                   .only('event', 'host', 'request_id',
-                                           'instance', 'deployment') \
-                                   .order_by('when')
-
-    events = list(events)
-    instance_map = {}  # { uuid: [(request_id, start_event, end_event), ...] }
-
-    for e in events:
-        if e.event == "compute.instance.update":
-            if "api" in e.host:
-                activities = instance_map.get(e.instance, [])
-                activities.append((e.request_id, e, None))
-                instance_map[e.instance] = activities
-            continue
-
-        if not e.event.endswith(".end"):
-            continue
-
-        activities = instance_map.get(e.instance)
-        if not activities:
-            # We missed the api start, skip it
-            continue
-
-        found = False
-        for index, a in enumerate(activities):
-            request_id, start_event, end_event = a
-            #if end_event is not None:
-            #    continue
-
-            if request_id == e.request_id:
-                end_event = e
-                activities[index] = (request_id, start_event, e)
-                found = True
-                break
+    yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
+    yesterday = dt.dt_to_decimal(yesterday)
+    trackers = models.RequestTracker.objects.select_related() \
+                                   .exclude(last_timing=None)  \
+                                   .exclude(start__lt=yesterday) \
+                                   .order_by('duration')

    results = []
    results.append(["Event", "Time", "UUID", "Deployment"])
-    for uuid, activities in instance_map.iteritems():
-        for request_id, start_event, end_event in activities:
-            if not end_event:
-                continue
-            event = end_event.event[:-len(".end")]
-            start = dt.dt_from_decimal(start_event.when)
-            end = dt.dt_from_decimal(end_event.when)
-            diff = end - start
-            results.append([event, sec_to_time(seconds_from_timedelta(
-                       diff.days, diff.seconds, diff.microseconds)), uuid,
-                       end_event.deployment.name])
+    for track in trackers:
+        end_event = track.last_timing.end_raw
+        event = end_event.event[:-len(".end")]
+        uuid = track.lifecycle.instance
+        results.append([event, sec_to_time(track.duration),
+                   uuid, end_event.deployment.name])
    return rsp(results)
--- a/stacktach/views.py
+++ b/stacktach/views.py
@ -74,11 +74,53 @@ HANDLERS = {'monitor.info':_monitor_message,
            '':_compute_update_message}


+def start_kpi_tracking(lifecycle, raw):
+    """Start the clock for kpi timings when we see an instance.update
+    coming in from an api node."""
+    if raw.event != "compute.instance.update":
+        return
+
+    if "api" not in raw.host:
+        return
+
+    tracker = models.RequestTracker(request_id=raw.request_id,
+                                    start=raw.when,
+                                    lifecycle=lifecycle,
+                                    last_timing=None,
+                                    duration=str(0.0))
+    tracker.save()
+
+
+def update_kpi(lifecycle, timing, raw):
+    """Whenever we get a .end event, use the Timing object to
+    compute our current end-to-end duration.
+
+    Note: it may not be completely accurate if the operation is
+    still in-process, but we have no way of knowing it's still
+    in-process without mapping the original command with the
+    expected .end event (that's a whole other thing)
+
+    Until then, we'll take the lazy route and be aware of these
+    potential fence-post issues."""
+    trackers = models.RequestTracker.objects.\
+                                        filter(request_id=raw.request_id)
+    if len(trackers) == 0:
+        return
+
+    tracker = trackers[0]
+    tracker.last_timing = timing
+    tracker.duration = timing.end_when - tracker.start
+    tracker.save()
+
+
 def aggregate(raw):
    """Roll up the raw event into a Lifecycle object
    and a bunch of Timing objects.

    We can use this for summarized timing reports.
+
+    Additionally, we can use this processing to give
+    us end-to-end user request timings for kpi reports.
    """

    if not raw.instance:
@ -104,6 +146,8 @@ def aggregate(raw):
    name = '.'.join(parts[:-1])

    if not step in ['start', 'end']:
+        # Perhaps it's an operation initiated in the API?
+        start_kpi_tracking(lifecycle, raw)
        return

    # We are going to try to track every event pair that comes
@ -151,9 +195,10 @@ def aggregate(raw):
        # We could have missed start so watch out ...
        if timing.start_when:
            timing.diff = timing.end_when - timing.start_when
+            # Looks like a valid pair ...
+            update_kpi(lifecycle, timing, raw)
    timing.save()

-
 def process_raw_data(deployment, args, json_args):
    """This is called directly by the worker to add the event to the db."""
    db.reset_queries()
--- a/worker/stacktach.sh
+++ b/worker/stacktach.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-WORKDIR=/srv/www/stacktach/django/stproject/
+WORKDIR=/srv/www/stacktach/app
 DAEMON=/usr/bin/python
 ARGS=$WORKDIR/worker/start_workers.py
 PIDFILE=/var/run/stacktach.pid