diff --git a/stacktach/models.py b/stacktach/models.py index 7a536c3..2a5310b 100644 --- a/stacktach/models.py +++ b/stacktach/models.py @@ -57,6 +57,12 @@ class RawData(models.Model): class Lifecycle(models.Model): + """The Lifecycle table is the Master for a group of + Timing detail records. There is one Lifecycle row for + each instance seen in the event stream. The Timings + relate to the execution time for each .start/.end event + pair for this instance. These pairs are over the entire + lifespan of the instance, even across multiple api requests.""" instance = models.CharField(max_length=50, null=True, blank=True, db_index=True) last_state = models.CharField(max_length=50, null=True, @@ -67,12 +73,30 @@ class Lifecycle(models.Model): class Timing(models.Model): + """Each Timing record corresponds to a .start/.end event pair + for an instance. It tracks how long it took this operation + to execute.""" name = models.CharField(max_length=50, db_index=True) lifecycle = models.ForeignKey(Lifecycle) start_raw = models.ForeignKey(RawData, related_name='+', null=True) end_raw = models.ForeignKey(RawData, related_name='+', null=True) - start_when = models.DecimalField(null=True, max_digits=20, decimal_places=6) + start_when = models.DecimalField(null=True, max_digits=20, + decimal_places=6) end_when = models.DecimalField(null=True, max_digits=20, decimal_places=6) diff = models.DecimalField(null=True, max_digits=20, decimal_places=6) + + +class RequestTracker(models.Model): + """The RequestTracker table tracks the elapsed time of a user + request from the time it hits the API node to the time of the + final .end event (with the same Request ID).""" + request_id = models.CharField(max_length=50, db_index=True) + lifecycle = models.ForeignKey(Lifecycle) + last_timing = models.ForeignKey(Timing, null=True) + start = models.DecimalField(max_digits=20, decimal_places=6) + duration = models.DecimalField(max_digits=20, decimal_places=6) + + # Not used ... but soon hopefully. + completed = models.BooleanField(default=False) diff --git a/stacktach/stacky_server.py b/stacktach/stacky_server.py index 037b75d..4daeaba 100644 --- a/stacktach/stacky_server.py +++ b/stacktach/stacky_server.py @@ -280,58 +280,19 @@ def do_watch(request, deployment_id): def do_kpi(request): - yesterday = datetime.datetime.now() - datetime.timedelta(days=1) - - events = models.RawData.objects.exclude(instance=None) \ - .exclude(when__lt=yesterday) \ - .filter(Q(event__endswith='.end') | - Q(event="compute.instance.update")) \ - .only('event', 'host', 'request_id', - 'instance', 'deployment') \ - .order_by('when') - - events = list(events) - instance_map = {} # { uuid: [(request_id, start_event, end_event), ...] } - - for e in events: - if e.event == "compute.instance.update": - if "api" in e.host: - activities = instance_map.get(e.instance, []) - activities.append((e.request_id, e, None)) - instance_map[e.instance] = activities - continue - - if not e.event.endswith(".end"): - continue - - activities = instance_map.get(e.instance) - if not activities: - # We missed the api start, skip it - continue - - found = False - for index, a in enumerate(activities): - request_id, start_event, end_event = a - #if end_event is not None: - # continue - - if request_id == e.request_id: - end_event = e - activities[index] = (request_id, start_event, e) - found = True - break + yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1) + yesterday = dt.dt_to_decimal(yesterday) + trackers = models.RequestTracker.objects.select_related() \ + .exclude(last_timing=None) \ + .exclude(start__lt=yesterday) \ + .order_by('duration') results = [] results.append(["Event", "Time", "UUID", "Deployment"]) - for uuid, activities in instance_map.iteritems(): - for request_id, start_event, end_event in activities: - if not end_event: - continue - event = end_event.event[:-len(".end")] - start = dt.dt_from_decimal(start_event.when) - end = dt.dt_from_decimal(end_event.when) - diff = end - start - results.append([event, sec_to_time(seconds_from_timedelta( - diff.days, diff.seconds, diff.microseconds)), uuid, - end_event.deployment.name]) + for track in trackers: + end_event = track.last_timing.end_raw + event = end_event.event[:-len(".end")] + uuid = track.lifecycle.instance + results.append([event, sec_to_time(track.duration), + uuid, end_event.deployment.name]) return rsp(results) diff --git a/stacktach/views.py b/stacktach/views.py index e4684e7..f71cecc 100644 --- a/stacktach/views.py +++ b/stacktach/views.py @@ -74,11 +74,53 @@ HANDLERS = {'monitor.info':_monitor_message, '':_compute_update_message} +def start_kpi_tracking(lifecycle, raw): + """Start the clock for kpi timings when we see an instance.update + coming in from an api node.""" + if raw.event != "compute.instance.update": + return + + if "api" not in raw.host: + return + + tracker = models.RequestTracker(request_id=raw.request_id, + start=raw.when, + lifecycle=lifecycle, + last_timing=None, + duration=str(0.0)) + tracker.save() + + +def update_kpi(lifecycle, timing, raw): + """Whenever we get a .end event, use the Timing object to + compute our current end-to-end duration. + + Note: it may not be completely accurate if the operation is + still in-process, but we have no way of knowing it's still + in-process without mapping the original command with the + expected .end event (that's a whole other thing) + + Until then, we'll take the lazy route and be aware of these + potential fence-post issues.""" + trackers = models.RequestTracker.objects.\ + filter(request_id=raw.request_id) + if len(trackers) == 0: + return + + tracker = trackers[0] + tracker.last_timing = timing + tracker.duration = timing.end_when - tracker.start + tracker.save() + + def aggregate(raw): """Roll up the raw event into a Lifecycle object and a bunch of Timing objects. We can use this for summarized timing reports. + + Additionally, we can use this processing to give + us end-to-end user request timings for kpi reports. """ if not raw.instance: @@ -104,6 +146,8 @@ def aggregate(raw): name = '.'.join(parts[:-1]) if not step in ['start', 'end']: + # Perhaps it's an operation initiated in the API? + start_kpi_tracking(lifecycle, raw) return # We are going to try to track every event pair that comes @@ -151,9 +195,10 @@ def aggregate(raw): # We could have missed start so watch out ... if timing.start_when: timing.diff = timing.end_when - timing.start_when + # Looks like a valid pair ... + update_kpi(lifecycle, timing, raw) timing.save() - def process_raw_data(deployment, args, json_args): """This is called directly by the worker to add the event to the db.""" db.reset_queries() diff --git a/worker/stacktach.sh b/worker/stacktach.sh index ccc87c8..16794fa 100755 --- a/worker/stacktach.sh +++ b/worker/stacktach.sh @@ -1,6 +1,6 @@ #!/bin/bash -WORKDIR=/srv/www/stacktach/django/stproject/ +WORKDIR=/srv/www/stacktach/app DAEMON=/usr/bin/python ARGS=$WORKDIR/worker/start_workers.py PIDFILE=/var/run/stacktach.pid