From bce8443c9e7f3e6b021b2b9d1d765cc5a8ffd021 Mon Sep 17 00:00:00 2001 From: Darrell Bishop Date: Sat, 19 Jan 2013 10:57:14 -0800 Subject: [PATCH] Adds first-byte latency timings for GET requests. This was an outstanding TODO for StatsD Swift metrics. A new timing metric is tracked for (only) GET requests for accounts, containers, and objects: proxy-server..GET..first-byte.timing Also updated StatsD documentation in the Admin Guide to clarify that timing metrics are sent in units of milliseconds. Change-Id: I5bb781c06cefcb5280f4fb1112a526c029fe0c20 --- doc/source/admin_guide.rst | 45 ++--- swift/common/middleware/proxy_logging.py | 83 +++++++--- .../common/middleware/test_proxy_logging.py | 155 +++++++++++++++--- 3 files changed, 211 insertions(+), 72 deletions(-) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index e76a957a10..30d4ce5328 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -462,13 +462,14 @@ servers when sending statistics to a central StatsD server. If you run a local StatsD server per node, you could configure a per-node metrics prefix there and leave `log_statsd_metric_prefix` blank. -Note that metrics reported to StatsD are counters or timing data (which -StatsD usually expands out to min, max, avg, count, and 90th percentile -per timing metric). Some important "gauge" metrics will still need to -be collected using another method. For example, the -`object-server.async_pendings` StatsD metric counts the generation of -async_pendings in real-time, but will not tell you the current number -of async_pending container updates on disk at any point in time. +Note that metrics reported to StatsD are counters or timing data (which are +sent in units of milliseconds). StatsD usually expands timing data out to min, +max, avg, count, and 90th percentile per timing metric, but the details of +this behavior will depend on the configuration of your StatsD server. 
Some +important "gauge" metrics may still need to be collected using another method. +For example, the `object-server.async_pendings` StatsD metric counts the generation +of async_pendings in real-time, but will not tell you the current number of +async_pending container updates on disk at any point in time. Note also that the set of metrics collected, their names, and their semantics are not locked down and will change over time. StatsD logging is currently in @@ -494,7 +495,7 @@ Metric Name Description `account-reaper.errors` Count of devices failing the mount check. `account-reaper.timing` Timing data for each reap_account() call. `account-reaper.return_codes.X` Count of HTTP return codes from various operations - (eg. object listing, container deletion, etc.). The + (e.g. object listing, container deletion, etc.). The value for X is the first digit of the return code (2 for 201, 4 for 404, etc.). `account-reaper.containers_failures` Count of failures to delete a container. @@ -840,17 +841,23 @@ the default setting yields the above behavior. .. _Swift Origin Server: https://github.com/dpgoetz/sos -============================================ ==================================================== -Metric Name Description --------------------------------------------- ---------------------------------------------------- -`proxy-server....timing` Timing data for requests. The portion is - the numeric HTTP status code for the request (eg. - "200" or "404") -`proxy-server....xfer` The count of the sum of bytes transferred in (from - clients) and out (to clients) for requests. The - , , and portions of the metric - are just like the timing metric. 
-============================================ ==================================================== +==================================================== ============================================ +Metric Name Description +---------------------------------------------------- -------------------------------------------- +`proxy-server....timing` Timing data for requests, start to finish. + The portion is the numeric HTTP + status code for the request (e.g. "200" or + "404"). +`proxy-server..GET..first-byte.timing` Timing data up to completion of sending the + response headers (only for GET requests). + and are as for the main + timing metric. +`proxy-server....xfer` This counter metric is the sum of bytes + transferred in (from clients) and out (to + clients) for requests. The , , + and portions of the metric are just + like the main timing metric. +==================================================== ============================================ Metrics for `tempauth` middleware (in the table, `` represents the actual configured reseller_prefix or "`NONE`" if the reseller_prefix is the diff --git a/swift/common/middleware/proxy_logging.py b/swift/common/middleware/proxy_logging.py index 2139b0ed99..97950a7c94 100644 --- a/swift/common/middleware/proxy_logging.py +++ b/swift/common/middleware/proxy_logging.py @@ -99,22 +99,28 @@ class ProxyLoggingMiddleware(object): log_route='proxy-access') self.access_logger.set_statsd_prefix('proxy-server') - def log_request(self, env, status_int, bytes_received, bytes_sent, - request_time, client_disconnect): + def method_from_req(self, req): + return req.environ.get('swift.orig_req_method', req.method) + + def req_already_logged(self, req): + return req.environ.get('swift.proxy_access_log_made') + + def mark_req_logged(self, req): + req.environ['swift.proxy_access_log_made'] = True + + def log_request(self, req, status_int, bytes_received, bytes_sent, + request_time): """ Log a request. 
- :param env: WSGI environment + :param req: swob.Request object for the request :param status_int: integer code for the response status :param bytes_received: bytes successfully read from the request body :param bytes_sent: bytes yielded to the WSGI server :param request_time: time taken to satisfy the request, in seconds """ - if env.get('swift.proxy_access_log_made'): + if self.req_already_logged(req): return - req = Request(env) - if client_disconnect: # log disconnected clients as '499' status code - status_int = 499 req_path = get_valid_utf8_str(req.path) the_request = quote(unquote(req_path)) if req.query_string: @@ -123,7 +129,7 @@ class ProxyLoggingMiddleware(object): if self.log_hdrs: logged_headers = '\n'.join('%s: %s' % (k, v) for k, v in req.headers.items()) - method = req.environ.get('swift.orig_req_method', req.method) + method = self.method_from_req(req) self.access_logger.info(' '.join( quote(str(x) if x else '-') for x in ( @@ -145,8 +151,18 @@ class ProxyLoggingMiddleware(object): '%.4f' % request_time, req.environ.get('swift.source'), ))) - env['swift.proxy_access_log_made'] = True + self.mark_req_logged(req) # Log timing and bytes-transfered data to StatsD + metric_name = self.statsd_metric_name(req, status_int, method) + # Only log data for valid controllers (or SOS) to keep the metric count + # down (egregious errors will get logged by the proxy server itself). 
+ if metric_name: + self.access_logger.timing(metric_name + '.timing', + request_time * 1000) + self.access_logger.update_stats(metric_name + '.xfer', + bytes_received + bytes_sent) + + def statsd_metric_name(self, req, status_int, method): if req.path.startswith('/v1/'): try: stat_type = [None, 'account', 'container', @@ -154,17 +170,12 @@ class ProxyLoggingMiddleware(object): except IndexError: stat_type = 'object' else: - stat_type = env.get('swift.source') - # Only log data for valid controllers (or SOS) to keep the metric count - # down (egregious errors will get logged by the proxy server itself). - if stat_type: - stat_method = method if method in self.valid_methods \ - else 'BAD_METHOD' - metric_name = '.'.join((stat_type, stat_method, str(status_int))) - self.access_logger.timing(metric_name + '.timing', - request_time * 1000) - self.access_logger.update_stats(metric_name + '.xfer', - bytes_received + bytes_sent) + stat_type = req.environ.get('swift.source') + if stat_type is None: + return None + stat_method = method if method in self.valid_methods \ + else 'BAD_METHOD' + return '.'.join((stat_type, stat_method, str(status_int))) def __call__(self, env, start_response): start_response_args = [None] @@ -175,6 +186,14 @@ class ProxyLoggingMiddleware(object): def my_start_response(status, headers, exc_info=None): start_response_args[0] = (status, list(headers), exc_info) + def status_int_for_logging(client_disconnect=False, start_status=None): + # log disconnected clients as '499' status code + if client_disconnect or input_proxy.client_disconnect: + return 499 + elif start_status is None: + return int(start_response_args[0][0].split(' ', 1)[0]) + return start_status + def iter_response(iterable): iterator = iter(iterable) try: @@ -193,6 +212,17 @@ class ProxyLoggingMiddleware(object): start_response_args[0][1].append( ('content-length', str(sum(len(i) for i in iterable)))) start_response(*start_response_args[0]) + req = Request(env) + + # Log timing 
information for time-to-first-byte (GET requests only) + method = self.method_from_req(req) + if method == 'GET' and not self.req_already_logged(req): + status_int = status_int_for_logging() + metric_name = self.statsd_metric_name(req, status_int, method) + if metric_name: + self.access_logger.timing_since( + metric_name + '.first-byte.timing', start_time) + bytes_sent = 0 client_disconnect = False try: @@ -204,18 +234,19 @@ class ProxyLoggingMiddleware(object): client_disconnect = True raise finally: - status_int = int(start_response_args[0][0].split(' ', 1)[0]) + status_int = status_int_for_logging(client_disconnect) self.log_request( - env, status_int, input_proxy.bytes_received, bytes_sent, - time.time() - start_time, - client_disconnect or input_proxy.client_disconnect) + req, status_int, input_proxy.bytes_received, bytes_sent, + time.time() - start_time) try: iterable = self.app(env, my_start_response) except Exception: + req = Request(env) + status_int = status_int_for_logging(start_status=500) self.log_request( - env, 500, input_proxy.bytes_received, 0, - time.time() - start_time, input_proxy.client_disconnect) + req, status_int, input_proxy.bytes_received, 0, + time.time() - start_time) raise else: return iter_response(iterable) diff --git a/test/unit/common/middleware/test_proxy_logging.py b/test/unit/common/middleware/test_proxy_logging.py index 86841df5b8..b9484ba008 100644 --- a/test/unit/common/middleware/test_proxy_logging.py +++ b/test/unit/common/middleware/test_proxy_logging.py @@ -14,6 +14,7 @@ # limitations under the License. 
import unittest +import time from urllib import quote, unquote import cStringIO as StringIO from logging.handlers import SysLogHandler @@ -25,12 +26,14 @@ from swift.common.swob import Request class FakeApp(object): - def __init__(self, body=['FAKE APP']): + def __init__(self, body=['FAKE APP'], response_str='200 OK'): self.body = body + self.response_str = response_str def __call__(self, env, start_response): - start_response('200 OK', [('Content-Type', 'text/plain'), - ('Content-Length', str(sum(map(len, self.body))))]) + start_response(self.response_str, + [('Content-Type', 'text/plain'), + ('Content-Length', str(sum(map(len, self.body))))]) while env['wsgi.input'].read(5): pass return self.body @@ -72,19 +75,48 @@ def start_response(*args): class TestProxyLogging(unittest.TestCase): - def _log_parts(self, app): + def _log_parts(self, app, should_be_empty=False): info_calls = app.access_logger.log_dict['info'] - self.assertEquals(1, len(info_calls)) - return info_calls[0][0][0].split(' ') + if should_be_empty: + self.assertEquals([], info_calls) + else: + self.assertEquals(1, len(info_calls)) + return info_calls[0][0][0].split(' ') def assertTiming(self, exp_metric, app, exp_timing=None): timing_calls = app.access_logger.log_dict['timing'] - self.assertEquals(1, len(timing_calls)) - self.assertEquals({}, timing_calls[0][1]) - self.assertEquals(2, len(timing_calls[0][0])) - self.assertEquals(exp_metric, timing_calls[0][0][0]) - if exp_timing is not None: - self.assertEquals(exp_timing, timing_calls[0][0][1]) + found = False + for timing_call in timing_calls: + self.assertEquals({}, timing_call[1]) + self.assertEquals(2, len(timing_call[0])) + if timing_call[0][0] == exp_metric: + found = True + if exp_timing is not None: + self.assertAlmostEqual(exp_timing, timing_call[0][1], + places=4) + if not found: + self.assertTrue(False, 'assertTiming: %s not found in %r' % ( + exp_metric, timing_calls)) + + def assertTimingSince(self, exp_metric, app, exp_start=None): + 
timing_calls = app.access_logger.log_dict['timing_since'] + found = False + for timing_call in timing_calls: + self.assertEquals({}, timing_call[1]) + self.assertEquals(2, len(timing_call[0])) + if timing_call[0][0] == exp_metric: + found = True + if exp_start is not None: + self.assertAlmostEqual(exp_start, timing_call[0][1], + places=4) + if not found: + self.assertTrue(False, 'assertTimingSince: %s not found in %r' % ( + exp_metric, timing_calls)) + + def assertNotTiming(self, not_exp_metric, app): + timing_calls = app.access_logger.log_dict['timing'] + for timing_call in timing_calls: + self.assertNotEqual(not_exp_metric, timing_call[0][0]) def assertUpdateStats(self, exp_metric, exp_bytes, app): update_stats_calls = app.access_logger.log_dict['update_stats'] @@ -99,9 +131,8 @@ class TestProxyLogging(unittest.TestCase): req = Request.blank(url, environ={'REQUEST_METHOD': 'GET'}) resp = app(req.environ, start_response) resp_body = ''.join(resp) - self.assertEquals(0, len(app.access_logger.log_dict['timing'])) - self.assertEquals(0, - len(app.access_logger.log_dict['update_stats'])) + self.assertEqual([], app.access_logger.log_dict['timing']) + self.assertEqual([], app.access_logger.log_dict['update_stats']) def test_log_request_stat_type_bad(self): for bad_path in ['', '/', '/bad', '/baddy/mc_badderson', '/v1', @@ -109,11 +140,21 @@ class TestProxyLogging(unittest.TestCase): app = proxy_logging.ProxyLoggingMiddleware(FakeApp(), {}) app.access_logger = FakeLogger() req = Request.blank(bad_path, environ={'REQUEST_METHOD': 'GET'}) - app.log_request(req.environ, 123, 7, 13, 2.71828182846, False) + app.log_request(req, 123, 7, 13, 2.71828182846) self.assertEqual([], app.access_logger.log_dict['timing']) self.assertEqual([], app.access_logger.log_dict['update_stats']) def test_log_request_stat_type_good(self): + """ + log_request() should send timing and byte-count counters for GET + requests. 
Also, __call__()'s iter_response() function should + statsd-log time to first byte (calling the passed-in start_response + function), but only for GET requests. + """ + stub_times = [] + def stub_time(): + return stub_times.pop(0) + path_types = { '/v1/a': 'account', '/v1/a/': 'account', @@ -126,13 +167,62 @@ class TestProxyLogging(unittest.TestCase): '/v1/a/c/o/p/p2': 'object', } for path, exp_type in path_types.iteritems(): - app = proxy_logging.ProxyLoggingMiddleware(FakeApp(), {}) - app.access_logger = FakeLogger() - req = Request.blank(path, environ={'REQUEST_METHOD': 'GET'}) - app.log_request(req.environ, 321, 7, 13, 2.71828182846, False) - self.assertTiming('%s.GET.321.timing' % exp_type, app, - exp_timing=2.71828182846 * 1000) - self.assertUpdateStats('%s.GET.321.xfer' % exp_type, 7 + 13, app) + orig_time = time.time + try: + time.time = stub_time + # GET + app = proxy_logging.ProxyLoggingMiddleware( + FakeApp(body='7654321', response_str='321 Fubar'), {}) + app.access_logger = FakeLogger() + req = Request.blank(path, environ={ + 'REQUEST_METHOD': 'GET', + 'wsgi.input': StringIO.StringIO('4321')}) + stub_times = [18.0, 20.71828182846] + iter_response = app(req.environ, lambda *_: None) + self.assertEqual('7654321', ''.join(iter_response)) + self.assertTiming('%s.GET.321.timing' % exp_type, app, + exp_timing=2.71828182846 * 1000) + self.assertTimingSince( + '%s.GET.321.first-byte.timing' % exp_type, app, + exp_start=18.0) + self.assertUpdateStats('%s.GET.321.xfer' % exp_type, + 4 + 7, app) + + # GET with swift.proxy_access_log_made already set + app = proxy_logging.ProxyLoggingMiddleware( + FakeApp(body='7654321', response_str='321 Fubar'), {}) + app.access_logger = FakeLogger() + req = Request.blank(path, environ={ + 'REQUEST_METHOD': 'GET', + 'swift.proxy_access_log_made': True, + 'wsgi.input': StringIO.StringIO('4321')}) + stub_times = [18.0, 20.71828182846] + iter_response = app(req.environ, lambda *_: None) + self.assertEqual('7654321', 
''.join(iter_response)) + self.assertEqual([], app.access_logger.log_dict['timing']) + self.assertEqual([], app.access_logger.log_dict['timing_since']) + self.assertEqual([], app.access_logger.log_dict['update_stats']) + + # PUT (no first-byte timing!) + app = proxy_logging.ProxyLoggingMiddleware( + FakeApp(body='87654321', response_str='314 PiTown'), {}) + app.access_logger = FakeLogger() + req = Request.blank(path, environ={ + 'REQUEST_METHOD': 'PUT', + 'wsgi.input': StringIO.StringIO('654321')}) + # (it's not a GET, so time() doesn't have a 2nd call) + stub_times = [58.2, 58.2 + 7.3321] + iter_response = app(req.environ, lambda *_: None) + self.assertEqual('87654321', ''.join(iter_response)) + self.assertTiming('%s.PUT.314.timing' % exp_type, app, + exp_timing=7.3321 * 1000) + self.assertNotTiming( + '%s.GET.314.first-byte.timing' % exp_type, app) + self.assertNotTiming( + '%s.PUT.314.first-byte.timing' % exp_type, app) + self.assertUpdateStats('%s.PUT.314.xfer' % exp_type, 6 + 8, app) + finally: + time.time = orig_time def test_log_request_stat_method_filtering_default(self): method_map = { @@ -152,7 +242,7 @@ class TestProxyLogging(unittest.TestCase): app = proxy_logging.ProxyLoggingMiddleware(FakeApp(), {}) app.access_logger = FakeLogger() req = Request.blank('/v1/a/', environ={'REQUEST_METHOD': method}) - app.log_request(req.environ, 299, 11, 3, 1.17, False) + app.log_request(req, 299, 11, 3, 1.17) self.assertTiming('account.%s.299.timing' % exp_method, app, exp_timing=1.17 * 1000) self.assertUpdateStats('account.%s.299.xfer' % exp_method, @@ -177,7 +267,7 @@ class TestProxyLogging(unittest.TestCase): }) app.access_logger = FakeLogger() req = Request.blank('/v1/a/c', environ={'REQUEST_METHOD': method}) - app.log_request(req.environ, 911, 4, 43, 1.01, False) + app.log_request(req, 911, 4, 43, 1.01) self.assertTiming('container.%s.911.timing' % exp_method, app, exp_timing=1.01 * 1000) self.assertUpdateStats('container.%s.911.xfer' % exp_method, @@ -197,6 
+287,17 @@ class TestProxyLogging(unittest.TestCase): self.assertEquals(resp_body, 'FAKE APP') self.assertEquals(log_parts[11], str(len(resp_body))) + def test_basic_req_second_time(self): + app = proxy_logging.ProxyLoggingMiddleware(FakeApp(), {}) + app.access_logger = FakeLogger() + req = Request.blank('/', environ={ + 'swift.proxy_access_log_made': True, + 'REQUEST_METHOD': 'GET'}) + resp = app(req.environ, start_response) + resp_body = ''.join(resp) + log_parts = self._log_parts(app, should_be_empty=True) + self.assertEquals(resp_body, 'FAKE APP') + def test_multi_segment_resp(self): app = proxy_logging.ProxyLoggingMiddleware(FakeApp( ['some', 'chunks', 'of data']), {}) @@ -334,7 +435,7 @@ class TestProxyLogging(unittest.TestCase): try: resp = app(req.environ, start_response) body = ''.join(resp) - except Exception: + except IOError: pass log_parts = self._log_parts(app) self.assertEquals(log_parts[6], '499') @@ -349,7 +450,7 @@ class TestProxyLogging(unittest.TestCase): try: resp = app(req.environ, start_response) body = ''.join(resp) - except Exception: + except IOError: pass log_parts = self._log_parts(app) self.assertEquals(log_parts[6], '499')