From f8419bf68a02afca7802a30bc3844d651b9e3469 Mon Sep 17 00:00:00 2001 From: Samuel Merritt Date: Fri, 20 Apr 2018 16:36:19 -0700 Subject: [PATCH] Improve path handling in proxy_logging In the access log, proxy_logging places the URL path, double-URL-encoded (e.g. " " -> %20 -> %2520). In the WSGI environment, env['PATH_INFO'] is the unquoted path sent by the client, e.g. "/v1/a/c/o" or "/v1/a/c/obj with spaces" or "/v1/a/c/u-\xf0\x9f\xa4\x94". On the request, we have the "path" attribute, which returns the quoted form of env["PATH_INFO"]. Note that, under Python 2, these are always str objects, not unicode. Swift used to log this: quote(quote(unquote( req.path.decode('utf-8', 'replace').encode('utf-8')), "/:")) There's a lot going on here. Let's look at the innermost expression: req.path.decode('utf-8', 'replace').encode('utf-8') This appears to be trying to replace invalid UTF-8 sequences with the Unicode replacement character. However, since we start from "req.path", which quotes its input, we don't have any non-ASCII bytes, so the decode/encode is a no-op. That is, if env["PATH_INFO"] had "\xff\xff" in it, req.path would have "%FF%FF", so there's no invalid byte sequences to escape. So, without changing the output, we can simplify the above code to: quote(quote(unquote(req.path), "/:")) The second call to quote() has a second argument of "/:", so neither slashes or colons are quoted. This was introduced in commit cd8af3f8f1 as part of a change to log client IPv6 addresses (from env["REMOTE_ADDR") without escaping the colons. However, cd8af3f8f1 also applied that to the logging of env["PATH_INFO"]. This seems unhelpful since IPv6 addresses in URL paths are not particularly common, so I've removed it. The change is backwards-compatible since a double-unquote of the logged value will still return the original PATH_INFO. With that in mind, let's simplify further. req.path quotes env["PATH_INFO"], so that cancels out one unquote() call, leaving us with this: quote(req.path_info) which produces basically the same result as before, but faster. Change-Id: I7e665eb55a9524aa49e2578f2084d8354f6c1e31 --- swift/common/middleware/proxy_logging.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/swift/common/middleware/proxy_logging.py b/swift/common/middleware/proxy_logging.py index 12c20774b4..069dc106f9 100644 --- a/swift/common/middleware/proxy_logging.py +++ b/swift/common/middleware/proxy_logging.py @@ -75,10 +75,10 @@ import sys import time import six -from six.moves.urllib.parse import quote, unquote +from six.moves.urllib.parse import quote from swift.common.swob import Request from swift.common.utils import (get_logger, get_remote_client, - get_valid_utf8_str, config_true_value, + config_true_value, InputProxy, list_from_csv, get_policy_index) from swift.common.storage_policy import POLICIES @@ -152,8 +152,7 @@ class ProxyLoggingMiddleware(object): :param resp_headers: dict of the response headers """ resp_headers = resp_headers or {} - req_path = get_valid_utf8_str(req.path) - the_request = quote(unquote(req_path), QUOTE_SAFE) + the_request = req.path if req.query_string: the_request = the_request + '?' + req.query_string logged_headers = None