From a7604da065f025931f42bbcf9307bbabda3a37a9 Mon Sep 17 00:00:00 2001 From: Samuel Merritt Date: Wed, 10 Sep 2014 17:29:05 -0700 Subject: [PATCH] Move multipart MIME parser into utils Over on the EC branch, we need to be able to parse multipart MIME documents in the object server. The formpost middleware has a perfectly good MIME parser, but it seems sort of awful to import things from formpost in swift/obj/server.py, so I pulled it out into common.utils. Change-Id: Ieb4c05d02d8e4ef51a3a11d26c503786b1897f60 --- swift/common/exceptions.py | 4 + swift/common/middleware/formpost.py | 138 +--------------- swift/common/utils.py | 126 ++++++++++++++ test/unit/common/middleware/test_formpost.py | 161 ------------------ test/unit/common/test_utils.py | 163 ++++++++++++++++++- 5 files changed, 301 insertions(+), 291 deletions(-) diff --git a/swift/common/exceptions.py b/swift/common/exceptions.py index e7999bab97..e46df933b7 100644 --- a/swift/common/exceptions.py +++ b/swift/common/exceptions.py @@ -145,6 +145,10 @@ class ReplicationLockTimeout(LockTimeout): pass +class MimeInvalid(SwiftException): + pass + + class ClientException(Exception): def __init__(self, msg, http_scheme='', http_host='', http_port='', diff --git a/swift/common/middleware/formpost.py b/swift/common/middleware/formpost.py index dbc2b85d6a..1a79a3e280 100644 --- a/swift/common/middleware/formpost.py +++ b/swift/common/middleware/formpost.py @@ -112,14 +112,15 @@ the file are simply ignored). __all__ = ['FormPost', 'filter_factory', 'READ_CHUNK_SIZE', 'MAX_VALUE_LENGTH'] import hmac -import re import rfc822 from hashlib import sha1 from time import time from urllib import quote +from swift.common.exceptions import MimeInvalid from swift.common.middleware.tempurl import get_tempurl_keys_from_metadata -from swift.common.utils import streq_const_time, register_swift_info +from swift.common.utils import streq_const_time, register_swift_info, \ + parse_content_disposition, iter_multipart_mime_documents from swift.common.wsgi import make_pre_authed_env from swift.common.swob import HTTPUnauthorized from swift.proxy.controllers.base import get_account_info @@ -132,9 +133,6 @@ READ_CHUNK_SIZE = 4096 #: truncated. MAX_VALUE_LENGTH = 4096 -#: Regular expression to match form attributes. -ATTRIBUTES_RE = re.compile(r'(\w+)=(".*?"|[^";]+)(; ?|$)') - class FormInvalid(Exception): pass @@ -144,125 +142,6 @@ class FormUnauthorized(Exception): pass -def _parse_attrs(header): - """ - Given the value of a header like: - Content-Disposition: form-data; name="somefile"; filename="test.html" - - Return data like - ("form-data", {"name": "somefile", "filename": "test.html"}) - - :param header: Value of a header (the part after the ': '). - :returns: (value name, dict) of the attribute data parsed (see above). - """ - attributes = {} - attrs = '' - if '; ' in header: - header, attrs = header.split('; ', 1) - m = True - while m: - m = ATTRIBUTES_RE.match(attrs) - if m: - attrs = attrs[len(m.group(0)):] - attributes[m.group(1)] = m.group(2).strip('"') - return header, attributes - - -class _IterRequestsFileLikeObject(object): - - def __init__(self, wsgi_input, boundary, input_buffer): - self.no_more_data_for_this_file = False - self.no_more_files = False - self.wsgi_input = wsgi_input - self.boundary = boundary - self.input_buffer = input_buffer - - def read(self, length=None): - if not length: - length = READ_CHUNK_SIZE - if self.no_more_data_for_this_file: - return '' - - # read enough data to know whether we're going to run - # into a boundary in next [length] bytes - if len(self.input_buffer) < length + len(self.boundary) + 2: - to_read = length + len(self.boundary) + 2 - while to_read > 0: - chunk = self.wsgi_input.read(to_read) - to_read -= len(chunk) - self.input_buffer += chunk - if not chunk: - self.no_more_files = True - break - - boundary_pos = self.input_buffer.find(self.boundary) - - # boundary does not exist in the next (length) bytes - if boundary_pos == -1 or boundary_pos > length: - ret = self.input_buffer[:length] - self.input_buffer = self.input_buffer[length:] - # if it does, just return data up to the boundary - else: - ret, self.input_buffer = self.input_buffer.split(self.boundary, 1) - self.no_more_files = self.input_buffer.startswith('--') - self.no_more_data_for_this_file = True - self.input_buffer = self.input_buffer[2:] - return ret - - def readline(self): - if self.no_more_data_for_this_file: - return '' - boundary_pos = newline_pos = -1 - while newline_pos < 0 and boundary_pos < 0: - chunk = self.wsgi_input.read(READ_CHUNK_SIZE) - self.input_buffer += chunk - newline_pos = self.input_buffer.find('\r\n') - boundary_pos = self.input_buffer.find(self.boundary) - if not chunk: - self.no_more_files = True - break - # found a newline - if newline_pos >= 0 and \ - (boundary_pos < 0 or newline_pos < boundary_pos): - # Use self.read to ensure any logic there happens... - ret = '' - to_read = newline_pos + 2 - while to_read > 0: - chunk = self.read(to_read) - # Should never happen since we're reading from input_buffer, - # but just for completeness... - if not chunk: - break - to_read -= len(chunk) - ret += chunk - return ret - else: # no newlines, just return up to next boundary - return self.read(len(self.input_buffer)) - - -def _iter_requests(wsgi_input, boundary): - """ - Given a multi-part mime encoded input file object and boundary, - yield file-like objects for each part. - - :param wsgi_input: The file-like object to read from. - :param boundary: The mime boundary to separate new file-like - objects on. - :returns: A generator of file-like objects for each part. - """ - boundary = '--' + boundary - if wsgi_input.readline(len(boundary + '\r\n')).strip() != boundary: - raise FormInvalid('invalid starting boundary') - boundary = '\r\n' + boundary - input_buffer = '' - done = False - while not done: - it = _IterRequestsFileLikeObject(wsgi_input, boundary, input_buffer) - yield it - done = it.no_more_files - input_buffer = it.input_buffer - - class _CappedFileLikeObject(object): """ A file-like object wrapping another file-like object that raises @@ -328,7 +207,7 @@ class FormPost(object): if env['REQUEST_METHOD'] == 'POST': try: content_type, attrs = \ - _parse_attrs(env.get('CONTENT_TYPE') or '') + parse_content_disposition(env.get('CONTENT_TYPE') or '') if content_type == 'multipart/form-data' and \ 'boundary' in attrs: http_user_agent = "%s FormPost" % ( @@ -338,7 +217,7 @@ class FormPost(object): env, attrs['boundary']) start_response(status, headers) return [body] - except (FormInvalid, EOFError) as err: + except (FormInvalid, MimeInvalid, EOFError) as err: body = 'FormPost: %s' % err start_response( '400 Bad Request', @@ -365,10 +244,11 @@ class FormPost(object): attributes = {} subheaders = [] file_count = 0 - for fp in _iter_requests(env['wsgi.input'], boundary): + for fp in iter_multipart_mime_documents( + env['wsgi.input'], boundary, read_chunk_size=READ_CHUNK_SIZE): hdrs = rfc822.Message(fp, 0) - disp, attrs = \ - _parse_attrs(hdrs.getheader('Content-Disposition', '')) + disp, attrs = parse_content_disposition( + hdrs.getheader('Content-Disposition', '')) if disp == 'form-data' and attrs.get('filename'): file_count += 1 try: diff --git a/swift/common/utils.py b/swift/common/utils.py index 4dacae24fb..e37dc34c61 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -2986,3 +2986,129 @@ def get_expirer_container(x_delete_at, expirer_divisor, acc, cont, obj): shard_int = int(hash_path(acc, cont, obj), 16) % 100 return normalize_delete_at_timestamp( int(x_delete_at) / expirer_divisor * expirer_divisor - shard_int) + + +class _MultipartMimeFileLikeObject(object): + + def __init__(self, wsgi_input, boundary, input_buffer, read_chunk_size): + self.no_more_data_for_this_file = False + self.no_more_files = False + self.wsgi_input = wsgi_input + self.boundary = boundary + self.input_buffer = input_buffer + self.read_chunk_size = read_chunk_size + + def read(self, length=None): + if not length: + length = self.read_chunk_size + if self.no_more_data_for_this_file: + return '' + + # read enough data to know whether we're going to run + # into a boundary in next [length] bytes + if len(self.input_buffer) < length + len(self.boundary) + 2: + to_read = length + len(self.boundary) + 2 + while to_read > 0: + chunk = self.wsgi_input.read(to_read) + to_read -= len(chunk) + self.input_buffer += chunk + if not chunk: + self.no_more_files = True + break + + boundary_pos = self.input_buffer.find(self.boundary) + + # boundary does not exist in the next (length) bytes + if boundary_pos == -1 or boundary_pos > length: + ret = self.input_buffer[:length] + self.input_buffer = self.input_buffer[length:] + # if it does, just return data up to the boundary + else: + ret, self.input_buffer = self.input_buffer.split(self.boundary, 1) + self.no_more_files = self.input_buffer.startswith('--') + self.no_more_data_for_this_file = True + self.input_buffer = self.input_buffer[2:] + return ret + + def readline(self): + if self.no_more_data_for_this_file: + return '' + boundary_pos = newline_pos = -1 + while newline_pos < 0 and boundary_pos < 0: + chunk = self.wsgi_input.read(self.read_chunk_size) + self.input_buffer += chunk + newline_pos = self.input_buffer.find('\r\n') + boundary_pos = self.input_buffer.find(self.boundary) + if not chunk: + self.no_more_files = True + break + # found a newline + if newline_pos >= 0 and \ + (boundary_pos < 0 or newline_pos < boundary_pos): + # Use self.read to ensure any logic there happens... + ret = '' + to_read = newline_pos + 2 + while to_read > 0: + chunk = self.read(to_read) + # Should never happen since we're reading from input_buffer, + # but just for completeness... + if not chunk: + break + to_read -= len(chunk) + ret += chunk + return ret + else: # no newlines, just return up to next boundary + return self.read(len(self.input_buffer)) + + +def iter_multipart_mime_documents(wsgi_input, boundary, read_chunk_size=4096): + """ + Given a multi-part-mime-encoded input file object and boundary, + yield file-like objects for each part. + + :param wsgi_input: The file-like object to read from. + :param boundary: The mime boundary to separate new file-like + objects on. + :returns: A generator of file-like objects for each part. + :raises: MimeInvalid if the document is malformed + """ + boundary = '--' + boundary + if wsgi_input.readline(len(boundary + '\r\n')).strip() != boundary: + raise swift.common.exceptions.MimeInvalid('invalid starting boundary') + boundary = '\r\n' + boundary + input_buffer = '' + done = False + while not done: + it = _MultipartMimeFileLikeObject(wsgi_input, boundary, input_buffer, + read_chunk_size) + yield it + done = it.no_more_files + input_buffer = it.input_buffer + + +#: Regular expression to match form attributes. +ATTRIBUTES_RE = re.compile(r'(\w+)=(".*?"|[^";]+)(; ?|$)') + + +def parse_content_disposition(header): + """ + Given the value of a header like: + Content-Disposition: form-data; name="somefile"; filename="test.html" + + Return data like + ("form-data", {"name": "somefile", "filename": "test.html"}) + + :param header: Value of a header (the part after the ': '). + :returns: (value name, dict) of the attribute data parsed (see above). + """ + attributes = {} + attrs = '' + if '; ' in header: + header, attrs = header.split('; ', 1) + m = True + while m: + m = ATTRIBUTES_RE.match(attrs) + if m: + attrs = attrs[len(m.group(0)):] + attributes[m.group(1)] = m.group(2).strip('"') + return header, attributes diff --git a/test/unit/common/middleware/test_formpost.py b/test/unit/common/middleware/test_formpost.py index 338d1129fe..9ba713c2c2 100644 --- a/test/unit/common/middleware/test_formpost.py +++ b/test/unit/common/middleware/test_formpost.py @@ -68,167 +68,6 @@ class FakeApp(object): return ['Client Disconnect\n'] -class TestParseAttrs(unittest.TestCase): - - def test_basic_content_type(self): - name, attrs = formpost._parse_attrs('text/plain') - self.assertEquals(name, 'text/plain') - self.assertEquals(attrs, {}) - - def test_content_type_with_charset(self): - name, attrs = formpost._parse_attrs('text/plain; charset=UTF8') - self.assertEquals(name, 'text/plain') - self.assertEquals(attrs, {'charset': 'UTF8'}) - - def test_content_disposition(self): - name, attrs = formpost._parse_attrs( - 'form-data; name="somefile"; filename="test.html"') - self.assertEquals(name, 'form-data') - self.assertEquals(attrs, {'name': 'somefile', 'filename': 'test.html'}) - - -class TestIterRequests(unittest.TestCase): - - def test_bad_start(self): - it = formpost._iter_requests(StringIO('blah'), 'unique') - exc = None - try: - it.next() - except formpost.FormInvalid as err: - exc = err - self.assertEquals(str(exc), 'invalid starting boundary') - - def test_empty(self): - it = formpost._iter_requests(StringIO('--unique'), 'unique') - fp = it.next() - self.assertEquals(fp.read(), '') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_basic(self): - it = formpost._iter_requests( - StringIO('--unique\r\nabcdefg\r\n--unique--'), 'unique') - fp = it.next() - self.assertEquals(fp.read(), 'abcdefg') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_basic2(self): - it = formpost._iter_requests( - StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), - 'unique') - fp = it.next() - self.assertEquals(fp.read(), 'abcdefg') - fp = it.next() - self.assertEquals(fp.read(), 'hijkl') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_tiny_reads(self): - it = formpost._iter_requests( - StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), - 'unique') - fp = it.next() - self.assertEquals(fp.read(2), 'ab') - self.assertEquals(fp.read(2), 'cd') - self.assertEquals(fp.read(2), 'ef') - self.assertEquals(fp.read(2), 'g') - self.assertEquals(fp.read(2), '') - fp = it.next() - self.assertEquals(fp.read(), 'hijkl') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_big_reads(self): - it = formpost._iter_requests( - StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), - 'unique') - fp = it.next() - self.assertEquals(fp.read(65536), 'abcdefg') - self.assertEquals(fp.read(), '') - fp = it.next() - self.assertEquals(fp.read(), 'hijkl') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_broken_mid_stream(self): - # We go ahead and accept whatever is sent instead of rejecting the - # whole request, in case the partial form is still useful. - it = formpost._iter_requests( - StringIO('--unique\r\nabc'), 'unique') - fp = it.next() - self.assertEquals(fp.read(), 'abc') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_readline(self): - it = formpost._iter_requests( - StringIO('--unique\r\nab\r\ncd\ref\ng\r\n--unique\r\nhi\r\n\r\n' - 'jkl\r\n\r\n--unique--'), 'unique') - fp = it.next() - self.assertEquals(fp.readline(), 'ab\r\n') - self.assertEquals(fp.readline(), 'cd\ref\ng') - self.assertEquals(fp.readline(), '') - fp = it.next() - self.assertEquals(fp.readline(), 'hi\r\n') - self.assertEquals(fp.readline(), '\r\n') - self.assertEquals(fp.readline(), 'jkl\r\n') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - - def test_readline_with_tiny_chunks(self): - orig_read_chunk_size = formpost.READ_CHUNK_SIZE - try: - formpost.READ_CHUNK_SIZE = 2 - it = formpost._iter_requests( - StringIO('--unique\r\nab\r\ncd\ref\ng\r\n--unique\r\nhi\r\n' - '\r\njkl\r\n\r\n--unique--'), 'unique') - fp = it.next() - self.assertEquals(fp.readline(), 'ab\r\n') - self.assertEquals(fp.readline(), 'cd\ref\ng') - self.assertEquals(fp.readline(), '') - fp = it.next() - self.assertEquals(fp.readline(), 'hi\r\n') - self.assertEquals(fp.readline(), '\r\n') - self.assertEquals(fp.readline(), 'jkl\r\n') - exc = None - try: - it.next() - except StopIteration as err: - exc = err - self.assertTrue(exc is not None) - finally: - formpost.READ_CHUNK_SIZE = orig_read_chunk_size - - class TestCappedFileLikeObject(unittest.TestCase): def test_whole(self): diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index a6213c2872..6f91477d4f 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -54,7 +54,8 @@ from mock import MagicMock, patch from swift.common.exceptions import (Timeout, MessageTimeout, ConnectionTimeout, LockTimeout, - ReplicationLockTimeout) + ReplicationLockTimeout, + MimeInvalid) from swift.common import utils from swift.common.container_sync_realms import ContainerSyncRealms from swift.common.swob import Request, Response @@ -4168,5 +4169,165 @@ class TestLRUCache(unittest.TestCase): self.assertEqual(f.size(), 4) +class TestParseContentDisposition(unittest.TestCase): + + def test_basic_content_type(self): + name, attrs = utils.parse_content_disposition('text/plain') + self.assertEquals(name, 'text/plain') + self.assertEquals(attrs, {}) + + def test_content_type_with_charset(self): + name, attrs = utils.parse_content_disposition( + 'text/plain; charset=UTF8') + self.assertEquals(name, 'text/plain') + self.assertEquals(attrs, {'charset': 'UTF8'}) + + def test_content_disposition(self): + name, attrs = utils.parse_content_disposition( + 'form-data; name="somefile"; filename="test.html"') + self.assertEquals(name, 'form-data') + self.assertEquals(attrs, {'name': 'somefile', 'filename': 'test.html'}) + + +class TestIterMultipartMimeDocuments(unittest.TestCase): + + def test_bad_start(self): + it = utils.iter_multipart_mime_documents(StringIO('blah'), 'unique') + exc = None + try: + it.next() + except MimeInvalid as err: + exc = err + self.assertEquals(str(exc), 'invalid starting boundary') + + def test_empty(self): + it = utils.iter_multipart_mime_documents(StringIO('--unique'), + 'unique') + fp = it.next() + self.assertEquals(fp.read(), '') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_basic(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nabcdefg\r\n--unique--'), 'unique') + fp = it.next() + self.assertEquals(fp.read(), 'abcdefg') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_basic2(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), + 'unique') + fp = it.next() + self.assertEquals(fp.read(), 'abcdefg') + fp = it.next() + self.assertEquals(fp.read(), 'hijkl') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_tiny_reads(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), + 'unique') + fp = it.next() + self.assertEquals(fp.read(2), 'ab') + self.assertEquals(fp.read(2), 'cd') + self.assertEquals(fp.read(2), 'ef') + self.assertEquals(fp.read(2), 'g') + self.assertEquals(fp.read(2), '') + fp = it.next() + self.assertEquals(fp.read(), 'hijkl') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_big_reads(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nabcdefg\r\n--unique\r\nhijkl\r\n--unique--'), + 'unique') + fp = it.next() + self.assertEquals(fp.read(65536), 'abcdefg') + self.assertEquals(fp.read(), '') + fp = it.next() + self.assertEquals(fp.read(), 'hijkl') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_broken_mid_stream(self): + # We go ahead and accept whatever is sent instead of rejecting the + # whole request, in case the partial form is still useful. + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nabc'), 'unique') + fp = it.next() + self.assertEquals(fp.read(), 'abc') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_readline(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nab\r\ncd\ref\ng\r\n--unique\r\nhi\r\n\r\n' + 'jkl\r\n\r\n--unique--'), 'unique') + fp = it.next() + self.assertEquals(fp.readline(), 'ab\r\n') + self.assertEquals(fp.readline(), 'cd\ref\ng') + self.assertEquals(fp.readline(), '') + fp = it.next() + self.assertEquals(fp.readline(), 'hi\r\n') + self.assertEquals(fp.readline(), '\r\n') + self.assertEquals(fp.readline(), 'jkl\r\n') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + def test_readline_with_tiny_chunks(self): + it = utils.iter_multipart_mime_documents( + StringIO('--unique\r\nab\r\ncd\ref\ng\r\n--unique\r\nhi\r\n' + '\r\njkl\r\n\r\n--unique--'), + 'unique', + read_chunk_size=2) + fp = it.next() + self.assertEquals(fp.readline(), 'ab\r\n') + self.assertEquals(fp.readline(), 'cd\ref\ng') + self.assertEquals(fp.readline(), '') + fp = it.next() + self.assertEquals(fp.readline(), 'hi\r\n') + self.assertEquals(fp.readline(), '\r\n') + self.assertEquals(fp.readline(), 'jkl\r\n') + exc = None + try: + it.next() + except StopIteration as err: + exc = err + self.assertTrue(exc is not None) + + if __name__ == '__main__': unittest.main()