Fix encoding issue in ssync_sender.send_put()

EC object metadata can currently have a mixture of bytestrings and
unicode.  The ssync_sender.send_put() method raises an
UnicodeDecodeError when it attempts to concatenate the metadata
values, if any bytestring has non-ascii characters.

The root cause of this issue is that the object server uses unicode
for the keys of some object metadata items that are received in the
footer of an EC PUT request, whereas all other object metadata keys
and values are persisted as bytestrings.

This patch fixes the bug by changing diskfile write_metadata()
function to encode all unicode metadata keys and values as utf8
encoded bytes before writing to disk. To cope with existing objects
that have a mixture of unicode and bytestring metadata, the diskfile
read_metadata() function is also changed so that all returned unicode
metadata keys and values are utf8 encoded. This ensures that
ssync_sender.send_put() (and any other caller of diskfile
read_metadata) only reads bytestrings from object metadata.

Closes-Bug: #1678018
Co-Authored-By: Alistair Coles <alistairncoles@gmail.com>
Change-Id: Ic23c55754ee142f6f5388dcda592a3afc9845c39
This commit is contained in:
Romain LE DISEZ 2017-03-31 11:21:13 +02:00 committed by Alistair Coles
parent 83750cf79c
commit 091157fc7f
5 changed files with 129 additions and 18 deletions

View File

@ -114,6 +114,20 @@ def _get_filename(fd):
return fd
def _encode_metadata(metadata):
"""
UTF8 encode any unicode keys or values in given metadata dict.
:param metadata: a dict
"""
def encode_str(item):
if isinstance(item, six.text_type):
return item.encode('utf8')
return item
return dict(((encode_str(k), encode_str(v)) for k, v in metadata.items()))
def read_metadata(fd):
"""
Helper function to read the pickled metadata from an object file.
@ -140,7 +154,10 @@ def read_metadata(fd):
raise DiskFileNotExist()
# TODO: we might want to re-raise errors that don't denote a missing
# xattr here. Seems to be ENODATA on linux and ENOATTR on BSD/OSX.
return pickle.loads(metadata)
# strings are utf-8 encoded when written, but have not always been
# (see https://bugs.launchpad.net/swift/+bug/1678018) so encode them again
# when read
return _encode_metadata(pickle.loads(metadata))
def write_metadata(fd, metadata, xattr_size=65536):
@ -150,7 +167,7 @@ def write_metadata(fd, metadata, xattr_size=65536):
:param fd: file descriptor or filename to write the metadata
:param metadata: metadata to write
"""
metastr = pickle.dumps(metadata, PICKLE_PROTOCOL)
metastr = pickle.dumps(_encode_metadata(metadata), PICKLE_PROTOCOL)
key = 0
while metastr:
try:

View File

@ -81,8 +81,13 @@ class TestReconstructorRebuild(ECProbeTest):
# PUT object and POST some metadata
contents = Body()
headers = {'x-object-meta-foo': 'meta-foo'}
self.headers_post = {'x-object-meta-bar': 'meta-bar'}
headers = {
self._make_name('x-object-meta-').decode('utf8'):
self._make_name('meta-foo-').decode('utf8'),
}
self.headers_post = {
self._make_name('x-object-meta-').decode('utf8'):
self._make_name('meta-bar-').decode('utf8')}
self.etag = client.put_object(self.url, self.token,
self.container_name,

View File

@ -286,6 +286,58 @@ class TestDiskFileModuleMethods(unittest.TestCase):
exp_dir = '/srv/node/sdb5/objects-1/123'
self.assertEqual(part_dir, exp_dir)
def test_write_read_metadata(self):
path = os.path.join(self.testdir, str(uuid.uuid4()))
metadata = {'name': '/a/c/o',
'Content-Length': 99,
u'X-Object-Sysmeta-Ec-Frag-Index': 4,
u'X-Object-Meta-Strange': u'should be bytes',
b'X-Object-Meta-x\xff': b'not utf8 \xff',
u'X-Object-Meta-y\xe8': u'not ascii \xe8'}
expected = {b'name': b'/a/c/o',
b'Content-Length': 99,
b'X-Object-Sysmeta-Ec-Frag-Index': 4,
b'X-Object-Meta-Strange': b'should be bytes',
b'X-Object-Meta-x\xff': b'not utf8 \xff',
b'X-Object-Meta-y\xc3\xa8': b'not ascii \xc3\xa8'}
def check_metadata():
with open(path, 'rb') as fd:
actual = diskfile.read_metadata(fd)
self.assertEqual(expected, actual)
for k in actual.keys():
self.assertIsInstance(k, six.binary_type)
for k in (b'name',
b'X-Object-Meta-Strange',
b'X-Object-Meta-x\xff',
b'X-Object-Meta-y\xc3\xa8'):
self.assertIsInstance(actual[k], six.binary_type)
with open(path, 'wb') as fd:
diskfile.write_metadata(fd, metadata)
check_metadata()
# mock the read path to check the write path encoded persisted metadata
with mock.patch.object(diskfile, '_encode_metadata', lambda x: x):
check_metadata()
# simulate a legacy diskfile that might have persisted unicode metadata
with mock.patch.object(diskfile, '_encode_metadata', lambda x: x):
with open(path, 'wb') as fd:
diskfile.write_metadata(fd, metadata)
# sanity check, while still mocked, that we did persist unicode
with open(path, 'rb') as fd:
actual = diskfile.read_metadata(fd)
for k, v in actual.items():
if k == u'X-Object-Meta-Strange':
self.assertIsInstance(k, six.text_type)
self.assertIsInstance(v, six.text_type)
break
else:
self.fail('Did not find X-Object-Meta-Strange')
# check that read_metadata converts binary_type
check_metadata()
@patch_policies
class TestObjectAuditLocationGenerator(unittest.TestCase):

View File

@ -7255,6 +7255,19 @@ class TestObjectServer(unittest.TestCase):
obj_datafile = found_files['.data'][0]
self.assertEqual("%s#2#d.data" % put_timestamp.internal,
os.path.basename(obj_datafile))
with open(obj_datafile) as fd:
actual_meta = diskfile.read_metadata(fd)
expected_meta = {'Content-Length': '82',
'name': '/a/c/o',
'X-Object-Sysmeta-Ec-Frag-Index': '2',
'X-Timestamp': put_timestamp.normal,
'Content-Type': 'text/plain'}
for k, v in actual_meta.items():
self.assertIsInstance(k, six.binary_type)
self.assertIsInstance(v, six.binary_type)
self.assertIsNotNone(actual_meta.pop('ETag', None))
self.assertEqual(expected_meta, actual_meta)
# but no other files
self.assertFalse(found_files['.data'][1:])
found_files.pop('.data')

View File

@ -1452,62 +1452,86 @@ class TestSender(BaseTest):
exc = err
self.assertEqual(str(exc), '0.01 seconds: send_put chunk')
def test_send_put(self):
def _check_send_put(self, obj_name, meta_value):
ts_iter = make_timestamp_iter()
t1 = next(ts_iter)
body = 'test'
extra_metadata = {'Some-Other-Header': 'value'}
df = self._make_open_diskfile(body=body, timestamp=t1,
extra_metadata = {'Some-Other-Header': 'value',
u'Unicode-Meta-Name': meta_value}
df = self._make_open_diskfile(obj=obj_name, body=body,
timestamp=t1,
extra_metadata=extra_metadata)
expected = dict(df.get_metadata())
expected['body'] = body
expected['chunk_size'] = len(body)
expected['meta'] = meta_value
path = six.moves.urllib.parse.quote(expected['name'])
expected['path'] = path
expected['length'] = format(145 + len(path) + len(meta_value), 'x')
# .meta file metadata is not included in expected for data only PUT
t2 = next(ts_iter)
metadata = {'X-Timestamp': t2.internal, 'X-Object-Meta-Fruit': 'kiwi'}
df.write_metadata(metadata)
df.open()
self.sender.connection = FakeConnection()
self.sender.send_put('/a/c/o', df)
self.sender.send_put(path, df)
self.assertEqual(
''.join(self.sender.connection.sent),
'82\r\n'
'PUT /a/c/o\r\n'
'%(length)s\r\n'
'PUT %(path)s\r\n'
'Content-Length: %(Content-Length)s\r\n'
'ETag: %(ETag)s\r\n'
'Some-Other-Header: value\r\n'
'Unicode-Meta-Name: %(meta)s\r\n'
'X-Timestamp: %(X-Timestamp)s\r\n'
'\r\n'
'\r\n'
'%(chunk_size)s\r\n'
'%(body)s\r\n' % expected)
def test_send_post(self):
def test_send_put(self):
self._check_send_put('o', 'meta')
def test_send_put_unicode(self):
self._check_send_put(
'o_with_caract\xc3\xa8res_like_in_french', 'm\xc3\xa8ta')
def _check_send_post(self, obj_name, meta_value):
ts_iter = make_timestamp_iter()
# create .data file
extra_metadata = {'X-Object-Meta-Foo': 'old_value',
'X-Object-Sysmeta-Test': 'test_sysmeta',
'Content-Type': 'test_content_type'}
ts_0 = next(ts_iter)
df = self._make_open_diskfile(extra_metadata=extra_metadata,
df = self._make_open_diskfile(obj=obj_name,
extra_metadata=extra_metadata,
timestamp=ts_0)
# create .meta file
ts_1 = next(ts_iter)
newer_metadata = {'X-Object-Meta-Foo': 'new_value',
newer_metadata = {u'X-Object-Meta-Foo': meta_value,
'X-Timestamp': ts_1.internal}
df.write_metadata(newer_metadata)
path = six.moves.urllib.parse.quote(df.read_metadata()['name'])
length = format(61 + len(path) + len(meta_value), 'x')
self.sender.connection = FakeConnection()
with df.open():
self.sender.send_post('/a/c/o', df)
self.sender.send_post(path, df)
self.assertEqual(
''.join(self.sender.connection.sent),
'4c\r\n'
'POST /a/c/o\r\n'
'X-Object-Meta-Foo: new_value\r\n'
'%s\r\n'
'POST %s\r\n'
'X-Object-Meta-Foo: %s\r\n'
'X-Timestamp: %s\r\n'
'\r\n'
'\r\n' % ts_1.internal)
'\r\n' % (length, path, meta_value, ts_1.internal))
def test_send_post(self):
self._check_send_post('o', 'meta')
def test_send_post_unicode(self):
self._check_send_post(
'o_with_caract\xc3\xa8res_like_in_french', 'm\xc3\xa8ta')
def test_disconnect_timeout(self):
self.sender.connection = FakeConnection()