reject problematic object names

We had this problem:

    >> : x = '\xed\xa0\xbc\xed\xbc\xb8'
    >> : x == x.decode('utf-8').encode('utf-8')
    << : False

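That str contains two three-byte sequences that each encode a UTF-16 surrogate
(U+D83C and U+DF38) rather than one valid utf-8 character. Python decodes them
into a surrogate pair, treats the pair as one unicode character, and then
encodes that character as a single four-byte utf-8 sequence.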
Like this:

    >> : u'\ud83c\udf38'
    << : u'\U0001f338'

I don't entirely understand that, but having a different byte representation
after round-tripping through unicode causes problems with replication and
listings.

This patch just rejects any byte string that, after being decoded as utf-8,
doesn't re-encode to exactly the same bytes.
If someone smarter wants to do something different, please speak up.

Change-Id: I9ac48ac2693e4121be6585c6e4f5d0079e9bb3e4
This commit is contained in:
Michael Barton 2014-10-27 16:29:07 +00:00
parent 87a82a35f4
commit 0080337897
2 changed files with 6 additions and 1 deletions

View File

@ -306,7 +306,8 @@ def check_utf8(string):
if isinstance(string, unicode): if isinstance(string, unicode):
string.encode('utf-8') string.encode('utf-8')
else: else:
string.decode('UTF-8') if string.decode('UTF-8').encode('UTF-8') != string:
return False
return '\x00' not in string return '\x00' not in string
# If string is unicode, decode() will raise UnicodeEncodeError # If string is unicode, decode() will raise UnicodeEncodeError
# So, we should catch both UnicodeDecodeError & UnicodeEncodeError # So, we should catch both UnicodeDecodeError & UnicodeEncodeError

View File

@ -416,6 +416,10 @@ class TestConstraints(unittest.TestCase):
valid_utf8_str]: valid_utf8_str]:
self.assertTrue(constraints.check_utf8(true_argument)) self.assertTrue(constraints.check_utf8(true_argument))
def test_check_utf8_non_canonical(self):
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc\xed\xbc\xb8'))
self.assertFalse(constraints.check_utf8('\xed\xa0\xbd\xed\xb9\x88'))
def test_validate_bad_meta(self): def test_validate_bad_meta(self):
req = Request.blank( req = Request.blank(
'/v/a/c/o', '/v/a/c/o',