reject problematic object names
We had this problem:

>> : x = '\xed\xa0\xbc\xed\xbc\xb8'
>> : x == x.decode('utf-8').encode('utf-8')
<< : False

That str contains two UTF-8 codepoints, which I guess Python is normalizing into one Unicode character, which it then encodes as one UTF-8 codepoint. Like this:

>> : u'\ud83c\udf38'
<< : u'\U0001f338'

I don't entirely understand that, but having a different byte representation after round-tripping through Unicode causes problems with replication and listings.

This patch just rejects anything that doesn't re-encode to the same thing. If someone smarter wants to do something different, please speak up.

Change-Id: I9ac48ac2693e4121be6585c6e4f5d0079e9bb3e4
This commit is contained in:
parent
87a82a35f4
commit
0080337897
@ -306,7 +306,8 @@ def check_utf8(string):
|
||||
if isinstance(string, unicode):
|
||||
string.encode('utf-8')
|
||||
else:
|
||||
string.decode('UTF-8')
|
||||
if string.decode('UTF-8').encode('UTF-8') != string:
|
||||
return False
|
||||
return '\x00' not in string
|
||||
# If string is unicode, decode() will raise UnicodeEncodeError
|
||||
# So, we should catch both UnicodeDecodeError & UnicodeEncodeError
|
||||
|
@ -416,6 +416,10 @@ class TestConstraints(unittest.TestCase):
|
||||
valid_utf8_str]:
|
||||
self.assertTrue(constraints.check_utf8(true_argument))
|
||||
|
||||
def test_check_utf8_non_canonical(self):
|
||||
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc\xed\xbc\xb8'))
|
||||
self.assertFalse(constraints.check_utf8('\xed\xa0\xbd\xed\xb9\x88'))
|
||||
|
||||
def test_validate_bad_meta(self):
|
||||
req = Request.blank(
|
||||
'/v/a/c/o',
|
||||
|
Loading…
x
Reference in New Issue
Block a user