Reject object names with Unicode surrogates
Technically, you can't encode surrogates into UTF-8 at all, but Python 2 lets you get away with it. Python 3 does not.

We already have a check for surrogate pairs (commit 0080337), but not one for lone surrogates. This commit forbids object names with lone surrogates in them.

The problem with surrogates is trivially reproducible:

swift@saio:~$ python2.7
Python 2.7.3 (default, Feb 27 2014, 19:58:35)
[GCC 4.6.3] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> b'\xed\xa0\xbc'.decode('utf-8')
u'\ud83c'
>>>
swift@saio:~$ python3.3
Python 3.3.5 (default, Aug 4 2014, 15:27:24)
[GCC 4.6.3] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> b'\xed\xa0\xbc'.decode('utf-8')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte
>>>

See also http://bugs.python.org/issue9133

Change-Id: I7c31022e8a028c3cdf2ed1586349509d96cfded9
This commit is contained in:
parent
772dc5d059
commit
331b14238e
@ -306,7 +306,12 @@ def check_utf8(string):
|
||||
if isinstance(string, unicode):
|
||||
string.encode('utf-8')
|
||||
else:
|
||||
if string.decode('UTF-8').encode('UTF-8') != string:
|
||||
decoded = string.decode('UTF-8')
|
||||
if decoded.encode('UTF-8') != string:
|
||||
return False
|
||||
# A UTF-8 string with surrogates in it is invalid.
|
||||
if any(0xD800 <= ord(codepoint) <= 0xDFFF
|
||||
for codepoint in decoded):
|
||||
return False
|
||||
return '\x00' not in string
|
||||
# If string is unicode, decode() will raise UnicodeEncodeError
|
||||
|
@ -420,6 +420,10 @@ class TestConstraints(unittest.TestCase):
|
||||
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc\xed\xbc\xb8'))
|
||||
self.assertFalse(constraints.check_utf8('\xed\xa0\xbd\xed\xb9\x88'))
|
||||
|
||||
def test_check_utf8_lone_surrogates(self):
|
||||
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc'))
|
||||
self.assertFalse(constraints.check_utf8('\xed\xb9\x88'))
|
||||
|
||||
def test_validate_bad_meta(self):
|
||||
req = Request.blank(
|
||||
'/v/a/c/o',
|
||||
|
Loading…
x
Reference in New Issue
Block a user