reject problematic object names

We had this problem:

>> : x = '\xed\xa0\xbc\xed\xbc\xb8'
>> : x == x.decode('utf-8').encode('utf-8')
<< : False

That str contains two UTF-8-encoded surrogate code points (U+D83C and U+DF38), which I guess Python is normalizing into one Unicode character, which it then encodes as a single UTF-8 sequence. Like this:

>> : u'\ud83c\udf38'
<< : u'\U0001f338'

I don't entirely understand that, but having a different byte representation after round-tripping through unicode causes problems with replication and listings.

This patch just rejects anything that doesn't re-encode to the same thing. If someone smarter wants to do something different, please speak up.

Change-Id: I9ac48ac2693e4121be6585c6e4f5d0079e9bb3e4
2014-10-27 16:29:07 +00:00 · 2014-10-27 16:29:07 +00:00 · 0080337897
commit 0080337897
parent 87a82a35f4
2 changed files with 6 additions and 1 deletions
--- a/swift/common/constraints.py
+++ b/swift/common/constraints.py
@ -306,7 +306,8 @@ def check_utf8(string):
        if isinstance(string, unicode):
            string.encode('utf-8')
        else:
-            string.decode('UTF-8')
+            if string.decode('UTF-8').encode('UTF-8') != string:
+                return False
        return '\x00' not in string
    # If string is unicode, decode() will raise UnicodeEncodeError
    # So, we should catch both UnicodeDecodeError & UnicodeEncodeError
--- a/test/unit/common/test_constraints.py
+++ b/test/unit/common/test_constraints.py
@ -416,6 +416,10 @@ class TestConstraints(unittest.TestCase):
                              valid_utf8_str]:
            self.assertTrue(constraints.check_utf8(true_argument))

+    def test_check_utf8_non_canonical(self):
+        self.assertFalse(constraints.check_utf8('\xed\xa0\xbc\xed\xbc\xb8'))
+        self.assertFalse(constraints.check_utf8('\xed\xa0\xbd\xed\xb9\x88'))
+
    def test_validate_bad_meta(self):
        req = Request.blank(
            '/v/a/c/o',