From 7f2406874ec187fd65dc7aef53d184236aa7edc8 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 11 Mar 2015 16:21:49 +0100 Subject: [PATCH] Add exception_to_unicode() function There is no simple way that works on both Python 2 and Python 3 to get the message of an exception as a Unicode string. This new function uses a heuristic to get the encoding of the exception message. It tries UTF-8 (which is a superset of ASCII), the locale encoding, or falls back to ISO-8859-1 (which never fails). This function is required to log exceptions using the logging module when the exception message contains non-ASCII characters. - logging.log() only works with non-ASCII characters on Python 2 and Python 3 if the message is formatted as Unicode. For example, logging.log(b'error: %s', b'\xe9') fails in most cases. - unicode % bytes doesn't work with non-ASCII characters - logging.log(u'error: %s', exc) doesn't work if the exception message contains a non-ASCII character Only logging.log(u'error: %s', exception_to_unicode(exc)) works in all cases: ASCII or non-ASCII exception message, Python 2 and Python 3. Co-Authored-By: Joshua Harlow Change-Id: I241b7c81c7ae3d26f81790e9180678dc9af81e22 --- oslo_utils/encodeutils.py | 56 +++++++++++ oslo_utils/tests/tests_encodeutils.py | 128 ++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/oslo_utils/encodeutils.py b/oslo_utils/encodeutils.py index 14bd7175..7d8af13b 100644 --- a/oslo_utils/encodeutils.py +++ b/oslo_utils/encodeutils.py @@ -93,3 +93,59 @@ def safe_encode(text, incoming=None, return text.encode(encoding, errors) else: return text + + +def exception_to_unicode(exc): + """Get the message of an exception as a Unicode string. + + On Python 3, the exception message is always a Unicode string. On + Python 2, the exception message is a bytes string *most* of the time.
+ + If the exception message is a bytes string, try to decode it from UTF-8 + (superset of ASCII), from the locale encoding, or fall back to decoding it + from ISO-8859-1 (which never fails). + """ + msg = None + if six.PY2: + # Don't call directly unicode(exc), because it fails with + # UnicodeDecodeError on Python 2 if exc.__unicode__() returns a bytes + # string not decodable from the default encoding (ASCII) + try: + msg = exc.__unicode__() + except UnicodeError: + pass + + if msg is None: + # Don't call directly str(exc), because it fails with + # UnicodeEncodeError on Python 2 if exc.__str__() returns a Unicode + # string not encodable to the default encoding (ASCII) + msg = exc.__str__() + + if isinstance(msg, six.text_type): + # This should be the default path on Python 3 and an *optional* path + # on Python 2 (if for some reason the exception message was already + # in unicode instead of the more typical bytes string); so avoid + # further converting to unicode in both of these cases. + return msg + + try: + # Try to decode from UTF-8 (superset of ASCII). The decoder fails + # if the string is not a valid UTF-8 string: the UTF-8 codec includes + # a validation algorithm to ensure the consistency of the codec. + return msg.decode('utf-8') + except UnicodeDecodeError: + pass + + # Try the locale encoding, most error messages are encoded to this encoding + # (ex: os.strerror(errno)) + encoding = sys.getfilesystemencoding() + try: + return msg.decode(encoding) + except UnicodeDecodeError: + pass + + # The encoding is not ASCII, not UTF-8, nor the locale encoding. Fall back + # to the ISO-8859-1 encoding which never fails. It will produce mojibake + # if the message is not encoded to ISO-8859-1, but we don't want a super + # complex heuristic to get the encoding of an exception message.
+ return msg.decode('latin1') diff --git a/oslo_utils/tests/tests_encodeutils.py b/oslo_utils/tests/tests_encodeutils.py index 6af4bfb5..d0f696ed 100644 --- a/oslo_utils/tests/tests_encodeutils.py +++ b/oslo_utils/tests/tests_encodeutils.py @@ -18,6 +18,7 @@ import mock from oslotest import base as test_base import six +import testtools from oslo_utils import encodeutils @@ -103,3 +104,130 @@ class EncodeUtilsTest(test_base.BaseTestCase): text=text, incoming='utf-8', encoding='iso-8859-1') self.assertNotEqual(text, result) self.assertNotEqual(six.b("foo\xf1bar"), result) + + +class ExceptionToUnicodeTest(test_base.BaseTestCase): + + def test_str_exception(self): + # The regular Exception class cannot be used directly: + # Exception(u'\xe9').__str__() raises a UnicodeEncodeError + # on Python 2 + class StrException(Exception): + def __init__(self, value): + Exception.__init__(self) + self.value = value + + def __str__(self): + return self.value + + # On Python 3, an exception which returns bytes from its __str__() + # method (like StrException(bytes)) is probably a bug, but it was not + # harder to support this silly case in exception_to_unicode().
+ + # Decode from ASCII + exc = StrException(b'bytes ascii') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'bytes ascii') + + # Decode from UTF-8 + exc = StrException(b'utf-8 \xc3\xa9\xe2\x82\xac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'utf-8 \xe9\u20ac') + + # Force the locale encoding to ASCII to test the fallback + with mock.patch('sys.getfilesystemencoding', return_value='ascii'): + # Fallback: decode from ISO-8859-1 + exc = StrException(b'rawbytes \x80\xff') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'rawbytes \x80\xff') + + # No conversion needed + exc = StrException(u'unicode ascii') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'unicode ascii') + + # No conversion needed + exc = StrException(u'unicode \xe9\u20ac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'unicode \xe9\u20ac') + + # Test the locale encoding + with mock.patch('sys.getfilesystemencoding', return_value='koi8_r'): + exc = StrException(b'\xf2\xd5\xd3\xd3\xcb\xc9\xca') + # Decode from the locale encoding + # (the message cannot be decoded from ASCII nor UTF-8) + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439') + + @testtools.skipIf(six.PY3, 'test specific to Python 2') + def test_unicode_exception(self): + # Exception with a __unicode__() method, but no __str__() + class UnicodeException(Exception): + def __init__(self, value): + Exception.__init__(self) + self.value = value + + def __unicode__(self): + return self.value + + # __unicode__() returns unicode + exc = UnicodeException(u'unicode \xe9\u20ac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'unicode \xe9\u20ac') + + # __unicode__() returns bytes (does this case really happen in the + # wild?) 
+ exc = UnicodeException(b'utf-8 \xc3\xa9\xe2\x82\xac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'utf-8 \xe9\u20ac') + + @testtools.skipIf(six.PY3, 'test specific to Python 2') + def test_unicode_or_str_exception(self): + # Exception with __str__() and __unicode__() methods + class UnicodeOrStrException(Exception): + def __init__(self, unicode_value, str_value): + Exception.__init__(self) + self.unicode_value = unicode_value + self.str_value = str_value + + def __unicode__(self): + return self.unicode_value + + def __str__(self): + return self.str_value + + # __unicode__() returns unicode + exc = UnicodeOrStrException(u'unicode \xe9\u20ac', b'str') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'unicode \xe9\u20ac') + + # __unicode__() returns bytes (does this case really happen in the + # wild?) + exc = UnicodeOrStrException(b'utf-8 \xc3\xa9\xe2\x82\xac', b'str') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'utf-8 \xe9\u20ac') + + @testtools.skipIf(six.PY3, 'test specific to Python 2') + def test_unicode_only_exception(self): + # Exception with a __unicode__() method and a __str__() which + # raises an exception (similar to the Message class of oslo_i18n) + class UnicodeOnlyException(Exception): + def __init__(self, value): + Exception.__init__(self) + self.value = value + + def __unicode__(self): + return self.value + + def __str__(self): + raise UnicodeError("use unicode()") + + # __unicode__() returns unicode + exc = UnicodeOnlyException(u'unicode \xe9\u20ac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'unicode \xe9\u20ac') + + # __unicode__() returns bytes + exc = UnicodeOnlyException(b'utf-8 \xc3\xa9\xe2\x82\xac') + self.assertEqual(encodeutils.exception_to_unicode(exc), + u'utf-8 \xe9\u20ac')