From 7f2406874ec187fd65dc7aef53d184236aa7edc8 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@redhat.com>
Date: Wed, 11 Mar 2015 16:21:49 +0100
Subject: [PATCH] Add exception_to_unicode() function

There is no simple way that works on both Python 2 and Python 3 to get
the message of an exception as a Unicode string. This new function uses
a heuristic to get the encoding of the exception message. It tries UTF-8
(which is a superset of ASCII), then the locale encoding, and finally
falls back to ISO-8859-1 (which never fails).

This function is required to log exceptions using the logging module
when the exception message contains non-ASCII characters.

- logging.log() only works with non-ASCII characters on Python 2 and
  Python 3 if the message is formatted as Unicode. For example,
  logging.log(b'error: %s', b'\xe9') fails in most cases.
- unicode % bytes doesn't work with non-ASCII characters
- logging.log(u'error: %s', exc) doesn't work if the exception message
  contains a non-ASCII character

Only logging.log(u'error: %s', exception_to_unicode(exc)) works in all
cases: ASCII or non-ASCII exception message, Python 2 and Python 3.

Co-Authored-By: Joshua Harlow <harlowja@gmail.com>

Change-Id: I241b7c81c7ae3d26f81790e9180678dc9af81e22
---
 oslo_utils/encodeutils.py             |  56 +++++++++++
 oslo_utils/tests/tests_encodeutils.py | 128 ++++++++++++++++++++++++++
 2 files changed, 184 insertions(+)

diff --git a/oslo_utils/encodeutils.py b/oslo_utils/encodeutils.py
index 14bd7175..7d8af13b 100644
--- a/oslo_utils/encodeutils.py
+++ b/oslo_utils/encodeutils.py
@@ -93,3 +93,59 @@ def safe_encode(text, incoming=None,
         return text.encode(encoding, errors)
     else:
         return text
+
+
+def exception_to_unicode(exc):
+    """Get the message of an exception as a Unicode string.
+
+    On Python 3, the exception message is always a Unicode string. On
+    Python 2, the exception message is a bytes string *most* of the time.
+
+    If the exception message is a bytes string, try to decode it from UTF-8
+    (superset of ASCII), from the locale encoding, or fall back to decoding it
+    from ISO-8859-1 (which never fails).
+    """
+    msg = None
+    if six.PY2:
+        # Don't call directly unicode(exc), because it fails with
+        # UnicodeDecodeError on Python 2 if exc.__unicode__() returns a bytes
+        # string not decodable from the default encoding (ASCII)
+        try:
+            msg = exc.__unicode__()
+        except UnicodeError:
+            pass
+
+    if msg is None:
+        # Don't call directly str(exc), because it fails with
+        # UnicodeEncodeError on Python 2 if exc.__str__() returns a Unicode
+        # string not encodable to the default encoding (ASCII)
+        msg = exc.__str__()
+
+    if isinstance(msg, six.text_type):
+        # This should be the default path on Python 3 and an *optional* path
+        # on Python 2 (if for some reason the exception message was already
+        # in unicode instead of the more typical bytes string); so avoid
+        # further converting to unicode in both of these cases.
+        return msg
+
+    try:
+        # Try to decode from UTF-8 (superset of ASCII). The decoder fails
+        # if the string is not a valid UTF-8 string: the UTF-8 codec includes
+        # a validation algorithm to ensure the consistency of the codec.
+        return msg.decode('utf-8')
+    except UnicodeDecodeError:
+        pass
+
+    # Try the locale encoding, most error messages are encoded to this encoding
+    # (ex: os.strerror(errno))
+    encoding = sys.getfilesystemencoding()
+    try:
+        return msg.decode(encoding)
+    except UnicodeDecodeError:
+        pass
+
+    # The encoding is not ASCII, not UTF-8, nor the locale encoding. Fall back
+    # to the ISO-8859-1 encoding which never fails. It will produce mojibake
+    # if the message is not encoded to ISO-8859-1, but we don't want a super
+    # complex heuristic to get the encoding of an exception message.
+    return msg.decode('latin1')
diff --git a/oslo_utils/tests/tests_encodeutils.py b/oslo_utils/tests/tests_encodeutils.py
index 6af4bfb5..d0f696ed 100644
--- a/oslo_utils/tests/tests_encodeutils.py
+++ b/oslo_utils/tests/tests_encodeutils.py
@@ -18,6 +18,7 @@
 import mock
 from oslotest import base as test_base
 import six
+import testtools
 
 from oslo_utils import encodeutils
 
@@ -103,3 +104,130 @@ class EncodeUtilsTest(test_base.BaseTestCase):
             text=text, incoming='utf-8', encoding='iso-8859-1')
         self.assertNotEqual(text, result)
         self.assertNotEqual(six.b("foo\xf1bar"), result)
+
+
+class ExceptionToUnicodeTest(test_base.BaseTestCase):
+
+    def test_str_exception(self):
+        # The regular Exception class cannot be used directly:
+        # Exception(u'\xe9').__str__() raises a UnicodeEncodeError
+        # on Python 2
+        class StrException(Exception):
+            def __init__(self, value):
+                Exception.__init__(self)
+                self.value = value
+
+            def __str__(self):
+                return self.value
+
+        # On Python 3, an exception which returns bytes with its __str__()
+        # method (like StrException(bytes)) is probably a bug, but it was not
+        # hard to support this silly case in exception_to_unicode().
+
+        # Decode from ASCII
+        exc = StrException(b'bytes ascii')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'bytes ascii')
+
+        # Decode from UTF-8
+        exc = StrException(b'utf-8 \xc3\xa9\xe2\x82\xac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'utf-8 \xe9\u20ac')
+
+        # Force the locale encoding to ASCII to test the fallback
+        with mock.patch('sys.getfilesystemencoding', return_value='ascii'):
+            # Fallback: decode from ISO-8859-1
+            exc = StrException(b'rawbytes \x80\xff')
+            self.assertEqual(encodeutils.exception_to_unicode(exc),
+                             u'rawbytes \x80\xff')
+
+        # No conversion needed
+        exc = StrException(u'unicode ascii')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'unicode ascii')
+
+        # No conversion needed
+        exc = StrException(u'unicode \xe9\u20ac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'unicode \xe9\u20ac')
+
+        # Test the locale encoding
+        with mock.patch('sys.getfilesystemencoding', return_value='koi8_r'):
+            exc = StrException(b'\xf2\xd5\xd3\xd3\xcb\xc9\xca')
+            # Decode from the locale encoding
+            # (the message cannot be decoded from ASCII nor UTF-8)
+            self.assertEqual(encodeutils.exception_to_unicode(exc),
+                             u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439')
+
+    @testtools.skipIf(six.PY3, 'test specific to Python 2')
+    def test_unicode_exception(self):
+        # Exception with a __unicode__() method, but no __str__()
+        class UnicodeException(Exception):
+            def __init__(self, value):
+                Exception.__init__(self)
+                self.value = value
+
+            def __unicode__(self):
+                return self.value
+
+        # __unicode__() returns unicode
+        exc = UnicodeException(u'unicode \xe9\u20ac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'unicode \xe9\u20ac')
+
+        # __unicode__() returns bytes (does this case really happen in the
+        # wild?)
+        exc = UnicodeException(b'utf-8 \xc3\xa9\xe2\x82\xac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'utf-8 \xe9\u20ac')
+
+    @testtools.skipIf(six.PY3, 'test specific to Python 2')
+    def test_unicode_or_str_exception(self):
+        # Exception with __str__() and __unicode__() methods
+        class UnicodeOrStrException(Exception):
+            def __init__(self, unicode_value, str_value):
+                Exception.__init__(self)
+                self.unicode_value = unicode_value
+                self.str_value = str_value
+
+            def __unicode__(self):
+                return self.unicode_value
+
+            def __str__(self):
+                return self.str_value
+
+        # __unicode__() returns unicode
+        exc = UnicodeOrStrException(u'unicode \xe9\u20ac', b'str')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'unicode \xe9\u20ac')
+
+        # __unicode__() returns bytes (does this case really happen in the
+        # wild?)
+        exc = UnicodeOrStrException(b'utf-8 \xc3\xa9\xe2\x82\xac', b'str')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'utf-8 \xe9\u20ac')
+
+    @testtools.skipIf(six.PY3, 'test specific to Python 2')
+    def test_unicode_only_exception(self):
+        # Exception with a __unicode__() method and a __str__() which
+        # raises an exception (similar to the Message class of oslo_i18n)
+        class UnicodeOnlyException(Exception):
+            def __init__(self, value):
+                Exception.__init__(self)
+                self.value = value
+
+            def __unicode__(self):
+                return self.value
+
+            def __str__(self):
+                raise UnicodeError("use unicode()")
+
+        # __unicode__() returns unicode
+        exc = UnicodeOnlyException(u'unicode \xe9\u20ac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'unicode \xe9\u20ac')
+
+        # __unicode__() returns bytes
+        exc = UnicodeOnlyException(b'utf-8 \xc3\xa9\xe2\x82\xac')
+        self.assertEqual(encodeutils.exception_to_unicode(exc),
+                         u'utf-8 \xe9\u20ac')