Add stream-based detection

This effectively unifies the InfoWrapper and detect_file_format() behavior into a single approach that works either on a stream or on a file on disk. Note that the wrapper is also renamed: "InfoWrapper" was named for glance's intended use-case, which was metadata extraction. Since its scope has obviously grown beyond that, now is the time to change the name. Change-Id: Id61f7472f791fc258ec7d0238568c379a7b27823
2024-08-07 07:40:24 -07:00 · 2024-08-07 07:40:24 -07:00 · 91af49beed
commit 91af49beed
parent a0481d5a61
2 changed files with 153 additions and 131 deletions
--- a/oslo_utils/imageutils/format_inspector.py
+++ b/oslo_utils/imageutils/format_inspector.py
@ -1256,38 +1256,57 @@ class GPTInspector(FileInspector):
            raise SafetyViolation('GPT MBR has no partitions defined')
-class InfoWrapper(object):
+class InspectWrapper:
-    """A file-like object that wraps another and updates a format inspector.
+    """A file-like object that wraps another and detects the format.
-    This passes chunks to the format inspector while reading. If the inspector
+    This passes chunks to a group of format inspectors (default: all)
-    fails, it logs the error and stops calling it, but continues proxying data
+    while reading. After the stream is finished (or enough has been read to
-    from the source to its user.
+    make a confident decision), the format attribute will provide the
    inspector object that matched.
    :param source: The file-like input stream to wrap
    :param expected_format: The format name anticipated to match, if any.
                            If set to a format name, reading of the stream will
                            be interrupted if the matching inspector raises
                            an error (indicating a mismatch or any other
                            problem). This allows the caller to abort before
                            all data is processed.
    :param allowed_formats: A list of format names that limits the inspector
                            objects that will be used. This may be a security
                            hole if used improperly, but may be used to limit
                            the detected formats to some smaller scope.
    """
-
+    def __init__(self, source, expected_format=None, allowed_formats=None):
    def __init__(self, source, fmt):
        self._source = source
-        self._format = fmt
+        self._expected_format = expected_format
-        self._error = False
+        self._errored_inspectors = set()
        self._inspectors = {v() for k, v in ALL_FORMATS.items()
                            if not allowed_formats or k in allowed_formats}
        self._finished = False
    def __iter__(self):
        return self
    def _process_chunk(self, chunk):
-        if not self._error:
+        for inspector in [i for i in self._inspectors
                          if i not in self._errored_inspectors]:
            try:
-                self._format.eat_chunk(chunk)
+                inspector.eat_chunk(chunk)
            except Exception as e:
                if inspector.NAME == self._expected_format:
                    # If our desired inspector has failed, we cannot continue
                    raise
                # Absolutely do not allow the format inspector to break
-                # our streaming of the image. If we failed, just stop
+                # our streaming of the image for non-expected formats. If we
-                # trying, log and keep going.
+                # failed, just stop trying, log and keep going.
-                LOG.error('Format inspector failed, aborting: %s', e)
+                LOG.debug('Format inspector failed, aborting: %s', e)
-                self._error = True
+                self._errored_inspectors.add(inspector)
    def __next__(self):
        try:
            chunk = next(self._source)
        except StopIteration:
-            self._format.finish()
+            self._finish()
            raise
        self._process_chunk(chunk)
        return chunk
@ -1297,10 +1316,54 @@ class InfoWrapper(object):
        self._process_chunk(chunk)
        return chunk
    def _finish(self):
        for inspector in self._inspectors:
            inspector.finish()
        self._finished = True
    def close(self):
        if hasattr(self._source, 'close'):
            self._source.close()
-        self._format.finish()
+        self._finish()
    @property
    def format(self):
        """The format determined from the content.
        If this is None, a decision has not been reached. Otherwise,
        it is a FileInspector that matches (which may be RawFileInspector
        if no other formats matched and enough of the stream has been read
        to make that determination). If more than one format matched, then
        ImageFormatError is raised. If the allowed_formats was constrained
        and raw was not included, then this will raise ImageFormatError to
        indicate that no suitable match was found.
        """
        non_raw = set([i for i in self._inspectors if i.NAME != 'raw'])
        complete = all([i.complete for i in non_raw])
        matches = [i for i in non_raw if i.format_match]
        if not complete and not self._finished:
            # We do not know what our format is if we're still in the process
            # of reading the stream and have incomplete inspectors. However,
            # if EOF has been signaled, then we can assume the incomplete ones
            # are not matches.
            return None
        if len(matches) > 1:
            # Multiple format matches mean not only that we cannot return a
            # decision here, but also that there may be something nefarious
            # going on (i.e. hiding one header in another).
            raise ImageFormatError('Multiple formats detected: %s' % ','.join(
                str(i) for i in matches))
        if not matches:
            try:
                # If nothing *specific* matched, we return the raw format to
                # indicate that we do not recognize this content at all.
                return [x for x in self._inspectors if str(x) == 'raw'][0]
            except IndexError:
                raise ImageFormatError(
                    'Content does not match any allowed format')
        # The expected outcome of this is a single match of something specific
        return matches[0]
 ALL_FORMATS = {
@ -1337,36 +1400,12 @@ def detect_file_format(filename):
    :returns: A FormatInspector instance matching the file.
    :raises: ImageFormatError if multiple formats are detected.
    """
    inspectors = {k: v() for k, v in ALL_FORMATS.items()}
    detections = []
    with open(filename, 'rb') as f:
-        for chunk in _chunked_reader(f, 4096):
+        wrapper = InspectWrapper(f)
-            for format, inspector in list(inspectors.items()):
+        try:
-                try:
+            for _chunk in _chunked_reader(wrapper, 4096):
-                    inspector.eat_chunk(chunk)
+                if wrapper.format:
-                except ImageFormatError:
+                    return wrapper.format
-                    # No match, so stop considering this format
+        finally:
-                    inspectors.pop(format)
+            wrapper.close()
-                    continue
+        return wrapper.format
                if (inspector.format_match and inspector.complete and
                        format != 'raw'):
                    # record all match (other than raw)
                    detections.append(inspector)
                    inspectors.pop(format)
            if all(i.complete for i in inspectors.values()):
                # If all the inspectors are sure they are not a match, avoid
                # reading to the end of the file to settle on 'raw'.
                break
    for format, inspector in list(inspectors.items()):
        inspector.finish()
        if inspector.format_match and inspector.complete and format != 'raw':
            detections.append(inspector)
            inspectors.pop(format)
    if len(detections) > 1:
        all_formats = [str(inspector) for inspector in detections]
        raise ImageFormatError(
            'Multiple formats detected: %s' % ', '.join(all_formats))
    return inspectors['raw'] if not detections else detections[0]
--- a/oslo_utils/tests/imageutils/test_format_inspector.py
+++ b/oslo_utils/tests/imageutils/test_format_inspector.py
@ -15,7 +15,6 @@
 import io
 import os
 import re
 import struct
 import subprocess
 import tempfile
@ -25,21 +24,19 @@ import ddt
 from oslo_utils import units
 from oslo_utils.imageutils import format_inspector
 from oslo_utils.imageutils import QemuImgInfo
 from oslotest import base as test_base
 TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
-def get_size_from_qemu_img(filename):
+def get_size_format_from_qemu_img(filename):
-    output = subprocess.check_output('qemu-img info "%s"' % filename,
+    output = subprocess.check_output(
-                                     shell=True)
+        'qemu-img info --output=json "%s"' % filename,
-    for line in output.split(b'\n'):
+        shell=True)
-        m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip())
+    info = QemuImgInfo(output, format='json')
-        if m:
+    return info.virtual_size, info.file_format
            return int(m.group(1))
    raise Exception('Could not find virtual size with qemu-img')
@ddt.ddt
@ -217,11 +214,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
        return fn
    def _test_format_at_block_size(self, format_name, img, block_size):
-        fmt = format_inspector.get_inspector(format_name)()
+        wrapper = format_inspector.InspectWrapper(open(img, 'rb'),
-        self.assertIsNotNone(fmt,
+                                                  format_name)
                             'Did not get format inspector for %s' % (
                                 format_name))
        wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
        while True:
            chunk = wrapper.read(block_size)
@ -229,7 +223,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
                break
        wrapper.close()
-        return fmt
+        self.assertIsNotNone(wrapper.format, 'Failed to detect format')
        return wrapper.format
    def _test_format_at_image_size(self, format_name, image_size,
                                   subformat=None, safety_check=False):
@ -244,7 +239,7 @@ class TestFormatInspectors(test_base.BaseTestCase):
        # Some formats have internal alignment restrictions making this not
        # always exactly like image_size, so get the real value for comparison
-        virtual_size = get_size_from_qemu_img(img)
+        virtual_size, _ = get_size_format_from_qemu_img(img)
        # Read the format in various sizes, some of which will read whole
        # sections in a single read, others will be completely unaligned, etc.
@ -326,25 +321,18 @@ class TestFormatInspectors(test_base.BaseTestCase):
        return qcow, iso, fn
    def test_bad_iso_qcow2(self):
-
+        # Test that an iso with a qcow2 header in the system area will be
        # rejected because it matches more than one format (iso and qcow2).
        # This is an important case because qemu-img does not support iso,
        # and can be fooled into thinking one is a qcow2 by putting the header
        # for one in ISO9660's "system area", which is technically a valid
        # thing to do.
        _, _, fn = self._generate_bad_iso()
-        iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki)
+        self.assertRaisesRegex(format_inspector.ImageFormatError,
-        qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki)
+                               'Multiple formats detected',
-        # this system area of the ISO file is not considered part of the format
+                               self._test_format_at_block_size,
-        # the qcow2 header is in the system area of the ISO file
+                               'iso', fn, 4 * units.Ki)
        # so the ISO file is still valid
        self.assertTrue(iso_check.format_match)
        # the qcow2 header is in the system area of the ISO file
        # but that will be parsed by the qcow2 format inspector
        # and it will match
        self.assertTrue(qcow_check.format_match)
        # if we call format_inspector.detect_file_format it should detect
        # and raise an exception because both match internally.
        e = self.assertRaises(
            format_inspector.ImageFormatError,
            format_inspector.detect_file_format, fn)
        self.assertIn('Multiple formats detected', str(e))
    def test_from_file_reads_minimum(self):
        img = self._create_img('qcow2', 10 * units.Mi)
@ -387,14 +375,10 @@ class TestFormatInspectors(test_base.BaseTestCase):
        # Read the format in various sizes, some of which will read whole
        # sections in a single read, others will be completely unaligned, etc.
        for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
-            fmt = self._test_format_at_block_size(format_name, img, block_size)
+            self.assertRaisesRegex(format_inspector.ImageFormatError,
-            self.assertTrue(fmt.format_match,
+                                   'Wrong descriptor location',
-                            'Failed to match %s at size %i block %i' % (
+                                   self._test_format_at_block_size,
-                                format_name, image_size, block_size))
+                                   'vmdk', img, block_size)
            self.assertEqual(0, fmt.virtual_size,
                             ('Calculated a virtual size for a corrupt %s at '
                              'size %i block %i') % (format_name, image_size,
                                                     block_size))
    def test_vmdk_bad_descriptor_offset(self):
        self._test_vmdk_bad_descriptor_offset()
@ -559,46 +543,46 @@ class TestFormatInspectors(test_base.BaseTestCase):
    def test_vdi(self):
        self._test_format('vdi')
-    def _test_format_with_invalid_data(self, format_name):
+    def test_invalid_data(self):
-        fmt = format_inspector.get_inspector(format_name)()
+        wrapper = format_inspector.InspectWrapper(open(__file__, 'rb'))
        wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
        while True:
            chunk = wrapper.read(32)
            if not chunk:
                break
        wrapper.close()
-        self.assertFalse(fmt.format_match)
+        # Make sure this was not detected as any other format
-        self.assertEqual(0, fmt.virtual_size)
+        self.assertEqual('raw', str(wrapper.format))
        memory = sum(fmt.context_info.values())
        self.assertLess(memory, 512 * units.Ki,
                        'Format used more than 512KiB of memory: %s' % (
                            fmt.context_info))
-    def test_qcow2_invalid(self):
+        # Make sure that all of the other inspectors do not match and did not
-        self._test_format_with_invalid_data('qcow2')
+        # use too much memory
        for fmt in wrapper._inspectors:
            if str(fmt) == 'raw':
                continue
            self.assertFalse(fmt.format_match)
            memory = sum(fmt.context_info.values())
            self.assertLess(memory, 512 * units.Ki,
                            'Format used more than 512KiB of memory: %s' % (
                                fmt.context_info))
-    def test_vhd_invalid(self):
+    def test_invalid_data_without_raw(self):
-        self._test_format_with_invalid_data('vhd')
+        wrapper = format_inspector.InspectWrapper(
            open(__file__, 'rb'),
            allowed_formats=['qcow2', 'vmdk'])
        while True:
            chunk = wrapper.read(32)
            if not chunk:
                break
-    def test_vhdx_invalid(self):
+        wrapper.close()
-        self._test_format_with_invalid_data('vhdx')
+        # Make sure this was not detected as any other format
-
+        self.assertRaises(format_inspector.ImageFormatError,
-    def test_vmdk_invalid(self):
+                          lambda: wrapper.format)
        self._test_format_with_invalid_data('vmdk')
    def test_vdi_invalid(self):
        self._test_format_with_invalid_data('vdi')
    def test_vmdk_invalid_type(self):
-        fmt = format_inspector.get_inspector('vmdk')()
+        fmt = format_inspector.VMDKInspector()
-        wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
+        with open(__file__, 'rb') as f:
-        while True:
+            fmt.eat_chunk(f.read())
            chunk = wrapper.read(32)
            if not chunk:
                break
        wrapper.close()
        fake_rgn = mock.MagicMock()
        fake_rgn.complete = True
@ -941,8 +925,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
    def _get_wrapper(self, data):
        source = io.BytesIO(data)
-        fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
+        return format_inspector.InspectWrapper(source)
        return format_inspector.InfoWrapper(source, fake_fmt)
    def test_info_wrapper_file_like(self):
        data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
@ -967,9 +950,10 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
        self.assertEqual(data, read_data)
-    def test_info_wrapper_file_like_eats_error(self):
+    @mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
    def test_info_wrapper_file_like_eats_error(self, mock_eat):
        wrapper = self._get_wrapper(b'123456')
-        wrapper._format.eat_chunk.side_effect = Exception('fail')
+        mock_eat.side_effect = Exception('fail')
        data = b''
        while True:
@ -983,13 +967,12 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
        # Make sure we only called this once and never again after
        # the error was raised
-        wrapper._format.eat_chunk.assert_called_once_with(b'123')
+        mock_eat.assert_called_once_with(b'123')
-    def test_info_wrapper_iter_like_eats_error(self):
+    @mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
-        fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
+    def test_wrapper_iter_like_eats_error(self, mock_eat):
-        wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']),
+        wrapper = format_inspector.InspectWrapper(iter([b'123', b'456']))
-                                               fake_fmt)
+        mock_eat.side_effect = Exception('fail')
        fake_fmt.eat_chunk.side_effect = Exception('fail')
        data = b''
        for chunk in wrapper:
@ -1000,7 +983,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
        # Make sure we only called this once and never again after
        # the error was raised
-        fake_fmt.eat_chunk.assert_called_once_with(b'123')
+        mock_eat.assert_called_once_with(b'123')
    def test_get_inspector(self):
        self.assertEqual(format_inspector.QcowInspector,