Add stream-based detection

This effectively unifies the InfoWrapper and detect_file_format()
behavior into a single approach that can work stream-based or with
a file on disk.

Note that the wrapper name is also changed as "InfoWrapper" was
named for glance's intended use-case, which was metadata extraction.
Since this has obviously grown in scope, now is the time to change
that name.

Change-Id: Id61f7472f791fc258ec7d0238568c379a7b27823
This commit is contained in:
Dan Smith 2024-08-07 07:40:24 -07:00
parent a0481d5a61
commit 91af49beed
2 changed files with 153 additions and 131 deletions

View File

@ -1256,38 +1256,57 @@ class GPTInspector(FileInspector):
raise SafetyViolation('GPT MBR has no partitions defined') raise SafetyViolation('GPT MBR has no partitions defined')
class InfoWrapper(object): class InspectWrapper:
"""A file-like object that wraps another and updates a format inspector. """A file-like object that wraps another and detects the format.
This passes chunks to the format inspector while reading. If the inspector This passes chunks to a group of format inspectors (default: all)
fails, it logs the error and stops calling it, but continues proxying data while reading. After the stream is finished (or enough has been read to
from the source to its user. make a confident decision), the format attribute will provide the
inspector object that matched.
:param source: The file-like input stream to wrap
:param expected_format: The format name anticipated to match, if any.
If set to a format name, reading of the stream will
be interrupted if the matching inspector raises
an error (indicating a mismatch or any other
problem). This allows the caller to abort before
all data is processed.
:param allowed_formats: A list of format names that limits the inspector
objects that will be used. This may be a security
hole if used improperly, but may be used to limit
the detected formats to some smaller scope.
""" """
def __init__(self, source, expected_format=None, allowed_formats=None):
def __init__(self, source, fmt):
self._source = source self._source = source
self._format = fmt self._expected_format = expected_format
self._error = False self._errored_inspectors = set()
self._inspectors = {v() for k, v in ALL_FORMATS.items()
if not allowed_formats or k in allowed_formats}
self._finished = False
def __iter__(self): def __iter__(self):
return self return self
def _process_chunk(self, chunk): def _process_chunk(self, chunk):
if not self._error: for inspector in [i for i in self._inspectors
if i not in self._errored_inspectors]:
try: try:
self._format.eat_chunk(chunk) inspector.eat_chunk(chunk)
except Exception as e: except Exception as e:
if inspector.NAME == self._expected_format:
# If our desired inspector has failed, we cannot continue
raise
# Absolutely do not allow the format inspector to break # Absolutely do not allow the format inspector to break
# our streaming of the image. If we failed, just stop # our streaming of the image for non-expected formats. If we
# trying, log and keep going. # failed, just stop trying, log and keep going.
LOG.error('Format inspector failed, aborting: %s', e) LOG.debug('Format inspector failed, aborting: %s', e)
self._error = True self._errored_inspectors.add(inspector)
def __next__(self): def __next__(self):
try: try:
chunk = next(self._source) chunk = next(self._source)
except StopIteration: except StopIteration:
self._format.finish() self._finish()
raise raise
self._process_chunk(chunk) self._process_chunk(chunk)
return chunk return chunk
@ -1297,10 +1316,54 @@ class InfoWrapper(object):
self._process_chunk(chunk) self._process_chunk(chunk)
return chunk return chunk
def _finish(self):
for inspector in self._inspectors:
inspector.finish()
self._finished = True
def close(self): def close(self):
if hasattr(self._source, 'close'): if hasattr(self._source, 'close'):
self._source.close() self._source.close()
self._format.finish() self._finish()
@property
def format(self):
"""The format determined from the content.
If this is None, a decision has not been reached. Otherwise,
it is a FileInspector that matches (which may be RawFileInspector
if no other formats matched and enough of the stream has been read
to make that determination). If more than one format matched, then
ImageFormatError is raised. If the allowed_formats was constrained
and raw was not included, then this will raise ImageFormatError to
indicate that no suitable match was found.
"""
non_raw = set([i for i in self._inspectors if i.NAME != 'raw'])
complete = all([i.complete for i in non_raw])
matches = [i for i in non_raw if i.format_match]
if not complete and not self._finished:
# We do not know what our format is if we're still in the process
# of reading the stream and have incomplete inspectors. However,
# if EOF has been signaled, then we can assume the incomplete ones
# are not matches.
return None
if len(matches) > 1:
# Multiple format matches mean that not only can we not return a
# decision here, but also means that there may be something
# nefarious going on (i.e. hiding one header in another).
raise ImageFormatError('Multiple formats detected: %s' % ','.join(
str(i) for i in matches))
if not matches:
try:
# If nothing *specific* matched, we return the raw format to
# indicate that we do not recognize this content at all.
return [x for x in self._inspectors if str(x) == 'raw'][0]
except IndexError:
raise ImageFormatError(
'Content does not match any allowed format')
# The expected outcome of this is a single match of something specific
return matches[0]
ALL_FORMATS = { ALL_FORMATS = {
@ -1337,36 +1400,12 @@ def detect_file_format(filename):
:returns: A FormatInspector instance matching the file. :returns: A FormatInspector instance matching the file.
:raises: ImageFormatError if multiple formats are detected. :raises: ImageFormatError if multiple formats are detected.
""" """
inspectors = {k: v() for k, v in ALL_FORMATS.items()}
detections = []
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
for chunk in _chunked_reader(f, 4096): wrapper = InspectWrapper(f)
for format, inspector in list(inspectors.items()): try:
try: for _chunk in _chunked_reader(wrapper, 4096):
inspector.eat_chunk(chunk) if wrapper.format:
except ImageFormatError: return wrapper.format
# No match, so stop considering this format finally:
inspectors.pop(format) wrapper.close()
continue return wrapper.format
if (inspector.format_match and inspector.complete and
format != 'raw'):
# record all match (other than raw)
detections.append(inspector)
inspectors.pop(format)
if all(i.complete for i in inspectors.values()):
# If all the inspectors are sure they are not a match, avoid
# reading to the end of the file to settle on 'raw'.
break
for format, inspector in list(inspectors.items()):
inspector.finish()
if inspector.format_match and inspector.complete and format != 'raw':
detections.append(inspector)
inspectors.pop(format)
if len(detections) > 1:
all_formats = [str(inspector) for inspector in detections]
raise ImageFormatError(
'Multiple formats detected: %s' % ', '.join(all_formats))
return inspectors['raw'] if not detections else detections[0]

View File

@ -15,7 +15,6 @@
import io import io
import os import os
import re
import struct import struct
import subprocess import subprocess
import tempfile import tempfile
@ -25,21 +24,19 @@ import ddt
from oslo_utils import units from oslo_utils import units
from oslo_utils.imageutils import format_inspector from oslo_utils.imageutils import format_inspector
from oslo_utils.imageutils import QemuImgInfo
from oslotest import base as test_base from oslotest import base as test_base
TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-' TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
def get_size_from_qemu_img(filename): def get_size_format_from_qemu_img(filename):
output = subprocess.check_output('qemu-img info "%s"' % filename, output = subprocess.check_output(
shell=True) 'qemu-img info --output=json "%s"' % filename,
for line in output.split(b'\n'): shell=True)
m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip()) info = QemuImgInfo(output, format='json')
if m: return info.virtual_size, info.file_format
return int(m.group(1))
raise Exception('Could not find virtual size with qemu-img')
@ddt.ddt @ddt.ddt
@ -217,11 +214,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
return fn return fn
def _test_format_at_block_size(self, format_name, img, block_size): def _test_format_at_block_size(self, format_name, img, block_size):
fmt = format_inspector.get_inspector(format_name)() wrapper = format_inspector.InspectWrapper(open(img, 'rb'),
self.assertIsNotNone(fmt, format_name)
'Did not get format inspector for %s' % (
format_name))
wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
while True: while True:
chunk = wrapper.read(block_size) chunk = wrapper.read(block_size)
@ -229,7 +223,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
break break
wrapper.close() wrapper.close()
return fmt self.assertIsNotNone(wrapper.format, 'Failed to detect format')
return wrapper.format
def _test_format_at_image_size(self, format_name, image_size, def _test_format_at_image_size(self, format_name, image_size,
subformat=None, safety_check=False): subformat=None, safety_check=False):
@ -244,7 +239,7 @@ class TestFormatInspectors(test_base.BaseTestCase):
# Some formats have internal alignment restrictions making this not # Some formats have internal alignment restrictions making this not
# always exactly like image_size, so get the real value for comparison # always exactly like image_size, so get the real value for comparison
virtual_size = get_size_from_qemu_img(img) virtual_size, _ = get_size_format_from_qemu_img(img)
# Read the format in various sizes, some of which will read whole # Read the format in various sizes, some of which will read whole
# sections in a single read, others will be completely unaligned, etc. # sections in a single read, others will be completely unaligned, etc.
@ -326,25 +321,18 @@ class TestFormatInspectors(test_base.BaseTestCase):
return qcow, iso, fn return qcow, iso, fn
def test_bad_iso_qcow2(self): def test_bad_iso_qcow2(self):
# Test that an iso with a qcow2 header in the system area will be
# rejected because it matches more than one format (iso and qcow2).
# This is an important case because qemu-img does not support iso,
# and can be fooled into thinking one is a qcow2 by putting the header
# for one in ISO9660's "system area", which is technically a valid
# thing to do.
_, _, fn = self._generate_bad_iso() _, _, fn = self._generate_bad_iso()
iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki) self.assertRaisesRegex(format_inspector.ImageFormatError,
qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki) 'Multiple formats detected',
# this system area of the ISO file is not considered part of the format self._test_format_at_block_size,
# the qcow2 header is in the system area of the ISO file 'iso', fn, 4 * units.Ki)
# so the ISO file is still valid
self.assertTrue(iso_check.format_match)
# the qcow2 header is in the system area of the ISO file
# but that will be parsed by the qcow2 format inspector
# and it will match
self.assertTrue(qcow_check.format_match)
# if we call format_inspector.detect_file_format it should detect
# and raise an exception because both match internally.
e = self.assertRaises(
format_inspector.ImageFormatError,
format_inspector.detect_file_format, fn)
self.assertIn('Multiple formats detected', str(e))
def test_from_file_reads_minimum(self): def test_from_file_reads_minimum(self):
img = self._create_img('qcow2', 10 * units.Mi) img = self._create_img('qcow2', 10 * units.Mi)
@ -387,14 +375,10 @@ class TestFormatInspectors(test_base.BaseTestCase):
# Read the format in various sizes, some of which will read whole # Read the format in various sizes, some of which will read whole
# sections in a single read, others will be completely unaligned, etc. # sections in a single read, others will be completely unaligned, etc.
for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi): for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
fmt = self._test_format_at_block_size(format_name, img, block_size) self.assertRaisesRegex(format_inspector.ImageFormatError,
self.assertTrue(fmt.format_match, 'Wrong descriptor location',
'Failed to match %s at size %i block %i' % ( self._test_format_at_block_size,
format_name, image_size, block_size)) 'vmdk', img, block_size)
self.assertEqual(0, fmt.virtual_size,
('Calculated a virtual size for a corrupt %s at '
'size %i block %i') % (format_name, image_size,
block_size))
def test_vmdk_bad_descriptor_offset(self): def test_vmdk_bad_descriptor_offset(self):
self._test_vmdk_bad_descriptor_offset() self._test_vmdk_bad_descriptor_offset()
@ -559,46 +543,46 @@ class TestFormatInspectors(test_base.BaseTestCase):
def test_vdi(self): def test_vdi(self):
self._test_format('vdi') self._test_format('vdi')
def _test_format_with_invalid_data(self, format_name): def test_invalid_data(self):
fmt = format_inspector.get_inspector(format_name)() wrapper = format_inspector.InspectWrapper(open(__file__, 'rb'))
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
while True: while True:
chunk = wrapper.read(32) chunk = wrapper.read(32)
if not chunk: if not chunk:
break break
wrapper.close() wrapper.close()
self.assertFalse(fmt.format_match) # Make sure this was not detected as any other format
self.assertEqual(0, fmt.virtual_size) self.assertEqual('raw', str(wrapper.format))
memory = sum(fmt.context_info.values())
self.assertLess(memory, 512 * units.Ki,
'Format used more than 512KiB of memory: %s' % (
fmt.context_info))
def test_qcow2_invalid(self): # Make sure that all of the other inspectors do not match and did not
self._test_format_with_invalid_data('qcow2') # use too much memory
for fmt in wrapper._inspectors:
if str(fmt) == 'raw':
continue
self.assertFalse(fmt.format_match)
memory = sum(fmt.context_info.values())
self.assertLess(memory, 512 * units.Ki,
'Format used more than 512KiB of memory: %s' % (
fmt.context_info))
def test_vhd_invalid(self): def test_invalid_data_without_raw(self):
self._test_format_with_invalid_data('vhd') wrapper = format_inspector.InspectWrapper(
open(__file__, 'rb'),
allowed_formats=['qcow2', 'vmdk'])
while True:
chunk = wrapper.read(32)
if not chunk:
break
def test_vhdx_invalid(self): wrapper.close()
self._test_format_with_invalid_data('vhdx') # Make sure this was not detected as any other format
self.assertRaises(format_inspector.ImageFormatError,
def test_vmdk_invalid(self): lambda: wrapper.format)
self._test_format_with_invalid_data('vmdk')
def test_vdi_invalid(self):
self._test_format_with_invalid_data('vdi')
def test_vmdk_invalid_type(self): def test_vmdk_invalid_type(self):
fmt = format_inspector.get_inspector('vmdk')() fmt = format_inspector.VMDKInspector()
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt) with open(__file__, 'rb') as f:
while True: fmt.eat_chunk(f.read())
chunk = wrapper.read(32)
if not chunk:
break
wrapper.close()
fake_rgn = mock.MagicMock() fake_rgn = mock.MagicMock()
fake_rgn.complete = True fake_rgn.complete = True
@ -941,8 +925,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
def _get_wrapper(self, data): def _get_wrapper(self, data):
source = io.BytesIO(data) source = io.BytesIO(data)
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw')) return format_inspector.InspectWrapper(source)
return format_inspector.InfoWrapper(source, fake_fmt)
def test_info_wrapper_file_like(self): def test_info_wrapper_file_like(self):
data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z'))) data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
@ -967,9 +950,10 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
self.assertEqual(data, read_data) self.assertEqual(data, read_data)
def test_info_wrapper_file_like_eats_error(self): @mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
def test_info_wrapper_file_like_eats_error(self, mock_eat):
wrapper = self._get_wrapper(b'123456') wrapper = self._get_wrapper(b'123456')
wrapper._format.eat_chunk.side_effect = Exception('fail') mock_eat.side_effect = Exception('fail')
data = b'' data = b''
while True: while True:
@ -983,13 +967,12 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
# Make sure we only called this once and never again after # Make sure we only called this once and never again after
# the error was raised # the error was raised
wrapper._format.eat_chunk.assert_called_once_with(b'123') mock_eat.assert_called_once_with(b'123')
def test_info_wrapper_iter_like_eats_error(self): @mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw')) def test_wrapper_iter_like_eats_error(self, mock_eat):
wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']), wrapper = format_inspector.InspectWrapper(iter([b'123', b'456']))
fake_fmt) mock_eat.side_effect = Exception('fail')
fake_fmt.eat_chunk.side_effect = Exception('fail')
data = b'' data = b''
for chunk in wrapper: for chunk in wrapper:
@ -1000,7 +983,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
# Make sure we only called this once and never again after # Make sure we only called this once and never again after
# the error was raised # the error was raised
fake_fmt.eat_chunk.assert_called_once_with(b'123') mock_eat.assert_called_once_with(b'123')
def test_get_inspector(self): def test_get_inspector(self):
self.assertEqual(format_inspector.QcowInspector, self.assertEqual(format_inspector.QcowInspector,