Add stream-based detection

This effectively unifies the InfoWrapper and detect_file_format()
behavior into a single approach that can work stream-based or with
a file on disk.

Note that the wrapper is also renamed from "InfoWrapper" to
"InspectWrapper". The old name reflected glance's intended use-case,
which was metadata extraction. Since the wrapper has clearly grown
beyond that scope, now is the time to change the name.

Change-Id: Id61f7472f791fc258ec7d0238568c379a7b27823
This commit is contained in:
Dan Smith 2024-08-07 07:40:24 -07:00
parent a0481d5a61
commit 91af49beed
2 changed files with 153 additions and 131 deletions

View File

@ -1256,38 +1256,57 @@ class GPTInspector(FileInspector):
raise SafetyViolation('GPT MBR has no partitions defined')
class InfoWrapper(object):
"""A file-like object that wraps another and updates a format inspector.
class InspectWrapper:
"""A file-like object that wraps another and detects the format.
This passes chunks to the format inspector while reading. If the inspector
fails, it logs the error and stops calling it, but continues proxying data
from the source to its user.
This passes chunks to a group of format inspectors (default: all)
while reading. After the stream is finished (or enough has been read to
make a confident decision), the format attribute will provide the
inspector object that matched.
:param source: The file-like input stream to wrap
:param expected_format: The format name anticipated to match, if any.
If set to a format name, reading of the stream will
be interrupted if the matching inspector raises
an error (indicating a mismatch or any other
problem). This allows the caller to abort before
all data is processed.
:param allowed_formats: A list of format names that limits the inspector
objects that will be used. This may be a security
hole if used improperly, but may be used to limit
the detected formats to some smaller scope.
"""
def __init__(self, source, fmt):
def __init__(self, source, expected_format=None, allowed_formats=None):
self._source = source
self._format = fmt
self._error = False
self._expected_format = expected_format
self._errored_inspectors = set()
self._inspectors = {v() for k, v in ALL_FORMATS.items()
if not allowed_formats or k in allowed_formats}
self._finished = False
def __iter__(self):
return self
def _process_chunk(self, chunk):
if not self._error:
for inspector in [i for i in self._inspectors
if i not in self._errored_inspectors]:
try:
self._format.eat_chunk(chunk)
inspector.eat_chunk(chunk)
except Exception as e:
if inspector.NAME == self._expected_format:
# If our desired inspector has failed, we cannot continue
raise
# Absolutely do not allow the format inspector to break
# our streaming of the image. If we failed, just stop
# trying, log and keep going.
LOG.error('Format inspector failed, aborting: %s', e)
self._error = True
# our streaming of the image for non-expected formats. If we
# failed, just stop trying, log and keep going.
LOG.debug('Format inspector failed, aborting: %s', e)
self._errored_inspectors.add(inspector)
def __next__(self):
try:
chunk = next(self._source)
except StopIteration:
self._format.finish()
self._finish()
raise
self._process_chunk(chunk)
return chunk
@ -1297,10 +1316,54 @@ class InfoWrapper(object):
self._process_chunk(chunk)
return chunk
def _finish(self):
for inspector in self._inspectors:
inspector.finish()
self._finished = True
def close(self):
if hasattr(self._source, 'close'):
self._source.close()
self._format.finish()
self._finish()
@property
def format(self):
    """The format determined from the content.

    If this is None, a decision has not been reached. Otherwise,
    it is a FileInspector that matches (which may be the raw inspector
    if no other formats matched and enough of the stream has been read
    to make that determination).

    :returns: The matching inspector object, or None if the stream is
              still being read and some inspectors are incomplete.
    :raises: ImageFormatError if more than one non-raw format matched,
             or if nothing matched and no raw inspector is available
             (i.e. allowed_formats was constrained and did not include
             raw).
    """
    # Identify raw by NAME everywhere so that the "specific" set and the
    # raw fallback below can never disagree about which inspector is raw.
    non_raw = [i for i in self._inspectors if i.NAME != 'raw']

    if not self._finished and not all(i.complete for i in non_raw):
        # We do not know what our format is if we are still in the
        # process of reading the stream and have incomplete inspectors.
        # However, if EOF has been signaled, then we can assume the
        # incomplete ones are not matches.
        return None

    matches = [i for i in non_raw if i.format_match]

    if len(matches) > 1:
        # Multiple format matches mean not only that we cannot return a
        # decision here, but also that there may be something nefarious
        # going on (i.e. hiding one header in another). The names are
        # sorted so the message does not depend on set iteration order.
        raise ImageFormatError('Multiple formats detected: %s' % ','.join(
            sorted(str(i) for i in matches)))

    if matches:
        # The expected outcome of this is a single match of something
        # specific.
        return matches[0]

    # If nothing *specific* matched, we return the raw format to
    # indicate that we do not recognize this content at all.
    raw = next((i for i in self._inspectors if i.NAME == 'raw'), None)
    if raw is None:
        raise ImageFormatError(
            'Content does not match any allowed format')
    return raw
ALL_FORMATS = {
@ -1337,36 +1400,12 @@ def detect_file_format(filename):
:returns: A FormatInspector instance matching the file.
:raises: ImageFormatError if multiple formats are detected.
"""
inspectors = {k: v() for k, v in ALL_FORMATS.items()}
detections = []
with open(filename, 'rb') as f:
for chunk in _chunked_reader(f, 4096):
for format, inspector in list(inspectors.items()):
try:
inspector.eat_chunk(chunk)
except ImageFormatError:
# No match, so stop considering this format
inspectors.pop(format)
continue
if (inspector.format_match and inspector.complete and
format != 'raw'):
# record all match (other than raw)
detections.append(inspector)
inspectors.pop(format)
if all(i.complete for i in inspectors.values()):
# If all the inspectors are sure they are not a match, avoid
# reading to the end of the file to settle on 'raw'.
break
for format, inspector in list(inspectors.items()):
inspector.finish()
if inspector.format_match and inspector.complete and format != 'raw':
detections.append(inspector)
inspectors.pop(format)
if len(detections) > 1:
all_formats = [str(inspector) for inspector in detections]
raise ImageFormatError(
'Multiple formats detected: %s' % ', '.join(all_formats))
return inspectors['raw'] if not detections else detections[0]
wrapper = InspectWrapper(f)
try:
for _chunk in _chunked_reader(wrapper, 4096):
if wrapper.format:
return wrapper.format
finally:
wrapper.close()
return wrapper.format

View File

@ -15,7 +15,6 @@
import io
import os
import re
import struct
import subprocess
import tempfile
@ -25,21 +24,19 @@ import ddt
from oslo_utils import units
from oslo_utils.imageutils import format_inspector
from oslo_utils.imageutils import QemuImgInfo
from oslotest import base as test_base
TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
def get_size_from_qemu_img(filename):
output = subprocess.check_output('qemu-img info "%s"' % filename,
shell=True)
for line in output.split(b'\n'):
m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip())
if m:
return int(m.group(1))
raise Exception('Could not find virtual size with qemu-img')
def get_size_format_from_qemu_img(filename):
    """Return the virtual size and format that qemu-img reports for a file.

    Runs ``qemu-img info --output=json`` on the file and parses the output
    with QemuImgInfo.

    :param filename: Path of the image file to inspect.
    :returns: A ``(virtual_size, file_format)`` tuple taken from the parsed
              qemu-img output.
    :raises: subprocess.CalledProcessError if qemu-img exits non-zero.
    """
    # Pass an argument list instead of a shell string so that a filename
    # containing quotes, spaces or shell metacharacters is handed to
    # qemu-img verbatim rather than being interpreted by a shell.
    output = subprocess.check_output(
        ['qemu-img', 'info', '--output=json', filename])
    info = QemuImgInfo(output, format='json')
    return info.virtual_size, info.file_format
@ddt.ddt
@ -217,11 +214,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
return fn
def _test_format_at_block_size(self, format_name, img, block_size):
fmt = format_inspector.get_inspector(format_name)()
self.assertIsNotNone(fmt,
'Did not get format inspector for %s' % (
format_name))
wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
wrapper = format_inspector.InspectWrapper(open(img, 'rb'),
format_name)
while True:
chunk = wrapper.read(block_size)
@ -229,7 +223,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
break
wrapper.close()
return fmt
self.assertIsNotNone(wrapper.format, 'Failed to detect format')
return wrapper.format
def _test_format_at_image_size(self, format_name, image_size,
subformat=None, safety_check=False):
@ -244,7 +239,7 @@ class TestFormatInspectors(test_base.BaseTestCase):
# Some formats have internal alignment restrictions making this not
# always exactly like image_size, so get the real value for comparison
virtual_size = get_size_from_qemu_img(img)
virtual_size, _ = get_size_format_from_qemu_img(img)
# Read the format in various sizes, some of which will read whole
# sections in a single read, others will be completely unaligned, etc.
@ -326,25 +321,18 @@ class TestFormatInspectors(test_base.BaseTestCase):
return qcow, iso, fn
def test_bad_iso_qcow2(self):
# Test that an iso with a qcow2 header in the system area will be
# rejected because it matches more than one format (iso and qcow2).
# This is an important case because qemu-img does not support iso,
# and can be fooled into thinking one is a qcow2 by putting the header
# for one in ISO9660's "system area", which is technically a valid
# thing to do.
_, _, fn = self._generate_bad_iso()
iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki)
qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki)
# this system area of the ISO file is not considered part of the format
# the qcow2 header is in the system area of the ISO file
# so the ISO file is still valid
self.assertTrue(iso_check.format_match)
# the qcow2 header is in the system area of the ISO file
# but that will be parsed by the qcow2 format inspector
# and it will match
self.assertTrue(qcow_check.format_match)
# if we call format_inspector.detect_file_format it should detect
# and raise an exception because both match internally.
e = self.assertRaises(
format_inspector.ImageFormatError,
format_inspector.detect_file_format, fn)
self.assertIn('Multiple formats detected', str(e))
self.assertRaisesRegex(format_inspector.ImageFormatError,
'Multiple formats detected',
self._test_format_at_block_size,
'iso', fn, 4 * units.Ki)
def test_from_file_reads_minimum(self):
img = self._create_img('qcow2', 10 * units.Mi)
@ -387,14 +375,10 @@ class TestFormatInspectors(test_base.BaseTestCase):
# Read the format in various sizes, some of which will read whole
# sections in a single read, others will be completely unaligned, etc.
for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
fmt = self._test_format_at_block_size(format_name, img, block_size)
self.assertTrue(fmt.format_match,
'Failed to match %s at size %i block %i' % (
format_name, image_size, block_size))
self.assertEqual(0, fmt.virtual_size,
('Calculated a virtual size for a corrupt %s at '
'size %i block %i') % (format_name, image_size,
block_size))
self.assertRaisesRegex(format_inspector.ImageFormatError,
'Wrong descriptor location',
self._test_format_at_block_size,
'vmdk', img, block_size)
def test_vmdk_bad_descriptor_offset(self):
self._test_vmdk_bad_descriptor_offset()
@ -559,46 +543,46 @@ class TestFormatInspectors(test_base.BaseTestCase):
def test_vdi(self):
self._test_format('vdi')
def _test_format_with_invalid_data(self, format_name):
fmt = format_inspector.get_inspector(format_name)()
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
def test_invalid_data(self):
wrapper = format_inspector.InspectWrapper(open(__file__, 'rb'))
while True:
chunk = wrapper.read(32)
if not chunk:
break
wrapper.close()
self.assertFalse(fmt.format_match)
self.assertEqual(0, fmt.virtual_size)
memory = sum(fmt.context_info.values())
self.assertLess(memory, 512 * units.Ki,
'Format used more than 512KiB of memory: %s' % (
fmt.context_info))
# Make sure this was not detected as any other format
self.assertEqual('raw', str(wrapper.format))
def test_qcow2_invalid(self):
self._test_format_with_invalid_data('qcow2')
# Make sure that all of the other inspectors do not match and did not
# use too much memory
for fmt in wrapper._inspectors:
if str(fmt) == 'raw':
continue
self.assertFalse(fmt.format_match)
memory = sum(fmt.context_info.values())
self.assertLess(memory, 512 * units.Ki,
'Format used more than 512KiB of memory: %s' % (
fmt.context_info))
def test_vhd_invalid(self):
self._test_format_with_invalid_data('vhd')
def test_invalid_data_without_raw(self):
wrapper = format_inspector.InspectWrapper(
open(__file__, 'rb'),
allowed_formats=['qcow2', 'vmdk'])
while True:
chunk = wrapper.read(32)
if not chunk:
break
def test_vhdx_invalid(self):
self._test_format_with_invalid_data('vhdx')
def test_vmdk_invalid(self):
self._test_format_with_invalid_data('vmdk')
def test_vdi_invalid(self):
self._test_format_with_invalid_data('vdi')
wrapper.close()
# Make sure this was not detected as any other format
self.assertRaises(format_inspector.ImageFormatError,
lambda: wrapper.format)
def test_vmdk_invalid_type(self):
fmt = format_inspector.get_inspector('vmdk')()
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
while True:
chunk = wrapper.read(32)
if not chunk:
break
wrapper.close()
fmt = format_inspector.VMDKInspector()
with open(__file__, 'rb') as f:
fmt.eat_chunk(f.read())
fake_rgn = mock.MagicMock()
fake_rgn.complete = True
@ -941,8 +925,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
def _get_wrapper(self, data):
source = io.BytesIO(data)
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
return format_inspector.InfoWrapper(source, fake_fmt)
return format_inspector.InspectWrapper(source)
def test_info_wrapper_file_like(self):
data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
@ -967,9 +950,10 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
self.assertEqual(data, read_data)
def test_info_wrapper_file_like_eats_error(self):
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
def test_info_wrapper_file_like_eats_error(self, mock_eat):
wrapper = self._get_wrapper(b'123456')
wrapper._format.eat_chunk.side_effect = Exception('fail')
mock_eat.side_effect = Exception('fail')
data = b''
while True:
@ -983,13 +967,12 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
# Make sure we only called this once and never again after
# the error was raised
wrapper._format.eat_chunk.assert_called_once_with(b'123')
mock_eat.assert_called_once_with(b'123')
def test_info_wrapper_iter_like_eats_error(self):
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']),
fake_fmt)
fake_fmt.eat_chunk.side_effect = Exception('fail')
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
def test_wrapper_iter_like_eats_error(self, mock_eat):
wrapper = format_inspector.InspectWrapper(iter([b'123', b'456']))
mock_eat.side_effect = Exception('fail')
data = b''
for chunk in wrapper:
@ -1000,7 +983,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
# Make sure we only called this once and never again after
# the error was raised
fake_fmt.eat_chunk.assert_called_once_with(b'123')
mock_eat.assert_called_once_with(b'123')
def test_get_inspector(self):
self.assertEqual(format_inspector.QcowInspector,