Add stream-based detection
This effectively unifies the InfoWrapper and detect_file_format() behavior into a single approach that can work stream-based or with a file on disk. Note that the wrapper name is also changed, as "InfoWrapper" was named for Glance's intended use-case, which was metadata extraction. Since this has obviously grown in scope, now is the time to change that name. Change-Id: Id61f7472f791fc258ec7d0238568c379a7b27823
This commit is contained in:
parent
a0481d5a61
commit
91af49beed
@ -1256,38 +1256,57 @@ class GPTInspector(FileInspector):
|
||||
raise SafetyViolation('GPT MBR has no partitions defined')
|
||||
|
||||
|
||||
class InfoWrapper(object):
|
||||
"""A file-like object that wraps another and updates a format inspector.
|
||||
class InspectWrapper:
|
||||
"""A file-like object that wraps another and detects the format.
|
||||
|
||||
This passes chunks to the format inspector while reading. If the inspector
|
||||
fails, it logs the error and stops calling it, but continues proxying data
|
||||
from the source to its user.
|
||||
This passes chunks to a group of format inspectors (default: all)
|
||||
while reading. After the stream is finished (or enough has been read to
|
||||
make a confident decision), the format attribute will provide the
|
||||
inspector object that matched.
|
||||
|
||||
:param source: The file-like input stream to wrap
|
||||
:param expected_format: The format name anticipated to match, if any.
|
||||
If set to a format name, reading of the stream will
|
||||
be interrupted if the matching inspector raises
|
||||
an error (indicating a mismatch or any other
|
||||
problem). This allows the caller to abort before
|
||||
all data is processed.
|
||||
:param allowed_formats: A list of format names that limits the inspector
|
||||
objects that will be used. This may be a security
|
||||
hole if used improperly, but may be used to limit
|
||||
the detected formats to some smaller scope.
|
||||
"""
|
||||
|
||||
def __init__(self, source, fmt):
|
||||
def __init__(self, source, expected_format=None, allowed_formats=None):
|
||||
self._source = source
|
||||
self._format = fmt
|
||||
self._error = False
|
||||
self._expected_format = expected_format
|
||||
self._errored_inspectors = set()
|
||||
self._inspectors = {v() for k, v in ALL_FORMATS.items()
|
||||
if not allowed_formats or k in allowed_formats}
|
||||
self._finished = False
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def _process_chunk(self, chunk):
|
||||
if not self._error:
|
||||
for inspector in [i for i in self._inspectors
|
||||
if i not in self._errored_inspectors]:
|
||||
try:
|
||||
self._format.eat_chunk(chunk)
|
||||
inspector.eat_chunk(chunk)
|
||||
except Exception as e:
|
||||
if inspector.NAME == self._expected_format:
|
||||
# If our desired inspector has failed, we cannot continue
|
||||
raise
|
||||
# Absolutely do not allow the format inspector to break
|
||||
# our streaming of the image. If we failed, just stop
|
||||
# trying, log and keep going.
|
||||
LOG.error('Format inspector failed, aborting: %s', e)
|
||||
self._error = True
|
||||
# our streaming of the image for non-expected formats. If we
|
||||
# failed, just stop trying, log and keep going.
|
||||
LOG.debug('Format inspector failed, aborting: %s', e)
|
||||
self._errored_inspectors.add(inspector)
|
||||
|
||||
def __next__(self):
|
||||
try:
|
||||
chunk = next(self._source)
|
||||
except StopIteration:
|
||||
self._format.finish()
|
||||
self._finish()
|
||||
raise
|
||||
self._process_chunk(chunk)
|
||||
return chunk
|
||||
@ -1297,10 +1316,54 @@ class InfoWrapper(object):
|
||||
self._process_chunk(chunk)
|
||||
return chunk
|
||||
|
||||
def _finish(self):
|
||||
for inspector in self._inspectors:
|
||||
inspector.finish()
|
||||
self._finished = True
|
||||
|
||||
def close(self):
|
||||
if hasattr(self._source, 'close'):
|
||||
self._source.close()
|
||||
self._format.finish()
|
||||
self._finish()
|
||||
|
||||
@property
|
||||
def format(self):
|
||||
"""The format determined from the content.
|
||||
|
||||
If this is None, a decision has not been reached. Otherwise,
|
||||
it is a FileInspector that matches (which may be RawFileInspector
|
||||
if no other formats matched and enough of the stream has been read
|
||||
to make that determination). If more than one format matched, then
|
||||
ImageFormatError is raised. If the allowed_formats was constrained
|
||||
and raw was not included, then this will raise ImageFormatError to
|
||||
indicate that no suitable match was found.
|
||||
"""
|
||||
non_raw = set([i for i in self._inspectors if i.NAME != 'raw'])
|
||||
complete = all([i.complete for i in non_raw])
|
||||
matches = [i for i in non_raw if i.format_match]
|
||||
if not complete and not self._finished:
|
||||
# We do not know what our format is if we're still in the process
|
||||
# of reading the stream and have incomplete inspectors. However,
|
||||
# if EOF has been signaled, then we can assume the incomplete ones
|
||||
# are not matches.
|
||||
return None
|
||||
if len(matches) > 1:
|
||||
# Multiple format matches mean that not only can we not return a
|
||||
# decision here, but also means that there may be something
|
||||
# nefarious going on (i.e. hiding one header in another).
|
||||
raise ImageFormatError('Multiple formats detected: %s' % ','.join(
|
||||
str(i) for i in matches))
|
||||
if not matches:
|
||||
try:
|
||||
# If nothing *specific* matched, we return the raw format to
|
||||
# indicate that we do not recognize this content at all.
|
||||
return [x for x in self._inspectors if str(x) == 'raw'][0]
|
||||
except IndexError:
|
||||
raise ImageFormatError(
|
||||
'Content does not match any allowed format')
|
||||
|
||||
# The expected outcome of this is a single match of something specific
|
||||
return matches[0]
|
||||
|
||||
|
||||
ALL_FORMATS = {
|
||||
@ -1337,36 +1400,12 @@ def detect_file_format(filename):
|
||||
:returns: A FormatInspector instance matching the file.
|
||||
:raises: ImageFormatError if multiple formats are detected.
|
||||
"""
|
||||
inspectors = {k: v() for k, v in ALL_FORMATS.items()}
|
||||
detections = []
|
||||
with open(filename, 'rb') as f:
|
||||
for chunk in _chunked_reader(f, 4096):
|
||||
for format, inspector in list(inspectors.items()):
|
||||
try:
|
||||
inspector.eat_chunk(chunk)
|
||||
except ImageFormatError:
|
||||
# No match, so stop considering this format
|
||||
inspectors.pop(format)
|
||||
continue
|
||||
if (inspector.format_match and inspector.complete and
|
||||
format != 'raw'):
|
||||
# record all match (other than raw)
|
||||
detections.append(inspector)
|
||||
inspectors.pop(format)
|
||||
if all(i.complete for i in inspectors.values()):
|
||||
# If all the inspectors are sure they are not a match, avoid
|
||||
# reading to the end of the file to settle on 'raw'.
|
||||
break
|
||||
|
||||
for format, inspector in list(inspectors.items()):
|
||||
inspector.finish()
|
||||
if inspector.format_match and inspector.complete and format != 'raw':
|
||||
detections.append(inspector)
|
||||
inspectors.pop(format)
|
||||
|
||||
if len(detections) > 1:
|
||||
all_formats = [str(inspector) for inspector in detections]
|
||||
raise ImageFormatError(
|
||||
'Multiple formats detected: %s' % ', '.join(all_formats))
|
||||
|
||||
return inspectors['raw'] if not detections else detections[0]
|
||||
wrapper = InspectWrapper(f)
|
||||
try:
|
||||
for _chunk in _chunked_reader(wrapper, 4096):
|
||||
if wrapper.format:
|
||||
return wrapper.format
|
||||
finally:
|
||||
wrapper.close()
|
||||
return wrapper.format
|
||||
|
@ -15,7 +15,6 @@
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import subprocess
|
||||
import tempfile
|
||||
@ -25,21 +24,19 @@ import ddt
|
||||
from oslo_utils import units
|
||||
|
||||
from oslo_utils.imageutils import format_inspector
|
||||
from oslo_utils.imageutils import QemuImgInfo
|
||||
from oslotest import base as test_base
|
||||
|
||||
|
||||
TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
|
||||
|
||||
|
||||
def get_size_from_qemu_img(filename):
|
||||
output = subprocess.check_output('qemu-img info "%s"' % filename,
|
||||
shell=True)
|
||||
for line in output.split(b'\n'):
|
||||
m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip())
|
||||
if m:
|
||||
return int(m.group(1))
|
||||
|
||||
raise Exception('Could not find virtual size with qemu-img')
|
||||
def get_size_format_from_qemu_img(filename):
|
||||
output = subprocess.check_output(
|
||||
'qemu-img info --output=json "%s"' % filename,
|
||||
shell=True)
|
||||
info = QemuImgInfo(output, format='json')
|
||||
return info.virtual_size, info.file_format
|
||||
|
||||
|
||||
@ddt.ddt
|
||||
@ -217,11 +214,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
return fn
|
||||
|
||||
def _test_format_at_block_size(self, format_name, img, block_size):
|
||||
fmt = format_inspector.get_inspector(format_name)()
|
||||
self.assertIsNotNone(fmt,
|
||||
'Did not get format inspector for %s' % (
|
||||
format_name))
|
||||
wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
|
||||
wrapper = format_inspector.InspectWrapper(open(img, 'rb'),
|
||||
format_name)
|
||||
|
||||
while True:
|
||||
chunk = wrapper.read(block_size)
|
||||
@ -229,7 +223,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
break
|
||||
|
||||
wrapper.close()
|
||||
return fmt
|
||||
self.assertIsNotNone(wrapper.format, 'Failed to detect format')
|
||||
return wrapper.format
|
||||
|
||||
def _test_format_at_image_size(self, format_name, image_size,
|
||||
subformat=None, safety_check=False):
|
||||
@ -244,7 +239,7 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
|
||||
# Some formats have internal alignment restrictions making this not
|
||||
# always exactly like image_size, so get the real value for comparison
|
||||
virtual_size = get_size_from_qemu_img(img)
|
||||
virtual_size, _ = get_size_format_from_qemu_img(img)
|
||||
|
||||
# Read the format in various sizes, some of which will read whole
|
||||
# sections in a single read, others will be completely unaligned, etc.
|
||||
@ -326,25 +321,18 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
return qcow, iso, fn
|
||||
|
||||
def test_bad_iso_qcow2(self):
|
||||
|
||||
# Test that an iso with a qcow2 header in the system area will be
|
||||
# rejected because it matches more than one format (iso and qcow2).
|
||||
# This is an important case because qemu-img does not support iso,
|
||||
# and can be fooled into thinking one is a qcow2 by putting the header
|
||||
# for one in ISO9660's "system area", which is technically a valid
|
||||
# thing to do.
|
||||
_, _, fn = self._generate_bad_iso()
|
||||
|
||||
iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki)
|
||||
qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki)
|
||||
# this system area of the ISO file is not considered part of the format
|
||||
# the qcow2 header is in the system area of the ISO file
|
||||
# so the ISO file is still valid
|
||||
self.assertTrue(iso_check.format_match)
|
||||
# the qcow2 header is in the system area of the ISO file
|
||||
# but that will be parsed by the qcow2 format inspector
|
||||
# and it will match
|
||||
self.assertTrue(qcow_check.format_match)
|
||||
# if we call format_inspector.detect_file_format it should detect
|
||||
# and raise an exception because both match internally.
|
||||
e = self.assertRaises(
|
||||
format_inspector.ImageFormatError,
|
||||
format_inspector.detect_file_format, fn)
|
||||
self.assertIn('Multiple formats detected', str(e))
|
||||
self.assertRaisesRegex(format_inspector.ImageFormatError,
|
||||
'Multiple formats detected',
|
||||
self._test_format_at_block_size,
|
||||
'iso', fn, 4 * units.Ki)
|
||||
|
||||
def test_from_file_reads_minimum(self):
|
||||
img = self._create_img('qcow2', 10 * units.Mi)
|
||||
@ -387,14 +375,10 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
# Read the format in various sizes, some of which will read whole
|
||||
# sections in a single read, others will be completely unaligned, etc.
|
||||
for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
|
||||
fmt = self._test_format_at_block_size(format_name, img, block_size)
|
||||
self.assertTrue(fmt.format_match,
|
||||
'Failed to match %s at size %i block %i' % (
|
||||
format_name, image_size, block_size))
|
||||
self.assertEqual(0, fmt.virtual_size,
|
||||
('Calculated a virtual size for a corrupt %s at '
|
||||
'size %i block %i') % (format_name, image_size,
|
||||
block_size))
|
||||
self.assertRaisesRegex(format_inspector.ImageFormatError,
|
||||
'Wrong descriptor location',
|
||||
self._test_format_at_block_size,
|
||||
'vmdk', img, block_size)
|
||||
|
||||
def test_vmdk_bad_descriptor_offset(self):
|
||||
self._test_vmdk_bad_descriptor_offset()
|
||||
@ -559,46 +543,46 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
||||
def test_vdi(self):
|
||||
self._test_format('vdi')
|
||||
|
||||
def _test_format_with_invalid_data(self, format_name):
|
||||
fmt = format_inspector.get_inspector(format_name)()
|
||||
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
|
||||
def test_invalid_data(self):
|
||||
wrapper = format_inspector.InspectWrapper(open(__file__, 'rb'))
|
||||
while True:
|
||||
chunk = wrapper.read(32)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
wrapper.close()
|
||||
self.assertFalse(fmt.format_match)
|
||||
self.assertEqual(0, fmt.virtual_size)
|
||||
memory = sum(fmt.context_info.values())
|
||||
self.assertLess(memory, 512 * units.Ki,
|
||||
'Format used more than 512KiB of memory: %s' % (
|
||||
fmt.context_info))
|
||||
# Make sure this was not detected as any other format
|
||||
self.assertEqual('raw', str(wrapper.format))
|
||||
|
||||
def test_qcow2_invalid(self):
|
||||
self._test_format_with_invalid_data('qcow2')
|
||||
# Make sure that all of the other inspectors do not match and did not
|
||||
# use too much memory
|
||||
for fmt in wrapper._inspectors:
|
||||
if str(fmt) == 'raw':
|
||||
continue
|
||||
self.assertFalse(fmt.format_match)
|
||||
memory = sum(fmt.context_info.values())
|
||||
self.assertLess(memory, 512 * units.Ki,
|
||||
'Format used more than 512KiB of memory: %s' % (
|
||||
fmt.context_info))
|
||||
|
||||
def test_vhd_invalid(self):
|
||||
self._test_format_with_invalid_data('vhd')
|
||||
def test_invalid_data_without_raw(self):
|
||||
wrapper = format_inspector.InspectWrapper(
|
||||
open(__file__, 'rb'),
|
||||
allowed_formats=['qcow2', 'vmdk'])
|
||||
while True:
|
||||
chunk = wrapper.read(32)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
def test_vhdx_invalid(self):
|
||||
self._test_format_with_invalid_data('vhdx')
|
||||
|
||||
def test_vmdk_invalid(self):
|
||||
self._test_format_with_invalid_data('vmdk')
|
||||
|
||||
def test_vdi_invalid(self):
|
||||
self._test_format_with_invalid_data('vdi')
|
||||
wrapper.close()
|
||||
# Make sure this was not detected as any other format
|
||||
self.assertRaises(format_inspector.ImageFormatError,
|
||||
lambda: wrapper.format)
|
||||
|
||||
def test_vmdk_invalid_type(self):
|
||||
fmt = format_inspector.get_inspector('vmdk')()
|
||||
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
|
||||
while True:
|
||||
chunk = wrapper.read(32)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
wrapper.close()
|
||||
fmt = format_inspector.VMDKInspector()
|
||||
with open(__file__, 'rb') as f:
|
||||
fmt.eat_chunk(f.read())
|
||||
|
||||
fake_rgn = mock.MagicMock()
|
||||
fake_rgn.complete = True
|
||||
@ -941,8 +925,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
||||
|
||||
def _get_wrapper(self, data):
|
||||
source = io.BytesIO(data)
|
||||
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
|
||||
return format_inspector.InfoWrapper(source, fake_fmt)
|
||||
return format_inspector.InspectWrapper(source)
|
||||
|
||||
def test_info_wrapper_file_like(self):
|
||||
data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
|
||||
@ -967,9 +950,10 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
||||
|
||||
self.assertEqual(data, read_data)
|
||||
|
||||
def test_info_wrapper_file_like_eats_error(self):
|
||||
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
|
||||
def test_info_wrapper_file_like_eats_error(self, mock_eat):
|
||||
wrapper = self._get_wrapper(b'123456')
|
||||
wrapper._format.eat_chunk.side_effect = Exception('fail')
|
||||
mock_eat.side_effect = Exception('fail')
|
||||
|
||||
data = b''
|
||||
while True:
|
||||
@ -983,13 +967,12 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
||||
|
||||
# Make sure we only called this once and never again after
|
||||
# the error was raised
|
||||
wrapper._format.eat_chunk.assert_called_once_with(b'123')
|
||||
mock_eat.assert_called_once_with(b'123')
|
||||
|
||||
def test_info_wrapper_iter_like_eats_error(self):
|
||||
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
|
||||
wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']),
|
||||
fake_fmt)
|
||||
fake_fmt.eat_chunk.side_effect = Exception('fail')
|
||||
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
|
||||
def test_wrapper_iter_like_eats_error(self, mock_eat):
|
||||
wrapper = format_inspector.InspectWrapper(iter([b'123', b'456']))
|
||||
mock_eat.side_effect = Exception('fail')
|
||||
|
||||
data = b''
|
||||
for chunk in wrapper:
|
||||
@ -1000,7 +983,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
||||
|
||||
# Make sure we only called this once and never again after
|
||||
# the error was raised
|
||||
fake_fmt.eat_chunk.assert_called_once_with(b'123')
|
||||
mock_eat.assert_called_once_with(b'123')
|
||||
|
||||
def test_get_inspector(self):
|
||||
self.assertEqual(format_inspector.QcowInspector,
|
||||
|
Loading…
x
Reference in New Issue
Block a user