Add stream-based detection
This effectively unifies the InfoWrapper and detect_file_format() behavior into a single approach that can work stream-based or with a file on disk. Note that the wrapper name is also changed, as "InfoWrapper" was named for Glance's intended use-case, which was metadata extraction. Since this has obviously grown in scope, now is the time to change that name.
Change-Id: Id61f7472f791fc258ec7d0238568c379a7b27823
This commit is contained in:
parent
a0481d5a61
commit
91af49beed
@ -1256,38 +1256,57 @@ class GPTInspector(FileInspector):
|
|||||||
raise SafetyViolation('GPT MBR has no partitions defined')
|
raise SafetyViolation('GPT MBR has no partitions defined')
|
||||||
|
|
||||||
|
|
||||||
class InfoWrapper(object):
|
class InspectWrapper:
|
||||||
"""A file-like object that wraps another and updates a format inspector.
|
"""A file-like object that wraps another and detects the format.
|
||||||
|
|
||||||
This passes chunks to the format inspector while reading. If the inspector
|
This passes chunks to a group of format inspectors (default: all)
|
||||||
fails, it logs the error and stops calling it, but continues proxying data
|
while reading. After the stream is finished (or enough has been read to
|
||||||
from the source to its user.
|
make a confident decision), the format attribute will provide the
|
||||||
|
inspector object that matched.
|
||||||
|
|
||||||
|
:param source: The file-like input stream to wrap
|
||||||
|
:param expected_format: The format name anticipated to match, if any.
|
||||||
|
If set to a format name, reading of the stream will
|
||||||
|
be interrupted if the matching inspector raises
|
||||||
|
an error (indicating a mismatch or any other
|
||||||
|
problem). This allows the caller to abort before
|
||||||
|
all data is processed.
|
||||||
|
:param allowed_formats: A list of format names that limits the inspector
|
||||||
|
objects that will be used. This may be a security
|
||||||
|
hole if used improperly, but may be used to limit
|
||||||
|
the detected formats to some smaller scope.
|
||||||
"""
|
"""
|
||||||
|
def __init__(self, source, expected_format=None, allowed_formats=None):
|
||||||
def __init__(self, source, fmt):
|
|
||||||
self._source = source
|
self._source = source
|
||||||
self._format = fmt
|
self._expected_format = expected_format
|
||||||
self._error = False
|
self._errored_inspectors = set()
|
||||||
|
self._inspectors = {v() for k, v in ALL_FORMATS.items()
|
||||||
|
if not allowed_formats or k in allowed_formats}
|
||||||
|
self._finished = False
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _process_chunk(self, chunk):
|
def _process_chunk(self, chunk):
|
||||||
if not self._error:
|
for inspector in [i for i in self._inspectors
|
||||||
|
if i not in self._errored_inspectors]:
|
||||||
try:
|
try:
|
||||||
self._format.eat_chunk(chunk)
|
inspector.eat_chunk(chunk)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if inspector.NAME == self._expected_format:
|
||||||
|
# If our desired inspector has failed, we cannot continue
|
||||||
|
raise
|
||||||
# Absolutely do not allow the format inspector to break
|
# Absolutely do not allow the format inspector to break
|
||||||
# our streaming of the image. If we failed, just stop
|
# our streaming of the image for non-expected formats. If we
|
||||||
# trying, log and keep going.
|
# failed, just stop trying, log and keep going.
|
||||||
LOG.error('Format inspector failed, aborting: %s', e)
|
LOG.debug('Format inspector failed, aborting: %s', e)
|
||||||
self._error = True
|
self._errored_inspectors.add(inspector)
|
||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
try:
|
try:
|
||||||
chunk = next(self._source)
|
chunk = next(self._source)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
self._format.finish()
|
self._finish()
|
||||||
raise
|
raise
|
||||||
self._process_chunk(chunk)
|
self._process_chunk(chunk)
|
||||||
return chunk
|
return chunk
|
||||||
@ -1297,10 +1316,54 @@ class InfoWrapper(object):
|
|||||||
self._process_chunk(chunk)
|
self._process_chunk(chunk)
|
||||||
return chunk
|
return chunk
|
||||||
|
|
||||||
|
def _finish(self):
|
||||||
|
for inspector in self._inspectors:
|
||||||
|
inspector.finish()
|
||||||
|
self._finished = True
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if hasattr(self._source, 'close'):
|
if hasattr(self._source, 'close'):
|
||||||
self._source.close()
|
self._source.close()
|
||||||
self._format.finish()
|
self._finish()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def format(self):
|
||||||
|
"""The format determined from the content.
|
||||||
|
|
||||||
|
If this is None, a decision has not been reached. Otherwise,
|
||||||
|
it is a FileInspector that matches (which may be RawFileInspector
|
||||||
|
if no other formats matched and enough of the stream has been read
|
||||||
|
to make that determination). If more than one format matched, then
|
||||||
|
ImageFormatError is raised. If the allowed_formats was constrained
|
||||||
|
and raw was not included, then this will raise ImageFormatError to
|
||||||
|
indicate that no suitable match was found.
|
||||||
|
"""
|
||||||
|
non_raw = set([i for i in self._inspectors if i.NAME != 'raw'])
|
||||||
|
complete = all([i.complete for i in non_raw])
|
||||||
|
matches = [i for i in non_raw if i.format_match]
|
||||||
|
if not complete and not self._finished:
|
||||||
|
# We do not know what our format is if we're still in the process
|
||||||
|
# of reading the stream and have incomplete inspectors. However,
|
||||||
|
# if EOF has been signaled, then we can assume the incomplete ones
|
||||||
|
# are not matches.
|
||||||
|
return None
|
||||||
|
if len(matches) > 1:
|
||||||
|
# Multiple format matches mean that not only can we not return a
|
||||||
|
# decision here, but also that there may be something
|
||||||
|
# nefarious going on (i.e. hiding one header in another).
|
||||||
|
raise ImageFormatError('Multiple formats detected: %s' % ','.join(
|
||||||
|
str(i) for i in matches))
|
||||||
|
if not matches:
|
||||||
|
try:
|
||||||
|
# If nothing *specific* matched, we return the raw format to
|
||||||
|
# indicate that we do not recognize this content at all.
|
||||||
|
return [x for x in self._inspectors if str(x) == 'raw'][0]
|
||||||
|
except IndexError:
|
||||||
|
raise ImageFormatError(
|
||||||
|
'Content does not match any allowed format')
|
||||||
|
|
||||||
|
# The expected outcome of this is a single match of something specific
|
||||||
|
return matches[0]
|
||||||
|
|
||||||
|
|
||||||
ALL_FORMATS = {
|
ALL_FORMATS = {
|
||||||
@ -1337,36 +1400,12 @@ def detect_file_format(filename):
|
|||||||
:returns: A FormatInspector instance matching the file.
|
:returns: A FormatInspector instance matching the file.
|
||||||
:raises: ImageFormatError if multiple formats are detected.
|
:raises: ImageFormatError if multiple formats are detected.
|
||||||
"""
|
"""
|
||||||
inspectors = {k: v() for k, v in ALL_FORMATS.items()}
|
|
||||||
detections = []
|
|
||||||
with open(filename, 'rb') as f:
|
with open(filename, 'rb') as f:
|
||||||
for chunk in _chunked_reader(f, 4096):
|
wrapper = InspectWrapper(f)
|
||||||
for format, inspector in list(inspectors.items()):
|
try:
|
||||||
try:
|
for _chunk in _chunked_reader(wrapper, 4096):
|
||||||
inspector.eat_chunk(chunk)
|
if wrapper.format:
|
||||||
except ImageFormatError:
|
return wrapper.format
|
||||||
# No match, so stop considering this format
|
finally:
|
||||||
inspectors.pop(format)
|
wrapper.close()
|
||||||
continue
|
return wrapper.format
|
||||||
if (inspector.format_match and inspector.complete and
|
|
||||||
format != 'raw'):
|
|
||||||
# record all match (other than raw)
|
|
||||||
detections.append(inspector)
|
|
||||||
inspectors.pop(format)
|
|
||||||
if all(i.complete for i in inspectors.values()):
|
|
||||||
# If all the inspectors are sure they are not a match, avoid
|
|
||||||
# reading to the end of the file to settle on 'raw'.
|
|
||||||
break
|
|
||||||
|
|
||||||
for format, inspector in list(inspectors.items()):
|
|
||||||
inspector.finish()
|
|
||||||
if inspector.format_match and inspector.complete and format != 'raw':
|
|
||||||
detections.append(inspector)
|
|
||||||
inspectors.pop(format)
|
|
||||||
|
|
||||||
if len(detections) > 1:
|
|
||||||
all_formats = [str(inspector) for inspector in detections]
|
|
||||||
raise ImageFormatError(
|
|
||||||
'Multiple formats detected: %s' % ', '.join(all_formats))
|
|
||||||
|
|
||||||
return inspectors['raw'] if not detections else detections[0]
|
|
||||||
|
@ -15,7 +15,6 @@
|
|||||||
|
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import struct
|
import struct
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -25,21 +24,19 @@ import ddt
|
|||||||
from oslo_utils import units
|
from oslo_utils import units
|
||||||
|
|
||||||
from oslo_utils.imageutils import format_inspector
|
from oslo_utils.imageutils import format_inspector
|
||||||
|
from oslo_utils.imageutils import QemuImgInfo
|
||||||
from oslotest import base as test_base
|
from oslotest import base as test_base
|
||||||
|
|
||||||
|
|
||||||
TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
|
TEST_IMAGE_PREFIX = 'oslo-unittest-formatinspector-'
|
||||||
|
|
||||||
|
|
||||||
def get_size_from_qemu_img(filename):
|
def get_size_format_from_qemu_img(filename):
|
||||||
output = subprocess.check_output('qemu-img info "%s"' % filename,
|
output = subprocess.check_output(
|
||||||
shell=True)
|
'qemu-img info --output=json "%s"' % filename,
|
||||||
for line in output.split(b'\n'):
|
shell=True)
|
||||||
m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip())
|
info = QemuImgInfo(output, format='json')
|
||||||
if m:
|
return info.virtual_size, info.file_format
|
||||||
return int(m.group(1))
|
|
||||||
|
|
||||||
raise Exception('Could not find virtual size with qemu-img')
|
|
||||||
|
|
||||||
|
|
||||||
@ddt.ddt
|
@ddt.ddt
|
||||||
@ -217,11 +214,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
return fn
|
return fn
|
||||||
|
|
||||||
def _test_format_at_block_size(self, format_name, img, block_size):
|
def _test_format_at_block_size(self, format_name, img, block_size):
|
||||||
fmt = format_inspector.get_inspector(format_name)()
|
wrapper = format_inspector.InspectWrapper(open(img, 'rb'),
|
||||||
self.assertIsNotNone(fmt,
|
format_name)
|
||||||
'Did not get format inspector for %s' % (
|
|
||||||
format_name))
|
|
||||||
wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
chunk = wrapper.read(block_size)
|
chunk = wrapper.read(block_size)
|
||||||
@ -229,7 +223,8 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
break
|
break
|
||||||
|
|
||||||
wrapper.close()
|
wrapper.close()
|
||||||
return fmt
|
self.assertIsNotNone(wrapper.format, 'Failed to detect format')
|
||||||
|
return wrapper.format
|
||||||
|
|
||||||
def _test_format_at_image_size(self, format_name, image_size,
|
def _test_format_at_image_size(self, format_name, image_size,
|
||||||
subformat=None, safety_check=False):
|
subformat=None, safety_check=False):
|
||||||
@ -244,7 +239,7 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
|
|
||||||
# Some formats have internal alignment restrictions making this not
|
# Some formats have internal alignment restrictions making this not
|
||||||
# always exactly like image_size, so get the real value for comparison
|
# always exactly like image_size, so get the real value for comparison
|
||||||
virtual_size = get_size_from_qemu_img(img)
|
virtual_size, _ = get_size_format_from_qemu_img(img)
|
||||||
|
|
||||||
# Read the format in various sizes, some of which will read whole
|
# Read the format in various sizes, some of which will read whole
|
||||||
# sections in a single read, others will be completely unaligned, etc.
|
# sections in a single read, others will be completely unaligned, etc.
|
||||||
@ -326,25 +321,18 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
return qcow, iso, fn
|
return qcow, iso, fn
|
||||||
|
|
||||||
def test_bad_iso_qcow2(self):
|
def test_bad_iso_qcow2(self):
|
||||||
|
# Test that an iso with a qcow2 header in the system area will be
|
||||||
|
# rejected because it matches more than one format (iso and qcow2).
|
||||||
|
# This is an important case because qemu-img does not support iso,
|
||||||
|
# and can be fooled into thinking one is a qcow2 by putting the header
|
||||||
|
# for one in ISO9660's "system area", which is technically a valid
|
||||||
|
# thing to do.
|
||||||
_, _, fn = self._generate_bad_iso()
|
_, _, fn = self._generate_bad_iso()
|
||||||
|
|
||||||
iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki)
|
self.assertRaisesRegex(format_inspector.ImageFormatError,
|
||||||
qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki)
|
'Multiple formats detected',
|
||||||
# this system area of the ISO file is not considered part of the format
|
self._test_format_at_block_size,
|
||||||
# the qcow2 header is in the system area of the ISO file
|
'iso', fn, 4 * units.Ki)
|
||||||
# so the ISO file is still valid
|
|
||||||
self.assertTrue(iso_check.format_match)
|
|
||||||
# the qcow2 header is in the system area of the ISO file
|
|
||||||
# but that will be parsed by the qcow2 format inspector
|
|
||||||
# and it will match
|
|
||||||
self.assertTrue(qcow_check.format_match)
|
|
||||||
# if we call format_inspector.detect_file_format it should detect
|
|
||||||
# and raise an exception because both match internally.
|
|
||||||
e = self.assertRaises(
|
|
||||||
format_inspector.ImageFormatError,
|
|
||||||
format_inspector.detect_file_format, fn)
|
|
||||||
self.assertIn('Multiple formats detected', str(e))
|
|
||||||
|
|
||||||
def test_from_file_reads_minimum(self):
|
def test_from_file_reads_minimum(self):
|
||||||
img = self._create_img('qcow2', 10 * units.Mi)
|
img = self._create_img('qcow2', 10 * units.Mi)
|
||||||
@ -387,14 +375,10 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
# Read the format in various sizes, some of which will read whole
|
# Read the format in various sizes, some of which will read whole
|
||||||
# sections in a single read, others will be completely unaligned, etc.
|
# sections in a single read, others will be completely unaligned, etc.
|
||||||
for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
|
for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
|
||||||
fmt = self._test_format_at_block_size(format_name, img, block_size)
|
self.assertRaisesRegex(format_inspector.ImageFormatError,
|
||||||
self.assertTrue(fmt.format_match,
|
'Wrong descriptor location',
|
||||||
'Failed to match %s at size %i block %i' % (
|
self._test_format_at_block_size,
|
||||||
format_name, image_size, block_size))
|
'vmdk', img, block_size)
|
||||||
self.assertEqual(0, fmt.virtual_size,
|
|
||||||
('Calculated a virtual size for a corrupt %s at '
|
|
||||||
'size %i block %i') % (format_name, image_size,
|
|
||||||
block_size))
|
|
||||||
|
|
||||||
def test_vmdk_bad_descriptor_offset(self):
|
def test_vmdk_bad_descriptor_offset(self):
|
||||||
self._test_vmdk_bad_descriptor_offset()
|
self._test_vmdk_bad_descriptor_offset()
|
||||||
@ -559,46 +543,46 @@ class TestFormatInspectors(test_base.BaseTestCase):
|
|||||||
def test_vdi(self):
|
def test_vdi(self):
|
||||||
self._test_format('vdi')
|
self._test_format('vdi')
|
||||||
|
|
||||||
def _test_format_with_invalid_data(self, format_name):
|
def test_invalid_data(self):
|
||||||
fmt = format_inspector.get_inspector(format_name)()
|
wrapper = format_inspector.InspectWrapper(open(__file__, 'rb'))
|
||||||
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
|
|
||||||
while True:
|
while True:
|
||||||
chunk = wrapper.read(32)
|
chunk = wrapper.read(32)
|
||||||
if not chunk:
|
if not chunk:
|
||||||
break
|
break
|
||||||
|
|
||||||
wrapper.close()
|
wrapper.close()
|
||||||
self.assertFalse(fmt.format_match)
|
# Make sure this was not detected as any other format
|
||||||
self.assertEqual(0, fmt.virtual_size)
|
self.assertEqual('raw', str(wrapper.format))
|
||||||
memory = sum(fmt.context_info.values())
|
|
||||||
self.assertLess(memory, 512 * units.Ki,
|
|
||||||
'Format used more than 512KiB of memory: %s' % (
|
|
||||||
fmt.context_info))
|
|
||||||
|
|
||||||
def test_qcow2_invalid(self):
|
# Make sure that all of the other inspectors do not match and did not
|
||||||
self._test_format_with_invalid_data('qcow2')
|
# use too much memory
|
||||||
|
for fmt in wrapper._inspectors:
|
||||||
|
if str(fmt) == 'raw':
|
||||||
|
continue
|
||||||
|
self.assertFalse(fmt.format_match)
|
||||||
|
memory = sum(fmt.context_info.values())
|
||||||
|
self.assertLess(memory, 512 * units.Ki,
|
||||||
|
'Format used more than 512KiB of memory: %s' % (
|
||||||
|
fmt.context_info))
|
||||||
|
|
||||||
def test_vhd_invalid(self):
|
def test_invalid_data_without_raw(self):
|
||||||
self._test_format_with_invalid_data('vhd')
|
wrapper = format_inspector.InspectWrapper(
|
||||||
|
open(__file__, 'rb'),
|
||||||
|
allowed_formats=['qcow2', 'vmdk'])
|
||||||
|
while True:
|
||||||
|
chunk = wrapper.read(32)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
|
||||||
def test_vhdx_invalid(self):
|
wrapper.close()
|
||||||
self._test_format_with_invalid_data('vhdx')
|
# Make sure this was not detected as any other format
|
||||||
|
self.assertRaises(format_inspector.ImageFormatError,
|
||||||
def test_vmdk_invalid(self):
|
lambda: wrapper.format)
|
||||||
self._test_format_with_invalid_data('vmdk')
|
|
||||||
|
|
||||||
def test_vdi_invalid(self):
|
|
||||||
self._test_format_with_invalid_data('vdi')
|
|
||||||
|
|
||||||
def test_vmdk_invalid_type(self):
|
def test_vmdk_invalid_type(self):
|
||||||
fmt = format_inspector.get_inspector('vmdk')()
|
fmt = format_inspector.VMDKInspector()
|
||||||
wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
|
with open(__file__, 'rb') as f:
|
||||||
while True:
|
fmt.eat_chunk(f.read())
|
||||||
chunk = wrapper.read(32)
|
|
||||||
if not chunk:
|
|
||||||
break
|
|
||||||
|
|
||||||
wrapper.close()
|
|
||||||
|
|
||||||
fake_rgn = mock.MagicMock()
|
fake_rgn = mock.MagicMock()
|
||||||
fake_rgn.complete = True
|
fake_rgn.complete = True
|
||||||
@ -941,8 +925,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
|||||||
|
|
||||||
def _get_wrapper(self, data):
|
def _get_wrapper(self, data):
|
||||||
source = io.BytesIO(data)
|
source = io.BytesIO(data)
|
||||||
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
|
return format_inspector.InspectWrapper(source)
|
||||||
return format_inspector.InfoWrapper(source, fake_fmt)
|
|
||||||
|
|
||||||
def test_info_wrapper_file_like(self):
|
def test_info_wrapper_file_like(self):
|
||||||
data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
|
data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
|
||||||
@ -967,9 +950,10 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
|||||||
|
|
||||||
self.assertEqual(data, read_data)
|
self.assertEqual(data, read_data)
|
||||||
|
|
||||||
def test_info_wrapper_file_like_eats_error(self):
|
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
|
||||||
|
def test_info_wrapper_file_like_eats_error(self, mock_eat):
|
||||||
wrapper = self._get_wrapper(b'123456')
|
wrapper = self._get_wrapper(b'123456')
|
||||||
wrapper._format.eat_chunk.side_effect = Exception('fail')
|
mock_eat.side_effect = Exception('fail')
|
||||||
|
|
||||||
data = b''
|
data = b''
|
||||||
while True:
|
while True:
|
||||||
@ -983,13 +967,12 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
|||||||
|
|
||||||
# Make sure we only called this once and never again after
|
# Make sure we only called this once and never again after
|
||||||
# the error was raised
|
# the error was raised
|
||||||
wrapper._format.eat_chunk.assert_called_once_with(b'123')
|
mock_eat.assert_called_once_with(b'123')
|
||||||
|
|
||||||
def test_info_wrapper_iter_like_eats_error(self):
|
@mock.patch.object(format_inspector.VMDKInspector, 'eat_chunk')
|
||||||
fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
|
def test_wrapper_iter_like_eats_error(self, mock_eat):
|
||||||
wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']),
|
wrapper = format_inspector.InspectWrapper(iter([b'123', b'456']))
|
||||||
fake_fmt)
|
mock_eat.side_effect = Exception('fail')
|
||||||
fake_fmt.eat_chunk.side_effect = Exception('fail')
|
|
||||||
|
|
||||||
data = b''
|
data = b''
|
||||||
for chunk in wrapper:
|
for chunk in wrapper:
|
||||||
@ -1000,7 +983,7 @@ class TestFormatInspectorInfra(test_base.BaseTestCase):
|
|||||||
|
|
||||||
# Make sure we only called this once and never again after
|
# Make sure we only called this once and never again after
|
||||||
# the error was raised
|
# the error was raised
|
||||||
fake_fmt.eat_chunk.assert_called_once_with(b'123')
|
mock_eat.assert_called_once_with(b'123')
|
||||||
|
|
||||||
def test_get_inspector(self):
|
def test_get_inspector(self):
|
||||||
self.assertEqual(format_inspector.QcowInspector,
|
self.assertEqual(format_inspector.QcowInspector,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user