Fix infinite loop for temp file renames on ENOENT
For whatever reason, it appears that GlusterFS, or perhaps FUSE can continuously return ENOENT on a rename system call even when we have double checked that there is no reason to do so. That is a bug for that sub system. However, our response to that bug can result in an infinite loop, which is bad. This code reduces that to 10 attempts. In addition, we restructed the open retry loop to match, providing module constants for the upper bounds of both retry loops. BUG: XXXXXX (https://bugzilla.redhat.com/show_bug.cgi?id=XXXXXX) Change-Id: Ia2d6dd427daba3ea0461863c5ffe3aef27c88f9b Signed-off-by: Peter Portante <peter.portante@redhat.com> Reviewed-on: http://review.gluster.org/5670 Reviewed-by: Luis Pabon <lpabon@redhat.com> Tested-by: Luis Pabon <lpabon@redhat.com>
This commit is contained in:
parent
b00f10f5ca
commit
4735980723
@ -50,6 +50,9 @@ DEFAULT_BYTES_PER_SYNC = (512 * 1024 * 1024)
|
||||
# keep these lower-case
|
||||
DISALLOWED_HEADERS = set('content-length content-type deleted etag'.split())
|
||||
|
||||
MAX_RENAME_ATTEMPTS = 10
|
||||
MAX_OPEN_ATTEMPTS = 10
|
||||
|
||||
|
||||
def _random_sleep():
|
||||
sleep(random.uniform(0.5, 0.15))
|
||||
@ -233,15 +236,9 @@ if _fs_conf.read(os.path.join('/etc/swift', 'fs.conf')):
|
||||
in TRUE_VALUES
|
||||
except (NoSectionError, NoOptionError):
|
||||
_use_put_mount = False
|
||||
try:
|
||||
_relaxed_writes = _fs_conf.get('DEFAULT', 'relaxed_writes', "no") \
|
||||
in TRUE_VALUES
|
||||
except (NoSectionError, NoOptionError):
|
||||
_relaxed_writes = False
|
||||
else:
|
||||
_mkdir_locking = False
|
||||
_use_put_mount = False
|
||||
_relaxed_writes = False
|
||||
|
||||
if _mkdir_locking:
|
||||
make_directory = _make_directory_locked
|
||||
@ -324,27 +321,28 @@ class DiskWriter(SwiftDiskWriter):
|
||||
# disk.
|
||||
write_metadata(self.fd, metadata)
|
||||
|
||||
if not _relaxed_writes:
|
||||
# We call fsync() before calling drop_cache() to lower the
|
||||
# amount of redundant work the drop cache code will perform on
|
||||
# the pages (now that after fsync the pages will be all
|
||||
# clean).
|
||||
do_fsync(self.fd)
|
||||
# From the Department of the Redundancy Department, make sure
|
||||
# we call drop_cache() after fsync() to avoid redundant work
|
||||
# (pages all clean).
|
||||
drop_buffer_cache(self.fd, 0, self.upload_size)
|
||||
# We call fsync() before calling drop_cache() to lower the
|
||||
# amount of redundant work the drop cache code will perform on
|
||||
# the pages (now that after fsync the pages will be all
|
||||
# clean).
|
||||
do_fsync(self.fd)
|
||||
# From the Department of the Redundancy Department, make sure
|
||||
# we call drop_cache() after fsync() to avoid redundant work
|
||||
# (pages all clean).
|
||||
drop_buffer_cache(self.fd, 0, self.upload_size)
|
||||
|
||||
# At this point we know that the object's full directory path
|
||||
# exists, so we can just rename it directly without using Swift's
|
||||
# swift.common.utils.renamer(), which makes the directory path and
|
||||
# adds extra stat() calls.
|
||||
data_file = os.path.join(df.put_datadir, df._obj)
|
||||
attempts = 1
|
||||
while True:
|
||||
try:
|
||||
os.rename(self.tmppath, data_file)
|
||||
except OSError as err:
|
||||
if err.errno in (errno.ENOENT, errno.EIO):
|
||||
if err.errno in (errno.ENOENT, errno.EIO) \
|
||||
and attempts < MAX_RENAME_ATTEMPTS:
|
||||
# FIXME: Why either of these two error conditions is
|
||||
# happening is unknown at this point. This might be a
|
||||
# FUSE issue of some sort or a possible race
|
||||
@ -368,7 +366,7 @@ class DiskWriter(SwiftDiskWriter):
|
||||
self.tmppath, data_file))
|
||||
else:
|
||||
# Data file target name now has a bad path!
|
||||
dfstats = do_stat(self.put_datadir)
|
||||
dfstats = do_stat(df.put_datadir)
|
||||
if not dfstats:
|
||||
raise DiskFileError(
|
||||
'DiskFile.put(): path to object, %s, no'
|
||||
@ -393,6 +391,7 @@ class DiskWriter(SwiftDiskWriter):
|
||||
self.tmppath, data_file,
|
||||
str(err), df.put_datadir,
|
||||
dfstats))
|
||||
attempts += 1
|
||||
continue
|
||||
else:
|
||||
raise GlusterFileSystemOSError(
|
||||
@ -557,8 +556,7 @@ class DiskFile(SwiftDiskFile):
|
||||
see if the directory object already exists, that is left to the caller
|
||||
(this avoids a potentially duplicate stat() system call).
|
||||
|
||||
The "dir_path" must be relative to its container,
|
||||
self._container_path.
|
||||
The "dir_path" must be relative to its container, self._container_path.
|
||||
|
||||
The "metadata" object is an optional set of metadata to apply to the
|
||||
newly created directory object. If not present, no initial metadata is
|
||||
@ -624,7 +622,8 @@ class DiskFile(SwiftDiskFile):
|
||||
|
||||
# Assume the full directory path exists to the file already, and
|
||||
# construct the proper name for the temporary file.
|
||||
for i in range(0, 1000):
|
||||
attempts = 1
|
||||
while True:
|
||||
tmpfile = '.' + self._obj + '.' + md5(self._obj +
|
||||
str(random.random())).hexdigest()
|
||||
tmppath = os.path.join(self.put_datadir, tmpfile)
|
||||
@ -635,20 +634,29 @@ class DiskFile(SwiftDiskFile):
|
||||
if gerr.errno == errno.ENOSPC:
|
||||
# Raise DiskFileNoSpace to be handled by upper layers
|
||||
raise DiskFileNoSpace()
|
||||
if gerr.errno not in (errno.ENOENT, errno.EEXIST, errno.EIO):
|
||||
# FIXME: Other cases we should handle?
|
||||
raise
|
||||
if attempts >= MAX_OPEN_ATTEMPTS:
|
||||
# We failed after N attempts to create the temporary
|
||||
# file.
|
||||
raise DiskFileError('DiskFile.mkstemp(): failed to'
|
||||
' successfully create a temporary file'
|
||||
' without running into a name conflict'
|
||||
' after %d of %d attempts for: %s' % (
|
||||
attempts, MAX_OPEN_ATTEMPTS,
|
||||
data_file))
|
||||
if gerr.errno == errno.EEXIST:
|
||||
# Retry with a different random number.
|
||||
continue
|
||||
if gerr.errno == errno.EIO:
|
||||
attempts += 1
|
||||
elif gerr.errno == errno.EIO:
|
||||
# FIXME: Possible FUSE issue or race condition, let's
|
||||
# sleep on it and retry the operation.
|
||||
_random_sleep()
|
||||
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
|
||||
" 0.1 secs", gerr)
|
||||
continue
|
||||
if gerr.errno != errno.ENOENT:
|
||||
# FIXME: Other cases we should handle?
|
||||
raise
|
||||
if not self._obj_path:
|
||||
attempts += 1
|
||||
elif not self._obj_path:
|
||||
# No directory hierarchy and the create failed telling us
|
||||
# the container or volume directory does not exist. This
|
||||
# could be a FUSE issue or some race condition, so let's
|
||||
@ -656,26 +664,22 @@ class DiskFile(SwiftDiskFile):
|
||||
_random_sleep()
|
||||
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
|
||||
" 0.1 secs", gerr)
|
||||
continue
|
||||
if i != 0:
|
||||
attempts += 1
|
||||
elif attempts > 1:
|
||||
# Got ENOENT after previously making the path. This could
|
||||
# also be a FUSE issue or some race condition, nap and
|
||||
# retry.
|
||||
_random_sleep()
|
||||
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
|
||||
" 0.1 secs" % gerr)
|
||||
continue
|
||||
# It looks like the path to the object does not already exist
|
||||
self._create_dir_object(self._obj_path)
|
||||
continue
|
||||
attempts += 1
|
||||
else:
|
||||
# It looks like the path to the object does not already
|
||||
# exist; don't count this as an attempt, though, since
|
||||
# we perform the open() system call optimistically.
|
||||
self._create_dir_object(self._obj_path)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
# We failed after 1,000 attempts to create the temporary file.
|
||||
raise DiskFileError('DiskFile.mkstemp(): failed to successfully'
|
||||
' create a temporary file without running'
|
||||
' into a name conflict after 1,000 attempts'
|
||||
' for: %s' % (data_file,))
|
||||
dw = None
|
||||
try:
|
||||
# Ensure it is properly owned before we make it available.
|
||||
@ -769,8 +773,8 @@ class DiskFile(SwiftDiskFile):
|
||||
:raises DiskFileError: on file size mismatch.
|
||||
:raises DiskFileNotExist: on file not existing (including deleted)
|
||||
"""
|
||||
#Marker directory.
|
||||
if self._is_dir:
|
||||
# Directories have no size.
|
||||
return 0
|
||||
try:
|
||||
file_size = 0
|
||||
|
@ -25,14 +25,17 @@ import mock
|
||||
from mock import patch
|
||||
from hashlib import md5
|
||||
|
||||
import gluster.swift.common.utils
|
||||
import gluster.swift.obj.diskfile
|
||||
from swift.common.utils import normalize_timestamp
|
||||
from gluster.swift.obj.diskfile import DiskFile
|
||||
from swift.common.exceptions import DiskFileNotExist, DiskFileError, \
|
||||
DiskFileNoSpace
|
||||
|
||||
from gluster.swift.common.exceptions import GlusterFileSystemOSError
|
||||
import gluster.swift.common.utils
|
||||
import gluster.swift.obj.diskfile
|
||||
from gluster.swift.obj.diskfile import DiskFile
|
||||
from gluster.swift.common.utils import DEFAULT_UID, DEFAULT_GID, X_TYPE, \
|
||||
X_OBJECT_TYPE, DIR_OBJECT
|
||||
|
||||
from test.unit.common.test_utils import _initxattr, _destroyxattr
|
||||
from test.unit import FakeLogger
|
||||
|
||||
@ -565,7 +568,6 @@ class TestDiskFile(unittest.TestCase):
|
||||
finally:
|
||||
shutil.rmtree(td)
|
||||
|
||||
|
||||
def test_put_ENOSPC(self):
|
||||
td = tempfile.mkdtemp()
|
||||
the_cont = os.path.join(td, "vol0", "bar")
|
||||
@ -589,6 +591,7 @@ class TestDiskFile(unittest.TestCase):
|
||||
'ETag': etag,
|
||||
'Content-Length': '5',
|
||||
}
|
||||
|
||||
def mock_open(*args, **kwargs):
|
||||
raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC))
|
||||
|
||||
@ -605,6 +608,51 @@ class TestDiskFile(unittest.TestCase):
|
||||
finally:
|
||||
shutil.rmtree(td)
|
||||
|
||||
def test_put_rename_ENOENT(self):
|
||||
td = tempfile.mkdtemp()
|
||||
the_cont = os.path.join(td, "vol0", "bar")
|
||||
try:
|
||||
os.makedirs(the_cont)
|
||||
gdf = DiskFile(td, "vol0", "p57", "ufo47", "bar", "z", self.lg)
|
||||
assert gdf._obj == "z"
|
||||
assert gdf._obj_path == ""
|
||||
assert gdf.name == "bar"
|
||||
assert gdf.datadir == the_cont
|
||||
assert gdf.data_file is None
|
||||
|
||||
body = '1234\n'
|
||||
etag = md5()
|
||||
etag.update(body)
|
||||
etag = etag.hexdigest()
|
||||
metadata = {
|
||||
'X-Timestamp': '1234',
|
||||
'Content-Type': 'file',
|
||||
'ETag': etag,
|
||||
'Content-Length': '5',
|
||||
}
|
||||
|
||||
def mock_sleep(*args, **kwargs):
|
||||
# Return without sleep, no need to dely unit tests
|
||||
return
|
||||
|
||||
def mock_rename(*args, **kwargs):
|
||||
raise OSError(errno.ENOENT, os.strerror(errno.ENOENT))
|
||||
|
||||
with mock.patch("gluster.swift.obj.diskfile.sleep", mock_sleep):
|
||||
with mock.patch("os.rename", mock_rename):
|
||||
try:
|
||||
with gdf.writer() as dw:
|
||||
assert dw.tmppath is not None
|
||||
tmppath = dw.tmppath
|
||||
dw.write(body)
|
||||
dw.put(metadata)
|
||||
except GlusterFileSystemOSError:
|
||||
pass
|
||||
else:
|
||||
self.fail("Expected exception DiskFileError")
|
||||
finally:
|
||||
shutil.rmtree(td)
|
||||
|
||||
def test_put_obj_path(self):
|
||||
the_obj_path = os.path.join("b", "a")
|
||||
the_file = os.path.join(the_obj_path, "z")
|
||||
|
Loading…
x
Reference in New Issue
Block a user