Fix infinite loop for temp file renames on ENOENT

For whatever reason, it appears that GlusterFS, or perhaps FUSE can
continuously return ENOENT on a rename system call even when we have
double checked that there is no reason to do so. That is a bug for
that sub system. However, our response to that bug can result in an
infinite loop, which is bad.

This code reduces that to 10 attempts.

In addition, we restructed the open retry loop to match, providing
module constants for the upper bounds of both retry loops.

BUG: XXXXXX (https://bugzilla.redhat.com/show_bug.cgi?id=XXXXXX)

Change-Id: Ia2d6dd427daba3ea0461863c5ffe3aef27c88f9b
Signed-off-by: Peter Portante <peter.portante@redhat.com>
Reviewed-on: http://review.gluster.org/5670
Reviewed-by: Luis Pabon <lpabon@redhat.com>
Tested-by: Luis Pabon <lpabon@redhat.com>
This commit is contained in:
Peter Portante 2013-08-20 18:32:28 -04:00 committed by Luis Pabon
parent b00f10f5ca
commit 4735980723
2 changed files with 97 additions and 45 deletions

View File

@ -50,6 +50,9 @@ DEFAULT_BYTES_PER_SYNC = (512 * 1024 * 1024)
# keep these lower-case
DISALLOWED_HEADERS = set('content-length content-type deleted etag'.split())
MAX_RENAME_ATTEMPTS = 10
MAX_OPEN_ATTEMPTS = 10
def _random_sleep():
sleep(random.uniform(0.5, 0.15))
@ -233,15 +236,9 @@ if _fs_conf.read(os.path.join('/etc/swift', 'fs.conf')):
in TRUE_VALUES
except (NoSectionError, NoOptionError):
_use_put_mount = False
try:
_relaxed_writes = _fs_conf.get('DEFAULT', 'relaxed_writes', "no") \
in TRUE_VALUES
except (NoSectionError, NoOptionError):
_relaxed_writes = False
else:
_mkdir_locking = False
_use_put_mount = False
_relaxed_writes = False
if _mkdir_locking:
make_directory = _make_directory_locked
@ -324,27 +321,28 @@ class DiskWriter(SwiftDiskWriter):
# disk.
write_metadata(self.fd, metadata)
if not _relaxed_writes:
# We call fsync() before calling drop_cache() to lower the
# amount of redundant work the drop cache code will perform on
# the pages (now that after fsync the pages will be all
# clean).
do_fsync(self.fd)
# From the Department of the Redundancy Department, make sure
# we call drop_cache() after fsync() to avoid redundant work
# (pages all clean).
drop_buffer_cache(self.fd, 0, self.upload_size)
# We call fsync() before calling drop_cache() to lower the
# amount of redundant work the drop cache code will perform on
# the pages (now that after fsync the pages will be all
# clean).
do_fsync(self.fd)
# From the Department of the Redundancy Department, make sure
# we call drop_cache() after fsync() to avoid redundant work
# (pages all clean).
drop_buffer_cache(self.fd, 0, self.upload_size)
# At this point we know that the object's full directory path
# exists, so we can just rename it directly without using Swift's
# swift.common.utils.renamer(), which makes the directory path and
# adds extra stat() calls.
data_file = os.path.join(df.put_datadir, df._obj)
attempts = 1
while True:
try:
os.rename(self.tmppath, data_file)
except OSError as err:
if err.errno in (errno.ENOENT, errno.EIO):
if err.errno in (errno.ENOENT, errno.EIO) \
and attempts < MAX_RENAME_ATTEMPTS:
# FIXME: Why either of these two error conditions is
# happening is unknown at this point. This might be a
# FUSE issue of some sort or a possible race
@ -368,7 +366,7 @@ class DiskWriter(SwiftDiskWriter):
self.tmppath, data_file))
else:
# Data file target name now has a bad path!
dfstats = do_stat(self.put_datadir)
dfstats = do_stat(df.put_datadir)
if not dfstats:
raise DiskFileError(
'DiskFile.put(): path to object, %s, no'
@ -393,6 +391,7 @@ class DiskWriter(SwiftDiskWriter):
self.tmppath, data_file,
str(err), df.put_datadir,
dfstats))
attempts += 1
continue
else:
raise GlusterFileSystemOSError(
@ -557,8 +556,7 @@ class DiskFile(SwiftDiskFile):
see if the directory object already exists, that is left to the caller
(this avoids a potentially duplicate stat() system call).
The "dir_path" must be relative to its container,
self._container_path.
The "dir_path" must be relative to its container, self._container_path.
The "metadata" object is an optional set of metadata to apply to the
newly created directory object. If not present, no initial metadata is
@ -624,7 +622,8 @@ class DiskFile(SwiftDiskFile):
# Assume the full directory path exists to the file already, and
# construct the proper name for the temporary file.
for i in range(0, 1000):
attempts = 1
while True:
tmpfile = '.' + self._obj + '.' + md5(self._obj +
str(random.random())).hexdigest()
tmppath = os.path.join(self.put_datadir, tmpfile)
@ -635,20 +634,29 @@ class DiskFile(SwiftDiskFile):
if gerr.errno == errno.ENOSPC:
# Raise DiskFileNoSpace to be handled by upper layers
raise DiskFileNoSpace()
if gerr.errno not in (errno.ENOENT, errno.EEXIST, errno.EIO):
# FIXME: Other cases we should handle?
raise
if attempts >= MAX_OPEN_ATTEMPTS:
# We failed after N attempts to create the temporary
# file.
raise DiskFileError('DiskFile.mkstemp(): failed to'
' successfully create a temporary file'
' without running into a name conflict'
' after %d of %d attempts for: %s' % (
attempts, MAX_OPEN_ATTEMPTS,
data_file))
if gerr.errno == errno.EEXIST:
# Retry with a different random number.
continue
if gerr.errno == errno.EIO:
attempts += 1
elif gerr.errno == errno.EIO:
# FIXME: Possible FUSE issue or race condition, let's
# sleep on it and retry the operation.
_random_sleep()
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
" 0.1 secs", gerr)
continue
if gerr.errno != errno.ENOENT:
# FIXME: Other cases we should handle?
raise
if not self._obj_path:
attempts += 1
elif not self._obj_path:
# No directory hierarchy and the create failed telling us
# the container or volume directory does not exist. This
# could be a FUSE issue or some race condition, so let's
@ -656,26 +664,22 @@ class DiskFile(SwiftDiskFile):
_random_sleep()
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
" 0.1 secs", gerr)
continue
if i != 0:
attempts += 1
elif attempts > 1:
# Got ENOENT after previously making the path. This could
# also be a FUSE issue or some race condition, nap and
# retry.
_random_sleep()
logging.warn("DiskFile.mkstemp(): %s ... retrying in"
" 0.1 secs" % gerr)
continue
# It looks like the path to the object does not already exist
self._create_dir_object(self._obj_path)
continue
attempts += 1
else:
# It looks like the path to the object does not already
# exist; don't count this as an attempt, though, since
# we perform the open() system call optimistically.
self._create_dir_object(self._obj_path)
else:
break
else:
# We failed after 1,000 attempts to create the temporary file.
raise DiskFileError('DiskFile.mkstemp(): failed to successfully'
' create a temporary file without running'
' into a name conflict after 1,000 attempts'
' for: %s' % (data_file,))
dw = None
try:
# Ensure it is properly owned before we make it available.
@ -769,8 +773,8 @@ class DiskFile(SwiftDiskFile):
:raises DiskFileError: on file size mismatch.
:raises DiskFileNotExist: on file not existing (including deleted)
"""
#Marker directory.
if self._is_dir:
# Directories have no size.
return 0
try:
file_size = 0

View File

@ -25,14 +25,17 @@ import mock
from mock import patch
from hashlib import md5
import gluster.swift.common.utils
import gluster.swift.obj.diskfile
from swift.common.utils import normalize_timestamp
from gluster.swift.obj.diskfile import DiskFile
from swift.common.exceptions import DiskFileNotExist, DiskFileError, \
DiskFileNoSpace
from gluster.swift.common.exceptions import GlusterFileSystemOSError
import gluster.swift.common.utils
import gluster.swift.obj.diskfile
from gluster.swift.obj.diskfile import DiskFile
from gluster.swift.common.utils import DEFAULT_UID, DEFAULT_GID, X_TYPE, \
X_OBJECT_TYPE, DIR_OBJECT
from test.unit.common.test_utils import _initxattr, _destroyxattr
from test.unit import FakeLogger
@ -565,7 +568,6 @@ class TestDiskFile(unittest.TestCase):
finally:
shutil.rmtree(td)
def test_put_ENOSPC(self):
td = tempfile.mkdtemp()
the_cont = os.path.join(td, "vol0", "bar")
@ -589,6 +591,7 @@ class TestDiskFile(unittest.TestCase):
'ETag': etag,
'Content-Length': '5',
}
def mock_open(*args, **kwargs):
raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC))
@ -605,6 +608,51 @@ class TestDiskFile(unittest.TestCase):
finally:
shutil.rmtree(td)
def test_put_rename_ENOENT(self):
td = tempfile.mkdtemp()
the_cont = os.path.join(td, "vol0", "bar")
try:
os.makedirs(the_cont)
gdf = DiskFile(td, "vol0", "p57", "ufo47", "bar", "z", self.lg)
assert gdf._obj == "z"
assert gdf._obj_path == ""
assert gdf.name == "bar"
assert gdf.datadir == the_cont
assert gdf.data_file is None
body = '1234\n'
etag = md5()
etag.update(body)
etag = etag.hexdigest()
metadata = {
'X-Timestamp': '1234',
'Content-Type': 'file',
'ETag': etag,
'Content-Length': '5',
}
def mock_sleep(*args, **kwargs):
# Return without sleep, no need to dely unit tests
return
def mock_rename(*args, **kwargs):
raise OSError(errno.ENOENT, os.strerror(errno.ENOENT))
with mock.patch("gluster.swift.obj.diskfile.sleep", mock_sleep):
with mock.patch("os.rename", mock_rename):
try:
with gdf.writer() as dw:
assert dw.tmppath is not None
tmppath = dw.tmppath
dw.write(body)
dw.put(metadata)
except GlusterFileSystemOSError:
pass
else:
self.fail("Expected exception DiskFileError")
finally:
shutil.rmtree(td)
def test_put_obj_path(self):
the_obj_path = os.path.join("b", "a")
the_file = os.path.join(the_obj_path, "z")