Merge "Fix object replicator to handle Timeouts fixes: lp 814263"

Jenkins 2011-12-19 21:35:13 +00:00 committed by Gerrit Code Review
commit 79fbd95433
2 changed files with 111 additions and 42 deletions

swift/obj/replicator.py

@@ -199,14 +199,16 @@ def get_hashes(partition_dir, recalculate=[], do_listdir=False,
         return hashed, hashes


-# Hack to work around Eventlet's tpool not catching and reraising Timeouts. We
-# return the Timeout, Timeout if it's raised, the caller looks for it and
-# reraises it if found.
 def tpooled_get_hashes(*args, **kwargs):
+    """
+    Hack to work around Eventlet's tpool not catching and reraising Timeouts.
+    We return the Timeout, None if it's raised, the caller looks for it
+    and reraises it if found.
+    """
     try:
         return get_hashes(*args, **kwargs)
     except Timeout, err:
-        return err, err
+        return err, None


 class ObjectReplicator(Daemon):
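
For context, the hack above is easier to see in isolation. Here is a minimal, runnable sketch of the same pattern, assuming only that eventlet is installed; slow_call and its two-slot return shape are illustrative stand-ins for get_hashes(), not Swift code:

    from eventlet import Timeout, tpool

    def slow_call():
        # Stand-in for get_hashes(); pretend an eventlet Timeout fires
        # somewhere inside the real work.
        raise Timeout()

    def tpooled_slow_call(*args, **kwargs):
        # Same trick as tpooled_get_hashes: catch the Timeout inside the
        # tpool worker thread and hand it back as a value, since tpool did
        # not reliably reraise it across the thread boundary.
        try:
            return slow_call(*args, **kwargs)
        except Timeout as err:
            return err, None

    first, _junk = tpool.execute(tpooled_slow_call)
    try:
        if isinstance(first, BaseException):
            raise first  # reraise in the calling greenthread
    except Timeout:
        pass  # the Timeout made it back to the caller, as intended
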
@@ -422,12 +424,13 @@ class ObjectReplicator(Daemon):
                     local_hash[suffix] != remote_hash.get(suffix, -1)]
             if not suffixes:
                 continue
-            hashed, local_hash = tpool.execute(tpooled_get_hashes,
+            hashed, recalc_hash = tpool.execute(tpooled_get_hashes,
                 job['path'], recalculate=suffixes,
                 reclaim_age=self.reclaim_age)
             # See tpooled_get_hashes "Hack".
             if isinstance(hashed, BaseException):
                 raise hashed
+            local_hash = recalc_hash
             suffixes = [suffix for suffix in local_hash if
                     local_hash[suffix] != remote_hash.get(suffix, -1)]
             self.rsync(node, job, suffixes)
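
Note the ordering in this hunk: the tpool result is staged in recalc_hash and local_hash is only rebound after the exception check, so the last good suffix dict is never replaced by the None that now accompanies a returned Timeout. A compressed, hypothetical illustration of that ordering (made-up values, not Swift code):

    from eventlet import Timeout

    local_hash = {'a83': 'cafe'}   # last known-good suffix hashes (made up)
    result = (Timeout(), None)     # shape tpooled_get_hashes returns on timeout

    try:
        hashed, recalc_hash = result      # local_hash deliberately untouched
        if isinstance(hashed, BaseException):
            raise hashed                  # bail out before rebinding local_hash
        local_hash = recalc_hash          # only reached on success
    except Timeout:
        assert local_hash == {'a83': 'cafe'}  # still intact after the failure
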

test/unit/obj/test_replicator.py

@@ -26,6 +26,7 @@ import time
 import tempfile
 from contextlib import contextmanager
 from eventlet.green import subprocess
+from eventlet import Timeout, tpool
 from test.unit import FakeLogger
 from swift.common import utils
 from swift.common.utils import hash_path, mkdirs, normalize_timestamp
@@ -201,7 +202,8 @@ class TestObjectReplicator(unittest.TestCase):
         hashed, hashes = object_replicator.get_hashes(part, do_listdir=True)
         self.assertEquals(hashed, 0)
         self.assert_('a83' in hashes)
-        hashed, hashes = object_replicator.get_hashes(part, recalculate=['a83'])
+        hashed, hashes = object_replicator.get_hashes(part,
+                                                      recalculate=['a83'])
         self.assertEquals(hashed, 1)
         self.assert_('a83' in hashes)
@@ -364,6 +366,7 @@ class TestObjectReplicator(unittest.TestCase):
             dict(swift_dir=self.testdir, devices=self.devices,
                 mount_check='false', timeout='300', stats_interval='1'))
         was_connector = object_replicator.http_connect
+        try:
             object_replicator.http_connect = mock_http_connect(200)
             # Write some files into '1' and run replicate- they should be moved
             # to the other partitions and then node should get deleted.
@@ -384,7 +387,8 @@ class TestObjectReplicator(unittest.TestCase):
                 self.ring.get_part_nodes(int(cur_part)) \
                     if node['ip'] not in _ips()]
             for node in nodes:
-                rsync_mod = '[%s]::object/sda/objects/%s' % (node['ip'], cur_part)
+                rsync_mod = '[%s]::object/sda/objects/%s' % (node['ip'],
+                                                             cur_part)
                 process_arg_checker.append(
                     (0, '', ['rsync', whole_path_from, rsync_mod]))
                 self.assertTrue(os.access(os.path.join(self.objects,
@@ -399,8 +403,70 @@ class TestObjectReplicator(unittest.TestCase):
                 os.path.join(self.objects,
                              i, object_replicator.HASH_FILE),
                 os.F_OK), result)
+        finally:
+            object_replicator.http_connect = was_connector
+
+    def test_run_once_recover_from_timeout(self):
+        replicator = object_replicator.ObjectReplicator(
+            dict(swift_dir=self.testdir, devices=self.devices,
+                mount_check='false', timeout='300', stats_interval='1'))
+        was_connector = object_replicator.http_connect
+        was_get_hashes = object_replicator.get_hashes
+        was_execute = tpool.execute
+        self.get_hash_count = 0
+        try:
+
+            def fake_get_hashes(*args, **kwargs):
+                self.get_hash_count += 1
+                if self.get_hash_count == 3:
+                    # raise timeout on last call to get hashes
+                    raise Timeout()
+                return 2, {'abc': 'def'}
+
+            def fake_exc(tester, *args, **kwargs):
+                if 'Error syncing partition' in args[0]:
+                    tester.i_failed = True
+
+            self.i_failed = False
+            object_replicator.http_connect = mock_http_connect(200)
+            object_replicator.get_hashes = fake_get_hashes
+            replicator.logger.exception = \
+                lambda *args, **kwargs: fake_exc(self, *args, **kwargs)
+            # Write some files into '1' and run replicate- they should be moved
+            # to the other partitions and then node should get deleted.
+            cur_part = '1'
+            df = DiskFile(self.devices, 'sda', cur_part, 'a', 'c', 'o',
+                          FakeLogger())
+            mkdirs(df.datadir)
+            f = open(os.path.join(df.datadir,
+                                  normalize_timestamp(time.time()) + '.data'),
+                     'wb')
+            f.write('1234567890')
+            f.close()
+            ohash = hash_path('a', 'c', 'o')
+            data_dir = ohash[-3:]
+            whole_path_from = os.path.join(self.objects, cur_part, data_dir)
+            process_arg_checker = []
+            nodes = [node for node in
+                     self.ring.get_part_nodes(int(cur_part)) \
+                     if node['ip'] not in _ips()]
+            for node in nodes:
+                rsync_mod = '[%s]::object/sda/objects/%s' % (node['ip'],
+                                                             cur_part)
+                process_arg_checker.append(
+                    (0, '', ['rsync', whole_path_from, rsync_mod]))
+            self.assertTrue(os.access(os.path.join(self.objects,
+                                                   '1', data_dir, ohash),
+                                      os.F_OK))
+            with _mock_process(process_arg_checker):
+                replicator.run_once()
+            self.assertFalse(process_errors)
+            self.assertFalse(self.i_failed)
+        finally:
+            object_replicator.http_connect = was_connector
+            object_replicator.get_hashes = was_get_hashes
+            tpool.execute = was_execute
+
     def test_run(self):
         with _mock_process([(0, '')] * 100):
             self.replicator.replicate()
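
The fixture pattern in the new test (swap a module-level callable for a counting fake, raise on the Nth call, restore the original in a finally block) is reusable beyond this replicator test. A self-contained sketch of the same idea follows; FakeModule and its names are hypothetical stand-ins for the patched object_replicator module, not part of Swift:

    import unittest

    from eventlet import Timeout

    class FakeModule(object):
        # Stand-in for the module under test (object_replicator above).
        @staticmethod
        def get_hashes():
            return 2, {'abc': 'def'}

    mod = FakeModule()

    class RecoverFromTimeoutTest(unittest.TestCase):

        def test_third_call_times_out(self):
            was_get_hashes = mod.get_hashes     # remember the real callable
            self.count = 0

            def fake_get_hashes(*args, **kwargs):
                self.count += 1
                if self.count == 3:
                    raise Timeout()             # fail only on the third call
                return 2, {'abc': 'def'}

            mod.get_hashes = fake_get_hashes    # swap in the counting fake
            try:
                mod.get_hashes()                # first call succeeds
                mod.get_hashes()                # second call succeeds
                self.assertRaises(Timeout, mod.get_hashes)
            finally:
                mod.get_hashes = was_get_hashes  # always restore the original

    if __name__ == '__main__':
        unittest.main()
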