Merge "Deprecate broken handoffs_first in favor of handoffs_only"

This commit is contained in:
Jenkins 2017-02-15 03:54:49 +00:00 committed by Gerrit Code Review
commit cdd72dd34f
3 changed files with 145 additions and 16 deletions

View File

@ -298,7 +298,18 @@ use = egg:swift#recon
# lockup_timeout = 1800 # lockup_timeout = 1800
# ring_check_interval = 15 # ring_check_interval = 15
# recon_cache_path = /var/cache/swift # recon_cache_path = /var/cache/swift
# handoffs_first = False # The handoffs_only mode option is for special case emergency situations during
# rebalance such as disk full in the cluster. This option SHOULD NOT BE
# CHANGED, except for extreme situations. When handoffs_only mode is enabled
# the reconstructor will *only* revert rebalance fragments to primaries and not
# attempt to sync any primary parts with neighbor primaries. This will force
# the reconstructor to sync and delete handoffs fragments more quickly and
# minimize the time of the rebalance by limiting the number of rebuilds. The
# handoffs_only option is only for temporary use, it should be disabled as soon
# as the emergency situation is resolved. When handoffs_only is not set, the
# deprecated handoffs_first option will be honored as a synonym, but may be
# ignored in a future release.
# handoffs_only = False
# #
# You can set scheduling priority of processes. Niceness values range from -20 # You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process). # (most favorable to the process) to 19 (least favorable to the process).

View File

@ -149,8 +149,24 @@ class ObjectReconstructor(Daemon):
self.headers = { self.headers = {
'Content-Length': '0', 'Content-Length': '0',
'user-agent': 'obj-reconstructor %s' % os.getpid()} 'user-agent': 'obj-reconstructor %s' % os.getpid()}
self.handoffs_first = config_true_value(conf.get('handoffs_first', if 'handoffs_first' in conf:
False)) self.logger.warning(
'The handoffs_first option is deprecated in favor '
'of handoffs_only. This option may be ignored in a '
'future release.')
# honor handoffs_first for backwards compatibility
default_handoffs_only = config_true_value(conf['handoffs_first'])
else:
default_handoffs_only = False
self.handoffs_only = config_true_value(
conf.get('handoffs_only', default_handoffs_only))
if self.handoffs_only:
self.logger.warning(
'Handoff only mode is not intended for normal '
'operation, use handoffs_only with care.')
elif default_handoffs_only:
self.logger.warning('Ignored handoffs_first option in favor '
'of handoffs_only.')
self._df_router = DiskFileRouter(conf, self.logger) self._df_router = DiskFileRouter(conf, self.logger)
def load_object_ring(self, policy): def load_object_ring(self, policy):
@ -656,6 +672,8 @@ class ObjectReconstructor(Daemon):
if syncd_with >= len(job['sync_to']): if syncd_with >= len(job['sync_to']):
self.delete_reverted_objs( self.delete_reverted_objs(
job, reverted_objs, job['frag_index']) job, reverted_objs, job['frag_index'])
else:
self.handoffs_remaining += 1
self.logger.timing_since('partition.delete.timing', begin) self.logger.timing_since('partition.delete.timing', begin)
def _get_part_jobs(self, local_dev, part_path, partition, policy): def _get_part_jobs(self, local_dev, part_path, partition, policy):
@ -684,6 +702,9 @@ class ObjectReconstructor(Daemon):
:param policy: the policy :param policy: the policy
:returns: a list of dicts of job info :returns: a list of dicts of job info
N.B. If this function ever returns an empty list of jobs the entire
partition will be deleted.
""" """
# find all the fi's in the part, and which suffixes have them # find all the fi's in the part, and which suffixes have them
try: try:
@ -876,12 +897,12 @@ class ObjectReconstructor(Daemon):
""" """
Helper function for collect_jobs to build jobs for reconstruction Helper function for collect_jobs to build jobs for reconstruction
using EC style storage policy using EC style storage policy
N.B. If this function ever returns an empty list of jobs the entire
partition will be deleted.
""" """
jobs = self._get_part_jobs(**part_info) jobs = self._get_part_jobs(**part_info)
random.shuffle(jobs) random.shuffle(jobs)
if self.handoffs_first:
# Move the handoff revert jobs to the front of the list
jobs.sort(key=lambda job: job['job_type'], reverse=True)
self.job_count += len(jobs) self.job_count += len(jobs)
return jobs return jobs
@ -897,6 +918,7 @@ class ObjectReconstructor(Daemon):
self.reconstruction_part_count = 0 self.reconstruction_part_count = 0
self.reconstruction_device_count = 0 self.reconstruction_device_count = 0
self.last_reconstruction_count = -1 self.last_reconstruction_count = -1
self.handoffs_remaining = 0
def delete_partition(self, path): def delete_partition(self, path):
self.logger.info(_("Removing partition: %s"), path) self.logger.info(_("Removing partition: %s"), path)
@ -932,6 +954,11 @@ class ObjectReconstructor(Daemon):
self.run_pool.spawn(self.delete_partition, self.run_pool.spawn(self.delete_partition,
part_info['part_path']) part_info['part_path'])
for job in jobs: for job in jobs:
if (self.handoffs_only and job['job_type'] != REVERT):
self.logger.debug('Skipping %s job for %s '
'while in handoffs_only mode.',
job['job_type'], job['path'])
continue
self.run_pool.spawn(self.process_job, job) self.run_pool.spawn(self.process_job, job)
with Timeout(self.lockup_timeout): with Timeout(self.lockup_timeout):
self.run_pool.waitall() self.run_pool.waitall()
@ -943,6 +970,16 @@ class ObjectReconstructor(Daemon):
stats.kill() stats.kill()
lockup_detector.kill() lockup_detector.kill()
self.stats_line() self.stats_line()
if self.handoffs_only:
if self.handoffs_remaining > 0:
self.logger.info(_(
"Handoffs only mode still has handoffs remaining. "
"Next pass will continue to revert handoffs."))
else:
self.logger.warning(_(
"Handoffs only mode found no handoffs remaining. "
"You should disable handoffs_only once all nodes "
"are reporting no handoffs remaining."))
def run_once(self, *args, **kwargs): def run_once(self, *args, **kwargs):
start = time.time() start = time.time()

View File

@ -719,7 +719,6 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
rmtree(testring, ignore_errors=1) rmtree(testring, ignore_errors=1)
def test_build_reconstruction_jobs(self): def test_build_reconstruction_jobs(self):
self.reconstructor.handoffs_first = False
self.reconstructor._reset_stats() self.reconstructor._reset_stats()
for part_info in self.reconstructor.collect_parts(): for part_info in self.reconstructor.collect_parts():
jobs = self.reconstructor.build_reconstruction_jobs(part_info) jobs = self.reconstructor.build_reconstruction_jobs(part_info)
@ -728,13 +727,40 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
object_reconstructor.REVERT)) object_reconstructor.REVERT))
self.assert_expected_jobs(part_info['partition'], jobs) self.assert_expected_jobs(part_info['partition'], jobs)
self.reconstructor.handoffs_first = True def test_handoffs_only(self):
self.reconstructor._reset_stats() self.reconstructor.handoffs_only = True
for part_info in self.reconstructor.collect_parts():
jobs = self.reconstructor.build_reconstruction_jobs(part_info) found_job_types = set()
self.assertTrue(jobs[0]['job_type'] ==
object_reconstructor.REVERT) def fake_process_job(job):
self.assert_expected_jobs(part_info['partition'], jobs) # increment failure counter
self.reconstructor.handoffs_remaining += 1
found_job_types.add(job['job_type'])
self.reconstructor.process_job = fake_process_job
# only revert jobs
self.reconstructor.reconstruct()
self.assertEqual(found_job_types, {object_reconstructor.REVERT})
# but failures keep handoffs remaining
msgs = self.reconstructor.logger.get_lines_for_level('info')
self.assertIn('Next pass will continue to revert handoffs', msgs[-1])
self.logger._clear()
found_job_types = set()
def fake_process_job(job):
# success does not increment failure counter
found_job_types.add(job['job_type'])
self.reconstructor.process_job = fake_process_job
# only revert jobs ... but all handoffs cleared out successfully
self.reconstructor.reconstruct()
self.assertEqual(found_job_types, {object_reconstructor.REVERT})
# it's time to turn off handoffs_only
msgs = self.reconstructor.logger.get_lines_for_level('warning')
self.assertIn('You should disable handoffs_only', msgs[-1])
def test_get_partners(self): def test_get_partners(self):
# we're going to perform an exhaustive test of every possible # we're going to perform an exhaustive test of every possible
@ -1156,6 +1182,57 @@ class TestObjectReconstructor(unittest.TestCase):
def ts(self): def ts(self):
return next(self.ts_iter) return next(self.ts_iter)
def test_handoffs_only_default(self):
# sanity neither option added to default conf
self.conf.pop('handoffs_only', None)
self.conf.pop('handoffs_first', None)
self.reconstructor = object_reconstructor.ObjectReconstructor(
self.conf, logger=self.logger)
self.assertFalse(self.reconstructor.handoffs_only)
def test_handoffs_first_enables_handoffs_only(self):
self.conf.pop('handoffs_only', None) # sanity
self.conf['handoffs_first'] = True
self.reconstructor = object_reconstructor.ObjectReconstructor(
self.conf, logger=self.logger)
self.assertTrue(self.reconstructor.handoffs_only)
warnings = self.logger.get_lines_for_level('warning')
expected = [
'The handoffs_first option is deprecated in favor '
'of handoffs_only. This option may be ignored in a '
'future release.',
'Handoff only mode is not intended for normal operation, '
'use handoffs_only with care.',
]
self.assertEqual(expected, warnings)
def test_handoffs_only_ignores_handoffs_first(self):
self.conf['handoffs_first'] = True
self.conf['handoffs_only'] = False
self.reconstructor = object_reconstructor.ObjectReconstructor(
self.conf, logger=self.logger)
self.assertFalse(self.reconstructor.handoffs_only)
warnings = self.logger.get_lines_for_level('warning')
expected = [
'The handoffs_first option is deprecated in favor of '
'handoffs_only. This option may be ignored in a future release.',
'Ignored handoffs_first option in favor of handoffs_only.',
]
self.assertEqual(expected, warnings)
def test_handoffs_only_enabled(self):
self.conf.pop('handoffs_first', None) # sanity
self.conf['handoffs_only'] = True
self.reconstructor = object_reconstructor.ObjectReconstructor(
self.conf, logger=self.logger)
self.assertTrue(self.reconstructor.handoffs_only)
warnings = self.logger.get_lines_for_level('warning')
expected = [
'Handoff only mode is not intended for normal operation, '
'use handoffs_only with care.',
]
self.assertEqual(expected, warnings)
def test_two_ec_policies(self): def test_two_ec_policies(self):
with patch_policies([ with patch_policies([
StoragePolicy(0, name='zero', is_deprecated=True), StoragePolicy(0, name='zero', is_deprecated=True),
@ -2345,7 +2422,7 @@ class TestObjectReconstructor(unittest.TestCase):
self.assertEqual(call['node'], sync_to[0]) self.assertEqual(call['node'], sync_to[0])
self.assertEqual(set(call['suffixes']), set(['123', 'abc'])) self.assertEqual(set(call['suffixes']), set(['123', 'abc']))
def test_process_job_revert_is_handoff(self): def test_process_job_revert_is_handoff_fails(self):
replicas = self.policy.object_ring.replicas replicas = self.policy.object_ring.replicas
frag_index = random.randint(0, replicas - 1) frag_index = random.randint(0, replicas - 1)
sync_to = [random.choice([n for n in self.policy.object_ring.devs sync_to = [random.choice([n for n in self.policy.object_ring.devs
@ -2375,7 +2452,8 @@ class TestObjectReconstructor(unittest.TestCase):
def ssync_response_callback(*args): def ssync_response_callback(*args):
# in this test ssync always fails, until we encounter ourselves in # in this test ssync always fails, until we encounter ourselves in
# the list of possible handoff's to sync to # the list of possible handoff's to sync to, so handoffs_remaining
# should increment
return False, {} return False, {}
expected_suffix_calls = set([ expected_suffix_calls = set([
@ -2401,6 +2479,7 @@ class TestObjectReconstructor(unittest.TestCase):
call = ssync_calls[0] call = ssync_calls[0]
self.assertEqual(call['node'], sync_to[0]) self.assertEqual(call['node'], sync_to[0])
self.assertEqual(set(call['suffixes']), set(['123', 'abc'])) self.assertEqual(set(call['suffixes']), set(['123', 'abc']))
self.assertEqual(self.reconstructor.handoffs_remaining, 1)
def test_process_job_revert_cleanup(self): def test_process_job_revert_cleanup(self):
replicas = self.policy.object_ring.replicas replicas = self.policy.object_ring.replicas
@ -2446,6 +2525,7 @@ class TestObjectReconstructor(unittest.TestCase):
} }
def ssync_response_callback(*args): def ssync_response_callback(*args):
# success should not increment handoffs_remaining
return True, {ohash: {'ts_data': ts}} return True, {ohash: {'ts_data': ts}}
ssync_calls = [] ssync_calls = []
@ -2469,6 +2549,7 @@ class TestObjectReconstructor(unittest.TestCase):
df_mgr.get_hashes(self.local_dev['device'], str(partition), [], df_mgr.get_hashes(self.local_dev['device'], str(partition), [],
self.policy) self.policy)
self.assertFalse(os.access(df._datadir, os.F_OK)) self.assertFalse(os.access(df._datadir, os.F_OK))
self.assertEqual(self.reconstructor.handoffs_remaining, 0)
def test_process_job_revert_cleanup_tombstone(self): def test_process_job_revert_cleanup_tombstone(self):
sync_to = [random.choice([n for n in self.policy.object_ring.devs sync_to = [random.choice([n for n in self.policy.object_ring.devs