From c10f8ae4eca5d034cbf77103aa27413c4ae92ac3 Mon Sep 17 00:00:00 2001 From: Tim Burke Date: Thu, 7 Jan 2021 16:18:07 -0800 Subject: [PATCH] relinker: Track part_power/next_part_power in state file ...and only update from the state file if those part powers match the work we're currently doing. Otherwise, we would skip relinking when trying to do multiple increases in quick succession, causing misplaced data that clients could not access. Note that because the part_power/next_part_power pair *must* change between relinking and cleanup, we now only need to track one boolean per part; the state file will automatically reset when switching modes. Also note that any state files in the old format will be discarded -- this is safe, but may cause increased I/O. OTOH, operators probably shouldn't be upgrading Swift mid-part-power-increase anyway. Change-Id: Id08d0893db2c6e6f9ce1e42a104409568daf985b Closes-Bug: #1910589 --- swift/cli/relinker.py | 85 +++++++++++-------- test/unit/cli/test_relinker.py | 144 ++++++++++++++++++++++++--------- 2 files changed, 157 insertions(+), 72 deletions(-) diff --git a/swift/cli/relinker.py b/swift/cli/relinker.py index 630c0e98ee..9bf6b52965 100644 --- a/swift/cli/relinker.py +++ b/swift/cli/relinker.py @@ -50,13 +50,18 @@ def hook_pre_device(locks, states, datadir, device_path): locks[0] = fd state_file = os.path.join(device_path, STATE_FILE.format(datadir=datadir)) - states.clear() + states["state"].clear() try: with open(state_file, 'rt') as f: - tmp = json.load(f) - states.update(tmp) - except ValueError: - # Invalid JSON: remove the file to restart from scratch + state_from_disk = json.load(f) + if state_from_disk["next_part_power"] != states["next_part_power"]: + raise ValueError + if state_from_disk["part_power"] != states["part_power"]: + states["prev_part_power"] = state_from_disk["part_power"] + raise ValueError + states["state"].update(state_from_disk["state"]) + except (ValueError, TypeError, KeyError): + # Bad state file: remove the file to restart from scratch os.unlink(state_file) except IOError as err: # Ignore file not found error @@ -69,33 +74,39 @@ def hook_post_device(locks, _): locks[0] = None -def partitions_filter(states, step, part_power, next_part_power, +def partitions_filter(states, part_power, next_part_power, datadir_path, partitions): # Remove all non partitions first (eg: auditor_status_ALL.json) partitions = [p for p in partitions if p.isdigit()] - if not (step == STEP_CLEANUP and part_power == next_part_power): - # This is not a cleanup after cancel, partitions in the upper half are - # new partitions, there is nothing to relink/cleanup from there - partitions = [p for p in partitions - if int(p) < 2 ** next_part_power / 2] + relinking = (part_power != next_part_power) + if relinking: + # All partitions in the upper half are new partitions and there is + # nothing to relink there + partitions = [part for part in partitions + if int(part) < 2 ** part_power] + elif "prev_part_power" in states: + # All partitions in the upper half are new partitions and there is + # nothing to clean up there + partitions = [part for part in partitions + if int(part) < 2 ** states["prev_part_power"]] - # Format: { 'part': [relinked, cleaned] } - if states: - missing = list(set(partitions) - set(states.keys())) + # Format: { 'part': processed } + if states["state"]: + missing = list(set(partitions) - set(states["state"].keys())) if missing: - # All missing partitions was created after the first run of - # relink, so after the new ring was distribued, so they already - # are hardlinked in both partitions, but they will need to - # cleaned.. Just update the state file. + # All missing partitions were created after the first run of the + # relinker with this part_power/next_part_power pair. This is + # expected when relinking, where new partitions appear that are + # appropriate for the target part power. In such cases, there's + # nothing to be done. Err on the side of caution during cleanup, + # however. for part in missing: - states[part] = [True, False] - if step == STEP_RELINK: - partitions = [str(p) for p, (r, c) in states.items() if not r] - elif step == STEP_CLEANUP: - partitions = [str(p) for p, (r, c) in states.items() if not c] + states["state"][part] = relinking + partitions = [str(part) for part, processed in states["state"].items() + if not processed] else: - states.update({str(p): [False, False] for p in partitions}) + states["state"].update({str(part): False for part in partitions}) # Always scan the partitions in reverse order to minimize the amount of IO # (it actually only matters for relink, not for cleanup). @@ -107,7 +118,7 @@ def partitions_filter(states, step, part_power, next_part_power, # If the relinker then scan partition 1, it will listdir that object while # it's unnecessary. By working in reverse order of partitions, this is # avoided. - partitions = sorted(partitions, key=lambda x: int(x), reverse=True) + partitions = sorted(partitions, key=lambda part: int(part), reverse=True) return partitions @@ -124,10 +135,8 @@ def hook_post_partition(states, step, state_file = os.path.join(device_path, STATE_FILE.format(datadir=datadir_name)) - if step == STEP_RELINK: - states[part][0] = True - elif step == STEP_CLEANUP: - states[part][1] = True + if step in (STEP_RELINK, STEP_CLEANUP): + states["state"][part] = True with open(state_tmp_file, 'wt') as f: json.dump(states, f) os.fsync(f.fileno()) @@ -164,14 +173,17 @@ def relink(swift_dir='/etc/swift', datadir = diskfile.get_data_dir(policy) locks = [None] - states = {} + states = { + "part_power": part_power, + "next_part_power": next_part_power, + "state": {}, + } relink_devices_filter = partial(devices_filter, device) relink_hook_pre_device = partial(hook_pre_device, locks, states, datadir) relink_hook_post_device = partial(hook_post_device, locks) relink_partition_filter = partial(partitions_filter, - states, STEP_RELINK, - part_power, next_part_power) + states, part_power, next_part_power) relink_hook_post_partition = partial(hook_post_partition, states, STEP_RELINK) relink_hashes_filter = partial(hashes_filter, next_part_power) @@ -228,14 +240,17 @@ def cleanup(swift_dir='/etc/swift', datadir = diskfile.get_data_dir(policy) locks = [None] - states = {} + states = { + "part_power": part_power, + "next_part_power": next_part_power, + "state": {}, + } cleanup_devices_filter = partial(devices_filter, device) cleanup_hook_pre_device = partial(hook_pre_device, locks, states, datadir) cleanup_hook_post_device = partial(hook_post_device, locks) cleanup_partition_filter = partial(partitions_filter, - states, STEP_CLEANUP, - part_power, next_part_power) + states, part_power, next_part_power) cleanup_hook_post_partition = partial(hook_post_partition, states, STEP_CLEANUP) cleanup_hashes_filter = partial(hashes_filter, next_part_power) diff --git a/test/unit/cli/test_relinker.py b/test/unit/cli/test_relinker.py index 571f1c2d7f..8f31c33597 100644 --- a/test/unit/cli/test_relinker.py +++ b/test/unit/cli/test_relinker.py @@ -178,16 +178,30 @@ class TestRelinker(unittest.TestCase): self._save_ring() relinker.relink(self.testdir, self.devices, True) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), {str(self.part): [True, False]}) + orig_inode = os.stat(state_file).st_ino + self.assertEqual(json.load(f), { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {str(self.part): True}}) self.rb.increase_partition_power() self.rb._ring = None # Force builder to reload ring self._save_ring() - relinker.cleanup(self.testdir, self.devices, True) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), - {str(self.part): [True, True], - str(self.next_part): [True, True]}) + # Keep the state file open during cleanup so the inode can't be + # released/re-used when it gets unlinked + self.assertEqual(orig_inode, os.stat(state_file).st_ino) + relinker.cleanup(self.testdir, self.devices, True) + self.assertNotEqual(orig_inode, os.stat(state_file).st_ino) + with open(state_file, 'rt') as f: + # NB: part_power/next_part_power tuple changed, so state was reset + # (though we track prev_part_power for an efficient clean up) + self.assertEqual(json.load(f), { + "prev_part_power": PART_POWER, + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {str(self.part): True, + str(self.next_part): True}}) def test_devices_filter_filtering(self): # With no filtering, returns all devices @@ -210,7 +224,8 @@ class TestRelinker(unittest.TestCase): lock_file = os.path.join(device_path, '.relink.%s.lock' % datadir) # The first run gets the lock - relinker.hook_pre_device(locks, {}, datadir, device_path) + states = {"state": {}} + relinker.hook_pre_device(locks, states, datadir, device_path) self.assertNotEqual([None], locks) # A following run would block @@ -232,20 +247,21 @@ class TestRelinker(unittest.TestCase): datadir_path = os.path.join(device_path, datadir) state_file = os.path.join(device_path, 'relink.%s.json' % datadir) - def call_partition_filter(step, parts): + def call_partition_filter(part_power, next_part_power, parts): # Partition 312 will be ignored because it must have been created # by the relinker - return relinker.partitions_filter(states, step, - PART_POWER, PART_POWER + 1, + return relinker.partitions_filter(states, + part_power, next_part_power, datadir_path, parts) # Start relinking - states = {} + states = {"part_power": PART_POWER, "next_part_power": PART_POWER + 1, + "state": {}} # Load the states: As it starts, it must be empty locks = [None] relinker.hook_pre_device(locks, states, datadir, device_path) - self.assertEqual({}, states) + self.assertEqual({}, states["state"]) os.close(locks[0]) # Release the lock # Partition 312 is ignored because it must have been created with the @@ -253,80 +269,134 @@ class TestRelinker(unittest.TestCase): # 96 and 227 are reverse ordered # auditor_status_ALL.json is ignored because it's not a partition self.assertEqual(['227', '96'], - call_partition_filter(relinker.STEP_RELINK, + call_partition_filter(PART_POWER, PART_POWER + 1, ['96', '227', '312', 'auditor_status.json'])) - self.assertEqual(states, {'96': [False, False], '227': [False, False]}) + self.assertEqual(states["state"], {'96': False, '227': False}) # Ack partition 96 relinker.hook_post_partition(states, relinker.STEP_RELINK, os.path.join(datadir_path, '96')) - self.assertEqual(states, {'96': [True, False], '227': [False, False]}) + self.assertEqual(states["state"], {'96': True, '227': False}) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), {'96': [True, False], - '227': [False, False]}) + self.assertEqual(json.load(f), { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {'96': True, '227': False}}) # Restart relinking after only part 96 was done self.assertEqual(['227'], - call_partition_filter(relinker.STEP_RELINK, + call_partition_filter(PART_POWER, PART_POWER + 1, ['96', '227', '312'])) - self.assertEqual(states, {'96': [True, False], '227': [False, False]}) + self.assertEqual(states["state"], {'96': True, '227': False}) # Ack partition 227 relinker.hook_post_partition(states, relinker.STEP_RELINK, os.path.join(datadir_path, '227')) - self.assertEqual(states, {'96': [True, False], '227': [True, False]}) + self.assertEqual(states["state"], {'96': True, '227': True}) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), {'96': [True, False], - '227': [True, False]}) + self.assertEqual(json.load(f), { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {'96': True, '227': True}}) # If the process restarts, it reload the state locks = [None] - states = {} + states = { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {}, + } relinker.hook_pre_device(locks, states, datadir, device_path) - self.assertEqual(states, {'96': [True, False], '227': [True, False]}) + self.assertEqual(states, { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {'96': True, '227': True}}) + os.close(locks[0]) # Release the lock + + # Start cleanup -- note that part_power and next_part_power now match! + states = { + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {}, + } + # ...which means our state file was ignored + relinker.hook_pre_device(locks, states, datadir, device_path) + self.assertEqual(states, { + "prev_part_power": PART_POWER, + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {}}) os.close(locks[0]) # Release the lock - # Start cleanup self.assertEqual(['227', '96'], - call_partition_filter(relinker.STEP_CLEANUP, + call_partition_filter(PART_POWER + 1, PART_POWER + 1, ['96', '227', '312'])) # Ack partition 227 relinker.hook_post_partition(states, relinker.STEP_CLEANUP, os.path.join(datadir_path, '227')) - self.assertEqual(states, {'96': [True, False], '227': [True, True]}) + self.assertEqual(states["state"], + {'96': False, '227': True}) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), {'96': [True, False], - '227': [True, True]}) + self.assertEqual(json.load(f), { + "prev_part_power": PART_POWER, + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {'96': False, '227': True}}) # Restart cleanup after only part 227 was done self.assertEqual(['96'], - call_partition_filter(relinker.STEP_CLEANUP, + call_partition_filter(PART_POWER + 1, PART_POWER + 1, ['96', '227', '312'])) - self.assertEqual(states, {'96': [True, False], '227': [True, True]}) + self.assertEqual(states["state"], + {'96': False, '227': True}) # Ack partition 96 relinker.hook_post_partition(states, relinker.STEP_CLEANUP, os.path.join(datadir_path, '96')) - self.assertEqual(states, {'96': [True, True], '227': [True, True]}) + self.assertEqual(states["state"], + {'96': True, '227': True}) with open(state_file, 'rt') as f: - self.assertEqual(json.load(f), {'96': [True, True], - '227': [True, True]}) + self.assertEqual(json.load(f), { + "prev_part_power": PART_POWER, + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {'96': True, '227': True}}) # At the end, the state is still accurate locks = [None] - states = {} + states = { + "prev_part_power": PART_POWER, + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 1, + "state": {}, + } relinker.hook_pre_device(locks, states, datadir, device_path) - self.assertEqual(states, {'96': [True, True], '227': [True, True]}) + self.assertEqual(states["state"], + {'96': True, '227': True}) + os.close(locks[0]) # Release the lock + + # If the part_power/next_part_power tuple differs, restart from scratch + locks = [None] + states = { + "part_power": PART_POWER + 1, + "next_part_power": PART_POWER + 2, + "state": {}, + } + relinker.hook_pre_device(locks, states, datadir, device_path) + self.assertEqual(states["state"], {}) + self.assertFalse(os.path.exists(state_file)) os.close(locks[0]) # Release the lock # If the file gets corrupted, restart from scratch with open(state_file, 'wt') as f: f.write('NOT JSON') locks = [None] - states = {} + states = {"part_power": PART_POWER, "next_part_power": PART_POWER + 1, + "state": {}} relinker.hook_pre_device(locks, states, datadir, device_path) - self.assertEqual(states, {}) + self.assertEqual(states["state"], {}) + self.assertFalse(os.path.exists(state_file)) os.close(locks[0]) # Release the lock def test_cleanup_not_yet_relinked(self):