From bbaed18e9b681ce9cf26ffa6a5d5292f5cb219b7 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Thu, 15 Jul 2021 18:08:01 +0100 Subject: [PATCH] diskfile: don't remove recently written non-durables DiskFileManager will remove any stale files during cleanup_ondisk_files(): these include tombstones and nondurable EC data fragments whose timestamps are older than reclaim_age. It can usually be safely assumed that a non-durable data fragment older than reclaim_age is not going to become durable. However, if an agent PUTs objects with specified older X-Timestamps (for example the reconciler or container-sync) then there is a window of time during which the object server has written an old non-durable data file but has not yet committed it to make it durable. Previously, if another process (for example the reconstructor) called cleanup_ondisk_files during this window then the non-durable data file would be removed. The subsequent attempt to commit the data file would then result in a traceback due to there no longer being a data file to rename, and of course the data file is lost. This patch modifies cleanup_ondisk_files to not remove old, otherwise stale, non-durable data files that were only written to disk in the preceding 'commit_window' seconds. 'commit_window' is configurable for the object server and defaults to 60.0 seconds. Closes-Bug: #1936508 Related-Change: I0d519ebaaade35249fb7b17bd5f419ffdaa616c0 Change-Id: I5f3318a44af64b77a63713e6ff8d0fd3b6144f13 --- doc/manpages/object-server.conf.5 | 4 + doc/source/config/object_server_config.rst | 12 ++ etc/object-server.conf-sample | 7 + swift/obj/diskfile.py | 18 ++- test/unit/obj/test_diskfile.py | 80 ++++++++++- test/unit/obj/test_reconstructor.py | 148 ++++++++++++++++++++- 6 files changed, 260 insertions(+), 9 deletions(-) diff --git a/doc/manpages/object-server.conf.5 b/doc/manpages/object-server.conf.5 index 3402e025d9..7150c6c918 100644 --- a/doc/manpages/object-server.conf.5 +++ b/doc/manpages/object-server.conf.5 @@ -149,6 +149,10 @@ The default is 65536. .IP \fBreclaim_age\fR Time elapsed in seconds before an object can be reclaimed. The default is 604800 seconds. +.IP \fBcommit_window\fR +Time in seconds during which a newly written non-durable data file will not be +reclaimed. The value should be greater than zero and much less than +reclaim_age. The default is 60.0 seconds. .IP \fBnice_priority\fR Modify scheduling priority of server processes. Niceness values range from -20 (most favorable to the process) to 19 (least favorable to the process). diff --git a/doc/source/config/object_server_config.rst b/doc/source/config/object_server_config.rst index 6dac546322..8dc5e3127d 100644 --- a/doc/source/config/object_server_config.rst +++ b/doc/source/config/object_server_config.rst @@ -136,6 +136,18 @@ reclaim_age 604800 Time elapsed in seconds before the it will result in dark data. This setting should be consistent across all object services. +commit_window 60 Non-durable data files may also + get reclaimed if they are older + than reclaim_age, but not if the + time they were written to disk + (i.e. mtime) is less than + commit_window seconds ago. A + commit_window greater than zero is + strongly recommended to avoid + unintended reclamation of data + files that were about to become + durable; commit_window should be + much less than reclaim_age. nice_priority None Scheduling priority of server processes. Niceness values range from -20 (most favorable to the process) to 19 (least diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample index d82a18a65f..3e4dd8896f 100644 --- a/etc/object-server.conf-sample +++ b/etc/object-server.conf-sample @@ -87,6 +87,13 @@ bind_port = 6200 # and not greater than the container services reclaim_age # reclaim_age = 604800 # +# Non-durable data files may also get reclaimed if they are older than +# reclaim_age, but not if the time they were written to disk (i.e. mtime) is +# less than commit_window seconds ago. A commit_window greater than zero is +# strongly recommended to avoid unintended reclamation of data files that were +# about to become durable; commit_window should be much less than reclaim_age. +# commit_window = 60.0 +# # You can set scheduling priority of processes. Niceness values range from -20 # (most favorable to the process) to 19 (least favorable to the process). # nice_priority = diff --git a/swift/obj/diskfile.py b/swift/obj/diskfile.py index f6207cfbf9..7ddd71e7ac 100644 --- a/swift/obj/diskfile.py +++ b/swift/obj/diskfile.py @@ -66,7 +66,7 @@ from swift.common.utils import mkdirs, Timestamp, \ get_md5_socket, F_SETPIPE_SZ, decode_timestamps, encode_timestamps, \ MD5_OF_EMPTY_STRING, link_fd_to_path, \ O_TMPFILE, makedirs_count, replace_partition_in_path, remove_directory, \ - md5, is_file_older + md5, is_file_older, non_negative_float from swift.common.splice import splice, tee from swift.common.exceptions import DiskFileQuarantined, DiskFileNotExist, \ DiskFileCollision, DiskFileNoSpace, DiskFileDeviceUnavailable, \ @@ -81,6 +81,7 @@ from swift.common.storage_policy import ( PICKLE_PROTOCOL = 2 DEFAULT_RECLAIM_AGE = timedelta(weeks=1).total_seconds() +DEFAULT_COMMIT_WINDOW = 60.0 HASH_FILE = 'hashes.pkl' HASH_INVALIDATIONS_FILE = 'hashes.invalid' METADATA_KEY = b'user.swift.metadata' @@ -708,6 +709,8 @@ class BaseDiskFileManager(object): self.bytes_per_sync = int(conf.get('mb_per_sync', 512)) * 1024 * 1024 self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.reclaim_age = int(conf.get('reclaim_age', DEFAULT_RECLAIM_AGE)) + self.commit_window = non_negative_float(conf.get( + 'commit_window', DEFAULT_COMMIT_WINDOW)) replication_concurrency_per_device = conf.get( 'replication_concurrency_per_device') replication_one_per_device = conf.get('replication_one_per_device') @@ -1102,8 +1105,14 @@ class BaseDiskFileManager(object): remove_file(join(hsh_path, results['ts_info']['filename'])) files.remove(results.pop('ts_info')['filename']) for file_info in results.get('possible_reclaim', []): - # stray files are not deleted until reclaim-age - if is_reclaimable(file_info['timestamp']): + # stray files are not deleted until reclaim-age; non-durable data + # files are not deleted unless they were written before + # commit_window + filepath = join(hsh_path, file_info['filename']) + if (is_reclaimable(file_info['timestamp']) and + (file_info.get('durable', True) or + self.commit_window <= 0 or + is_file_older(filepath, self.commit_window))): results.setdefault('obsolete', []).append(file_info) for file_info in results.get('obsolete', []): remove_file(join(hsh_path, file_info['filename'])) @@ -3625,7 +3634,8 @@ class ECDiskFileManager(BaseDiskFileManager): results.setdefault('obsolete', []).extend(exts['.durable']) exts.pop('.durable') - # Fragments *may* be ready for reclaim, unless they are durable + # Fragments *may* be ready for reclaim, unless they are most recent + # durable for frag_set in frag_sets.values(): if frag_set in (durable_frag_set, chosen_frag_set): continue diff --git a/test/unit/obj/test_diskfile.py b/test/unit/obj/test_diskfile.py index 12486f15a9..1179f2ff3c 100644 --- a/test/unit/obj/test_diskfile.py +++ b/test/unit/obj/test_diskfile.py @@ -1054,6 +1054,48 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin): policy=policy, frag_index=frag_index, **kwargs) + def test_init(self): + for policy in POLICIES: + df_router = diskfile.DiskFileRouter({}, self.logger) + df_mgr = df_router[policy] + self.assertEqual('/srv/node', df_mgr.devices) + self.assertEqual(604800, df_mgr.reclaim_age) + self.assertEqual(60.0, df_mgr.commit_window) + self.assertTrue(df_mgr.mount_check) + + for policy in POLICIES: + conf = dict(devices=self.testdir, + mount_check='false', + reclaim_age=1000, + commit_window=10.1) + df_router = diskfile.DiskFileRouter(conf, self.logger) + df_mgr = df_router[policy] + self.assertEqual(self.testdir, df_mgr.devices) + self.assertEqual(1000, df_mgr.reclaim_age) + self.assertEqual(10.1, df_mgr.commit_window) + self.assertFalse(df_mgr.mount_check) + + def test_init_commit_window(self): + def assert_ok(value, expected): + for policy in POLICIES: + conf = {'commit_window': value} + df_mgr = diskfile.DiskFileRouter(conf, self.logger)[policy] + self.assertEqual(expected, df_mgr.commit_window) + + assert_ok(10.1, 10.1) + assert_ok('10.1', 10.1) + assert_ok(0, 0.0) + + def assert_invalid(value): + for policy in POLICIES: + conf = {'commit_window': value} + with self.assertRaises(ValueError): + diskfile.DiskFileRouter(conf, self.logger)[policy] + + assert_invalid(-1.1) + assert_invalid('-1.1') + assert_invalid('auto') + def test_cleanup_uses_configured_reclaim_age(self): # verify that the reclaim_age used when cleaning up tombstones is # either the default or the configured value @@ -1108,7 +1150,7 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin): shuffle(files) def _test_cleanup_ondisk_files(self, scenarios, policy, - reclaim_age=None): + reclaim_age=None, commit_window=None): # check that expected files are left in hashdir after cleanup for test in scenarios: class_under_test = self.df_router[policy] @@ -1120,6 +1162,8 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin): open(os.path.join(hashdir, fname), 'w') expected_after_cleanup = set([f[0] for f in test if (f[2] if len(f) > 2 else f[1])]) + if commit_window is not None: + class_under_test.commit_window = commit_window if reclaim_age: class_under_test.reclaim_age = reclaim_age class_under_test.cleanup_ondisk_files(hashdir) @@ -1268,7 +1312,7 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin): ('%s.ts' % older, False, False)]] self._test_cleanup_ondisk_files(scenarios, POLICIES.default, - reclaim_age=1000) + reclaim_age=1000, commit_window=0) def test_construct_dev_path(self): res_path = self.df_mgr.construct_dev_path('abc') @@ -2693,7 +2737,7 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase): ('%s.durable' % much_older, False, False)]] self._test_cleanup_ondisk_files(scenarios, POLICIES.default, - reclaim_age=1000) + reclaim_age=1000, commit_window=0) def test_cleanup_ondisk_files_reclaim_with_data_files(self): # Each scenario specifies a list of (filename, extension, [survives]) @@ -2739,7 +2783,33 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase): ('%s#4.data' % much_older, False, False)]] self._test_cleanup_ondisk_files(scenarios, POLICIES.default, - reclaim_age=1000) + reclaim_age=1000, commit_window=0) + + def test_cleanup_ondisk_files_commit_window(self): + # verify that non-durable files are not reclaimed regardless of + # timestamp if written to disk within commit_window + much_older = Timestamp(time() - 1001).internal + older = Timestamp(time() - 1001).internal + newer = Timestamp(time() - 900).internal + scenarios = [ + # recently written nondurables not cleaned up + [('%s#1.data' % older, True), + ('%s#2.data' % newer, True), + ('%s.meta' % much_older, False), + ('%s.ts' % much_older, False)]] + self._test_cleanup_ondisk_files(scenarios, POLICIES.default, + reclaim_age=1000, commit_window=60) + + # ... but if commit_window is reduced then recently written files are + # cleaned up + scenarios = [ + # older *timestamps* cleaned up + [('%s#1.data' % older, False), + ('%s#2.data' % newer, True), + ('%s.meta' % much_older, False), + ('%s.ts' % much_older, False)]] + self._test_cleanup_ondisk_files(scenarios, POLICIES.default, + reclaim_age=1000, commit_window=0) def test_get_ondisk_files_with_stray_meta(self): # get_ondisk_files ignores a stray .meta file @@ -6820,6 +6890,7 @@ class TestSuffixHashes(unittest.TestCase): self.assertRaises(output_files.__class__, df_mgr.cleanup_ondisk_files, path) return + df_mgr.commit_window = 0 files = df_mgr.cleanup_ondisk_files('/whatever')['files'] self.assertEqual(files, output_files) if files: @@ -7701,6 +7772,7 @@ class TestSuffixHashes(unittest.TestCase): # get_hashes does not trigger reclaim because the suffix has # MD5_OF_EMPTY_STRING in hashes.pkl df_mgr.reclaim_age = 500 + df_mgr.commit_window = 0 hashes = df_mgr.get_hashes('sda1', '0', [], policy) self.assertEqual([ts_meta.internal + '.meta'], os.listdir(df._datadir)) diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py index 51bf5d9308..81ae808152 100644 --- a/test/unit/obj/test_reconstructor.py +++ b/test/unit/obj/test_reconstructor.py @@ -269,7 +269,6 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase): dev_path = os.path.join(self.reconstructor.devices_dir, self.ec_local_dev['device']) self.ec_obj_path = os.path.join(dev_path, data_dir) - # create a bunch of FA's to test t = 1421181937.70054 # time.time() with mock.patch('swift.obj.diskfile.time') as mock_time: @@ -1252,6 +1251,153 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase): self.assertFalse(os.path.exists(datafile_recent)) self.assertFalse(os.path.exists(datafile_older)) + def test_sync_old_nondurable_before_committed_non_zero_commit_window(self): + # verify that a *recently written* nondurable fragment survives being + # visited by the reconstructor, despite having timestamp older than + # reclaim_age + shutil.rmtree(self.ec_obj_path) + ips = utils.whataremyips(self.reconstructor.bind_ip) + local_devs = [dev for dev in self.ec_obj_ring.devs + if dev and dev['replication_ip'] in ips and + dev['replication_port'] == + self.reconstructor.port] + partition = local_devs[0]['id'] + # recently written, recent timestamp non-durable + ts_recent = Timestamp(time.time()) + df_mgr = self.reconstructor._df_router[self.policy] + reclaim_age = df_mgr.reclaim_age + df_recent = self._create_diskfile( + object_name='recent', part=partition, commit=False, + timestamp=ts_recent, frag_index=4) + datafile_recent = df_recent.manager.cleanup_ondisk_files( + df_recent._datadir, frag_prefs=[])['data_file'] + + # recently written but old timestamp non-durable + ts_old = Timestamp(time.time() - reclaim_age - 1) + df_older = self._create_diskfile( + object_name='older', part=partition, commit=False, + timestamp=ts_old, frag_index=4) + datafile_older = df_older.manager.cleanup_ondisk_files( + df_older._datadir, frag_prefs=[])['data_file'] + self.assertTrue(os.path.exists(datafile_recent)) + self.assertTrue(os.path.exists(datafile_older)) + + # for this test we don't actually need to ssync anything, so pretend + # all suffixes are in sync + self.reconstructor._get_suffixes_to_sync = ( + lambda job, node: ([], node)) + df_mgr.commit_window = 1000 # avoid non-durables being reclaimed + self.reconstructor.reconstruct() + # neither nondurable should be removed yet with default commit_window + # because their mtimes are too recent + self.assertTrue(os.path.exists(datafile_recent)) + self.assertTrue(os.path.exists(datafile_older)) + # and we can still make the nondurables durable + df_recent.writer().commit(ts_recent) + self.assertTrue(os.path.exists(datafile_recent.replace('#4', '#4#d'))) + df_older.writer().commit(ts_old) + self.assertTrue(os.path.exists(datafile_older.replace('#4', '#4#d'))) + + def test_sync_old_nondurable_before_committed_zero_commit_window(self): + # verify that a *recently written* nondurable fragment won't survive + # being visited by the reconstructor if its timestamp is older than + # reclaim_age and commit_window is zero; this test illustrates the + # potential data loss bug that commit_window addresses + shutil.rmtree(self.ec_obj_path) + ips = utils.whataremyips(self.reconstructor.bind_ip) + local_devs = [dev for dev in self.ec_obj_ring.devs + if dev and dev['replication_ip'] in ips and + dev['replication_port'] == + self.reconstructor.port] + partition = local_devs[0]['id'] + # recently written, recent timestamp non-durable + ts_recent = Timestamp(time.time()) + df_mgr = self.reconstructor._df_router[self.policy] + reclaim_age = df_mgr.reclaim_age + df_recent = self._create_diskfile( + object_name='recent', part=partition, commit=False, + timestamp=ts_recent, frag_index=4) + datafile_recent = df_recent.manager.cleanup_ondisk_files( + df_recent._datadir, frag_prefs=[])['data_file'] + + # recently written but old timestamp non-durable + ts_old = Timestamp(time.time() - reclaim_age - 1) + df_older = self._create_diskfile( + object_name='older', part=partition, commit=False, + timestamp=ts_old, frag_index=4) + datafile_older = df_older.manager.cleanup_ondisk_files( + df_older._datadir, frag_prefs=[])['data_file'] + self.assertTrue(os.path.exists(datafile_recent)) + self.assertTrue(os.path.exists(datafile_older)) + + # for this test we don't actually need to ssync anything, so pretend + # all suffixes are in sync + self.reconstructor._get_suffixes_to_sync = ( + lambda job, node: ([], node)) + df_mgr.commit_window = 0 + with mock.patch( + 'swift.obj.diskfile.is_file_older') as mock_is_file_older: + self.reconstructor.reconstruct() + # older nondurable will be removed with commit_window = 0 + self.assertTrue(os.path.exists(datafile_recent)) + self.assertFalse(os.path.exists(datafile_older)) + df_recent.writer().commit(ts_recent) + self.assertTrue(os.path.exists(datafile_recent.replace('#4', '#4#d'))) + # ...and attempt to commit will fail :( + with self.assertRaises(DiskFileError): + df_older.writer().commit(ts_old) + # with zero commit_window the call to stat the file is not made + mock_is_file_older.assert_not_called() + + def test_sync_old_nondurable_before_committed_past_commit_window(self): + # verify that a *not so recently written* nondurable fragment won't + # survive being visited by the reconstructor if its timestamp is older + # than reclaim_age + shutil.rmtree(self.ec_obj_path) + ips = utils.whataremyips(self.reconstructor.bind_ip) + local_devs = [dev for dev in self.ec_obj_ring.devs + if dev and dev['replication_ip'] in ips and + dev['replication_port'] == + self.reconstructor.port] + partition = local_devs[0]['id'] + # recently written, recent timestamp non-durable + ts_recent = Timestamp(time.time()) + df_mgr = self.reconstructor._df_router[self.policy] + reclaim_age = df_mgr.reclaim_age + df_recent = self._create_diskfile( + object_name='recent', part=partition, commit=False, + timestamp=ts_recent, frag_index=4) + datafile_recent = df_recent.manager.cleanup_ondisk_files( + df_recent._datadir, frag_prefs=[])['data_file'] + + # recently written but old timestamp non-durable + ts_old = Timestamp(time.time() - reclaim_age - 1) + df_older = self._create_diskfile( + object_name='older', part=partition, commit=False, + timestamp=ts_old, frag_index=4) + datafile_older = df_older.manager.cleanup_ondisk_files( + df_older._datadir, frag_prefs=[])['data_file'] + # pretend file was written more than commit_window seconds ago + now = time.time() + os.utime(datafile_older, (now - 60.1, now - 60.1)) + self.assertTrue(os.path.exists(datafile_recent)) + self.assertTrue(os.path.exists(datafile_older)) + + # for this test we don't actually need to ssync anything, so pretend + # all suffixes are in sync + self.reconstructor._get_suffixes_to_sync = ( + lambda job, node: ([], node)) + # leave commit_window at default of 60 seconds + self.reconstructor.reconstruct() + # older nondurable will be removed + self.assertTrue(os.path.exists(datafile_recent)) + self.assertFalse(os.path.exists(datafile_older)) + df_recent.writer().commit(ts_recent) + self.assertTrue(os.path.exists(datafile_recent.replace('#4', '#4#d'))) + # ...and attempt to commit will fail :( + with self.assertRaises(DiskFileError): + df_older.writer().commit(ts_old) + def test_no_delete_failed_revert(self): # test will only process revert jobs self.reconstructor.handoffs_only = True