From ff5ea003b3a1b37d8417aa17d3521237768dfe62 Mon Sep 17 00:00:00 2001 From: Tim Burke Date: Tue, 20 Aug 2019 22:20:44 -0700 Subject: [PATCH] ec: log durability of frags that fail to reconstruct Whether the frag is durable or non-durable greatly affects how much I care whether I can reconstruct it. Change-Id: Ie6f46267d4bb567ecc0cc195d1fd7ce55c8cb325 --- swift/obj/reconstructor.py | 30 ++++++++++++++++++----------- test/unit/obj/test_reconstructor.py | 22 ++++++++++++--------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/swift/obj/reconstructor.py b/swift/obj/reconstructor.py index b8f18de4dc..02f3bb7bf8 100644 --- a/swift/obj/reconstructor.py +++ b/swift/obj/reconstructor.py @@ -401,6 +401,7 @@ class ObjectReconstructor(Daemon): path, headers, full_get_path) buckets = defaultdict(dict) + durable_buckets = {} etag_buckets = {} error_resp_count = 0 for resp in pile: @@ -444,6 +445,10 @@ class ObjectReconstructor(Daemon): continue timestamp = Timestamp(timestamp) + durable = resp.headers.get('X-Backend-Durable-Timestamp') + if durable: + durable_buckets[Timestamp(durable)] = True + etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag') if not etag: self.logger.warning('Invalid resp from %s, frag index %s ' @@ -469,26 +474,29 @@ class ObjectReconstructor(Daemon): % (fi_to_rebuild, list(buckets[timestamp]))) break else: + path = _full_path(node, job['partition'], + datafile_metadata['name'], + job['policy']) + for timestamp, resp in sorted(buckets.items()): etag = etag_buckets[timestamp] + durable = durable_buckets.get(timestamp) self.logger.error( - 'Unable to get enough responses (%s/%s) ' - 'to reconstruct %s frag#%s with ETag %s' % ( + 'Unable to get enough responses (%s/%s) to reconstruct ' + '%s %s frag#%s with ETag %s and timestamp %s' % ( len(resp), job['policy'].ec_ndata, - _full_path(node, job['partition'], - datafile_metadata['name'], - job['policy']), - fi_to_rebuild, etag)) + 'durable' if durable else 'non-durable', + path, fi_to_rebuild, etag, timestamp.internal)) if error_resp_count: + durable = durable_buckets.get(Timestamp( + datafile_metadata['X-Timestamp'])) self.logger.error( 'Unable to get enough responses (%s error responses) ' - 'to reconstruct %s frag#%s' % ( + 'to reconstruct %s %s frag#%s' % ( error_resp_count, - _full_path(node, job['partition'], - datafile_metadata['name'], - job['policy']), - fi_to_rebuild)) + 'durable' if durable else 'non-durable', + path, fi_to_rebuild)) raise DiskFileError('Unable to reconstruct EC archive') diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py index efce60ad6a..075246751d 100644 --- a/test/unit/obj/test_reconstructor.py +++ b/test/unit/obj/test_reconstructor.py @@ -5031,16 +5031,18 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): archive_bodies = encode_frag_archive_bodies(self.policy, body) # pop the index to the destination node archive_bodies.pop(1) - ec_archive_dict[ - (md5(body).hexdigest(), next(ts).internal)] = archive_bodies + key = (md5(body).hexdigest(), next(ts).internal, bool(i % 2)) + ec_archive_dict[key] = archive_bodies responses = list() # fill out response list by 3 different etag bodies - for etag, ts in itertools.cycle(ec_archive_dict): - body = ec_archive_dict[(etag, ts)].pop(0) + for etag, ts, durable in itertools.cycle(ec_archive_dict): + body = ec_archive_dict[(etag, ts, durable)].pop(0) headers = get_header_frag_index(self, body) headers.update({'X-Object-Sysmeta-Ec-Etag': etag, 'X-Backend-Timestamp': ts}) + if durable: + headers['X-Backend-Durable-Timestamp'] = ts responses.append((200, body, headers)) if len(responses) >= (self.policy.object_ring.replicas - 1): break @@ -5063,7 +5065,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): # 1 error log per etag to report not enough responses self.assertEqual(3, len(error_lines)) for error_line in error_lines: - for expected_etag, ts in ec_archive_dict: + for expected_etag, ts, durable in ec_archive_dict: if expected_etag in error_line: break else: @@ -5072,13 +5074,15 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): (list(ec_archive_dict), error_line)) # remove the found etag which should not be found in the # following error lines - del ec_archive_dict[(expected_etag, ts)] + del ec_archive_dict[(expected_etag, ts, durable)] expected = 'Unable to get enough responses (%s/10) to ' \ - 'reconstruct 10.0.0.1:1001/sdb/0%s policy#0 ' \ - 'frag#1 with ETag' % \ + 'reconstruct %s 10.0.0.1:1001/sdb/0%s policy#0 ' \ + 'frag#1 with ETag %s and timestamp %s' % \ (etag_count[expected_etag], - self.obj_path.decode('utf8')) + 'durable' if durable else 'non-durable', + self.obj_path.decode('utf8'), + expected_etag, ts) self.assertIn( expected, error_line, "Unexpected error line found: Expected: %s Got: %s"