ec: log durability of frags that fail to reconstruct

Whether the frag is durable or non-durable greatly affects how much I
care whether I can reconstruct it.

Change-Id: Ie6f46267d4bb567ecc0cc195d1fd7ce55c8cb325
This commit is contained in:
Tim Burke 2019-08-20 22:20:44 -07:00
parent f70520239c
commit ff5ea003b3
2 changed files with 32 additions and 20 deletions

View File

@ -401,6 +401,7 @@ class ObjectReconstructor(Daemon):
path, headers, full_get_path) path, headers, full_get_path)
buckets = defaultdict(dict) buckets = defaultdict(dict)
durable_buckets = {}
etag_buckets = {} etag_buckets = {}
error_resp_count = 0 error_resp_count = 0
for resp in pile: for resp in pile:
@ -444,6 +445,10 @@ class ObjectReconstructor(Daemon):
continue continue
timestamp = Timestamp(timestamp) timestamp = Timestamp(timestamp)
durable = resp.headers.get('X-Backend-Durable-Timestamp')
if durable:
durable_buckets[Timestamp(durable)] = True
etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag') etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag')
if not etag: if not etag:
self.logger.warning('Invalid resp from %s, frag index %s ' self.logger.warning('Invalid resp from %s, frag index %s '
@ -469,26 +474,29 @@ class ObjectReconstructor(Daemon):
% (fi_to_rebuild, list(buckets[timestamp]))) % (fi_to_rebuild, list(buckets[timestamp])))
break break
else: else:
path = _full_path(node, job['partition'],
datafile_metadata['name'],
job['policy'])
for timestamp, resp in sorted(buckets.items()): for timestamp, resp in sorted(buckets.items()):
etag = etag_buckets[timestamp] etag = etag_buckets[timestamp]
durable = durable_buckets.get(timestamp)
self.logger.error( self.logger.error(
'Unable to get enough responses (%s/%s) ' 'Unable to get enough responses (%s/%s) to reconstruct '
'to reconstruct %s frag#%s with ETag %s' % ( '%s %s frag#%s with ETag %s and timestamp %s' % (
len(resp), job['policy'].ec_ndata, len(resp), job['policy'].ec_ndata,
_full_path(node, job['partition'], 'durable' if durable else 'non-durable',
datafile_metadata['name'], path, fi_to_rebuild, etag, timestamp.internal))
job['policy']),
fi_to_rebuild, etag))
if error_resp_count: if error_resp_count:
durable = durable_buckets.get(Timestamp(
datafile_metadata['X-Timestamp']))
self.logger.error( self.logger.error(
'Unable to get enough responses (%s error responses) ' 'Unable to get enough responses (%s error responses) '
'to reconstruct %s frag#%s' % ( 'to reconstruct %s %s frag#%s' % (
error_resp_count, error_resp_count,
_full_path(node, job['partition'], 'durable' if durable else 'non-durable',
datafile_metadata['name'], path, fi_to_rebuild))
job['policy']),
fi_to_rebuild))
raise DiskFileError('Unable to reconstruct EC archive') raise DiskFileError('Unable to reconstruct EC archive')

View File

@ -5031,16 +5031,18 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
archive_bodies = encode_frag_archive_bodies(self.policy, body) archive_bodies = encode_frag_archive_bodies(self.policy, body)
# pop the index to the destination node # pop the index to the destination node
archive_bodies.pop(1) archive_bodies.pop(1)
ec_archive_dict[ key = (md5(body).hexdigest(), next(ts).internal, bool(i % 2))
(md5(body).hexdigest(), next(ts).internal)] = archive_bodies ec_archive_dict[key] = archive_bodies
responses = list() responses = list()
# fill out response list by 3 different etag bodies # fill out response list by 3 different etag bodies
for etag, ts in itertools.cycle(ec_archive_dict): for etag, ts, durable in itertools.cycle(ec_archive_dict):
body = ec_archive_dict[(etag, ts)].pop(0) body = ec_archive_dict[(etag, ts, durable)].pop(0)
headers = get_header_frag_index(self, body) headers = get_header_frag_index(self, body)
headers.update({'X-Object-Sysmeta-Ec-Etag': etag, headers.update({'X-Object-Sysmeta-Ec-Etag': etag,
'X-Backend-Timestamp': ts}) 'X-Backend-Timestamp': ts})
if durable:
headers['X-Backend-Durable-Timestamp'] = ts
responses.append((200, body, headers)) responses.append((200, body, headers))
if len(responses) >= (self.policy.object_ring.replicas - 1): if len(responses) >= (self.policy.object_ring.replicas - 1):
break break
@ -5063,7 +5065,7 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
# 1 error log per etag to report not enough responses # 1 error log per etag to report not enough responses
self.assertEqual(3, len(error_lines)) self.assertEqual(3, len(error_lines))
for error_line in error_lines: for error_line in error_lines:
for expected_etag, ts in ec_archive_dict: for expected_etag, ts, durable in ec_archive_dict:
if expected_etag in error_line: if expected_etag in error_line:
break break
else: else:
@ -5072,13 +5074,15 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
(list(ec_archive_dict), error_line)) (list(ec_archive_dict), error_line))
# remove the found etag which should not be found in the # remove the found etag which should not be found in the
# following error lines # following error lines
del ec_archive_dict[(expected_etag, ts)] del ec_archive_dict[(expected_etag, ts, durable)]
expected = 'Unable to get enough responses (%s/10) to ' \ expected = 'Unable to get enough responses (%s/10) to ' \
'reconstruct 10.0.0.1:1001/sdb/0%s policy#0 ' \ 'reconstruct %s 10.0.0.1:1001/sdb/0%s policy#0 ' \
'frag#1 with ETag' % \ 'frag#1 with ETag %s and timestamp %s' % \
(etag_count[expected_etag], (etag_count[expected_etag],
self.obj_path.decode('utf8')) 'durable' if durable else 'non-durable',
self.obj_path.decode('utf8'),
expected_etag, ts)
self.assertIn( self.assertIn(
expected, error_line, expected, error_line,
"Unexpected error line found: Expected: %s Got: %s" "Unexpected error line found: Expected: %s Got: %s"