Fix purge for tombstone-only REVERT job
When we revert a partition we normally push it off to the specific primary node for the frag index of the data files in the partition. However, when a partition is devoid of any data files (only tombstones) we build a REVERT job with a frag_index of None.

This change updates ECDiskFile's purge method to be robust to purging tombstones when frag_index is None.

Also add a probe test to validate that tombstone-only revert jobs clean themselves up once they can confirm they are in sync with a part-replica count of nodes, even if one of the primaries is down (in which case they sync tombstones with other handoffs that fill in for the down primary).

Change-Id: Ib9a42f412fb90d51959efce886c0f8952aba8d85
parent a3facce53c
commit 369447ec47
@@ -2166,9 +2166,14 @@ class ECDiskFile(BaseDiskFile):
         :param timestamp: the object timestamp, an instance of
             :class:`~swift.common.utils.Timestamp`
-        :param frag_index: a fragment archive index, must be a whole number.
+        :param frag_index: fragment archive index, must be
+            a whole number or None.
         """
-        for ext in ('.data', '.ts'):
+        exts = ['.ts']
+        # when frag_index is None it's not possible to build a data file name
+        if frag_index is not None:
+            exts.append('.data')
+        for ext in exts:
             purge_file = self.manager.make_on_disk_filename(
                 timestamp, ext=ext, frag_index=frag_index)
             remove_file(os.path.join(self._datadir, purge_file))
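For readers outside the diff context, a minimal standalone sketch of the new extension-selection logic. Here make_name is a hypothetical stand-in for DiskFileManager.make_on_disk_filename, and the '#<frag>' suffix mirrors Swift's usual EC data-file naming; neither is taken from this diff.

    # Standalone sketch; `make_name` is a hypothetical stand-in for
    # DiskFileManager.make_on_disk_filename.
    def make_name(timestamp, ext, frag_index):
        if ext == '.data':
            # EC data files embed the fragment index, e.g. '<ts>#3.data'
            return '%s#%d%s' % (timestamp, frag_index, ext)
        return '%s%s' % (timestamp, ext)

    def purge_names(timestamp, frag_index):
        """Return the file names a purge would try to remove."""
        exts = ['.ts']
        # without a frag_index there is no way to name a .data file
        if frag_index is not None:
            exts.append('.data')
        return [make_name(timestamp, ext, frag_index) for ext in exts]

    print(purge_names('1234567890.12345', None))  # ['1234567890.12345.ts']
    print(purge_names('1234567890.12345', 3))     # adds '1234567890.12345#3.data'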
@@ -542,6 +542,9 @@ class ObjectReconstructor(Daemon):
                     frag_index=frag_index)
                 df.purge(Timestamp(timestamp), frag_index)
             except DiskFileError:
+                self.logger.exception(
+                    'Unable to purge DiskFile (%r %r %r)',
+                    object_hash, timestamp, frag_index)
                 continue

     def process_job(self, job):
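The added lines follow the usual log-and-continue cleanup pattern: record the full traceback but never let one bad diskfile abort cleanup of the rest of the partition. A minimal sketch, with purge_one as an assumed placeholder for the per-object work:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('revert-cleanup-sketch')

    def purge_one(object_hash, timestamp):
        # assumed placeholder for the real per-object purge
        raise IOError('disk went away')

    def purge_all(entries):
        for object_hash, timestamp in entries:
            try:
                purge_one(object_hash, timestamp)
            except IOError:
                # log with traceback, then keep going with the next object
                logger.exception('Unable to purge DiskFile (%r %r)',
                                 object_hash, timestamp)
                continue

    purge_all([('deadbeef', '1234567890.12345')])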
@@ -15,6 +15,7 @@
 # limitations under the License.

 from hashlib import md5
+import itertools
 import unittest
 import uuid
 import random
@@ -94,7 +95,7 @@ class TestReconstructorRevert(ECProbeTest):
             self.object_name, headers=headers_post)
         del headers_post['X-Auth-Token']  # WTF, where did this come from?

-        # these primaries can't servce the data any more, we expect 507
+        # these primaries can't serve the data any more, we expect 507
         # here and not 404 because we're using mount_check to kill nodes
         for onode in (onodes[0], onodes[1]):
             try:
@@ -102,7 +103,7 @@ class TestReconstructorRevert(ECProbeTest):
             except direct_client.DirectClientException as err:
                 self.assertEqual(err.http_status, 507)
             else:
-                self.fail('Node data on %r was not fully destoryed!' %
+                self.fail('Node data on %r was not fully destroyed!' %
                           (onode,))

         # now take out another primary
@@ -115,7 +116,7 @@ class TestReconstructorRevert(ECProbeTest):
             except direct_client.DirectClientException as err:
                 self.assertEqual(err.http_status, 507)
             else:
-                self.fail('Node data on %r was not fully destoryed!' %
+                self.fail('Node data on %r was not fully destroyed!' %
                           (onode,))

         # make sure we can still GET the object and its correct
@@ -152,10 +153,10 @@ class TestReconstructorRevert(ECProbeTest):
             except direct_client.DirectClientException as err:
                 self.assertEqual(err.http_status, 404)
             else:
-                self.fail('Node data on %r was not fully destoryed!' %
+                self.fail('Node data on %r was not fully destroyed!' %
                           (hnode,))

-    def test_delete_propogate(self):
+    def test_delete_propagate(self):
         # create EC container
         headers = {'X-Storage-Policy': self.policy.name}
         client.put_container(self.url, self.token, self.container_name,
@@ -164,56 +165,95 @@ class TestReconstructorRevert(ECProbeTest):
         # get our node lists
         opart, onodes = self.object_ring.get_nodes(
             self.account, self.container_name, self.object_name)
-        hnodes = self.object_ring.get_more_nodes(opart)
-        p_dev2 = self.device_dir('object', onodes[1])
+        hnodes = list(itertools.islice(
+            self.object_ring.get_more_nodes(opart), 2))

         # PUT object
         contents = Body()
         client.put_object(self.url, self.token, self.container_name,
                           self.object_name, contents=contents)

-        # now lets shut one down
-        self.kill_drive(p_dev2)
+        # now lets shut down a couple primaries
+        failed_nodes = random.sample(onodes, 2)
+        for node in failed_nodes:
+            self.kill_drive(self.device_dir('object', node))

-        # delete on the ones that are left
+        # Write tombstones over the nodes that are still online
         client.delete_object(self.url, self.token,
                              self.container_name,
                              self.object_name)

-        # spot check a node
+        # spot check the primary nodes that are still online
+        delete_timestamp = None
+        for node in onodes:
+            if node in failed_nodes:
+                continue
+            try:
+                self.direct_get(node, opart)
+            except direct_client.DirectClientException as err:
+                self.assertEqual(err.http_status, 404)
+                delete_timestamp = err.http_headers['X-Backend-Timestamp']
+            else:
+                self.fail('Node data on %r was not fully destroyed!' %
+                          (node,))
+
+        # repair the first primary
+        self.revive_drive(self.device_dir('object', failed_nodes[0]))
+
+        # run the reconstructor on the *second* handoff node
+        self.reconstructor.once(number=self.config_number(hnodes[1]))
+
+        # make sure it's tombstone was pushed out
         try:
-            self.direct_get(onodes[0], opart)
+            self.direct_get(hnodes[1], opart)
         except direct_client.DirectClientException as err:
             self.assertEqual(err.http_status, 404)
+            self.assertNotIn('X-Backend-Timestamp', err.http_headers)
         else:
-            self.fail('Node data on %r was not fully destoryed!' %
-                      (onodes[0],))
+            self.fail('Found obj data on %r' % hnodes[1])

-        # enable the first node again
-        self.revive_drive(p_dev2)
-
-        # propagate the delete...
-        # fire up reconstructor on handoff nodes only
-        for hnode in hnodes:
-            hnode_id = (hnode['port'] - 6000) / 10
-            self.reconstructor.once(number=hnode_id)
-
-        # check the first node to make sure its gone
+        # ... and it's on the first failed (now repaired) primary
         try:
-            self.direct_get(onodes[1], opart)
+            self.direct_get(failed_nodes[0], opart)
         except direct_client.DirectClientException as err:
             self.assertEqual(err.http_status, 404)
+            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
+                             delete_timestamp)
         else:
-            self.fail('Node data on %r was not fully destoryed!' %
-                      (onodes[0]))
+            self.fail('Found obj data on %r' % failed_nodes[0])

-        # make sure proxy get can't find it
+        # repair the second primary
+        self.revive_drive(self.device_dir('object', failed_nodes[1]))
+
+        # run the reconstructor on the *first* handoff node
+        self.reconstructor.once(number=self.config_number(hnodes[0]))
+
+        # make sure it's tombstone was pushed out
+        try:
+            self.direct_get(hnodes[0], opart)
+        except direct_client.DirectClientException as err:
+            self.assertEqual(err.http_status, 404)
+            self.assertNotIn('X-Backend-Timestamp', err.http_headers)
+        else:
+            self.fail('Found obj data on %r' % hnodes[0])
+
+        # ... and now it's on the second failed primary too!
+        try:
+            self.direct_get(failed_nodes[1], opart)
+        except direct_client.DirectClientException as err:
+            self.assertEqual(err.http_status, 404)
+            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
+                             delete_timestamp)
+        else:
+            self.fail('Found obj data on %r' % failed_nodes[1])
+
+        # sanity make sure proxy get can't find it
         try:
             self.proxy_get()
         except Exception as err:
             self.assertEqual(err.http_status, 404)
         else:
-            self.fail('Node data on %r was not fully destoryed!' %
+            self.fail('Node data on %r was not fully destroyed!' %
                       (onodes[0]))

     def test_reconstruct_from_reverted_fragment_archive(self):
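One incidental cleanup in the hunk above: the old code derived the reconstructor's config number inline from the node's port, and the new code calls config_number, which presumably wraps the same arithmetic in the probe-test base class. A sketch of that mapping, assuming the probe environment's usual port layout (object server N listens on 6000 + 10 * N):

    def config_number(node, base_port=6000):
        # picks the matching object-server N.conf for this node
        return (node['port'] - base_port) // 10

    assert config_number({'port': 6010}) == 1
    assert config_number({'port': 6040}) == 4

Note the sketch uses `//`; the removed inline `/ 10` relied on Python 2 integer division.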
@@ -3665,6 +3665,18 @@ class TestECDiskFile(DiskFileMixin, unittest.TestCase):
         df.purge(ts, 3)
         self.assertEqual(sorted(os.listdir(df._datadir)), [])

+    def test_purge_without_frag(self):
+        ts = self.ts()
+        df = self._simple_get_diskfile()
+        df.delete(ts)
+
+        # sanity
+        self.assertEqual(sorted(os.listdir(df._datadir)), [
+            ts.internal + '.ts',
+        ])
+        df.purge(ts, None)
+        self.assertEqual(sorted(os.listdir(df._datadir)), [])
+
     def test_purge_old_tombstone(self):
         old_ts = self.ts()
         ts = self.ts()
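The sanity assertion above leans on tombstone naming, where Timestamp.internal yields the zero-padded timestamp string. For reference, a quick illustration of the two name shapes involved (the literal value is made up):

    internal = '1234567890.12345'      # shape of Timestamp.internal
    tombstone = internal + '.ts'       # removed by purge(ts, None)
    data_file = internal + '#3.data'   # additionally removed when frag_index=3
    print(tombstone, data_file)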
@@ -2415,11 +2415,8 @@ class TestObjectReconstructor(unittest.TestCase):
         self.assertFalse(os.access(df._datadir, os.F_OK))

     def test_process_job_revert_cleanup_tombstone(self):
-        replicas = self.policy.object_ring.replicas
-        frag_index = random.randint(0, replicas - 1)
         sync_to = [random.choice([n for n in self.policy.object_ring.devs
                                   if n != self.local_dev])]
-        sync_to[0]['index'] = frag_index
         partition = 0

         part_path = os.path.join(self.devices, self.local_dev['device'],
@@ -2437,7 +2434,7 @@ class TestObjectReconstructor(unittest.TestCase):

         job = {
             'job_type': object_reconstructor.REVERT,
-            'frag_index': frag_index,
+            'frag_index': None,
             'suffixes': [suffix],
             'sync_to': sync_to,
             'partition': partition,
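To connect this unit test back to the commit message: the reconstructor builds revert jobs keyed on the fragment indexes found in the partition, and a frag_index of None when only tombstones remain. A condensed, hypothetical sketch of that decision, not Swift's actual job builder:

    def build_revert_jobs(frag_indexes_found, partition):
        """frag_indexes_found: frag indexes seen in the partition's .data
        files; empty when the partition holds nothing but tombstones."""
        if not frag_indexes_found:
            return [{'job_type': 'revert', 'frag_index': None,
                     'partition': partition}]
        return [{'job_type': 'revert', 'frag_index': fi,
                 'partition': partition}
                for fi in sorted(frag_indexes_found)]

    print(build_revert_jobs(set(), 0))    # tombstone-only: frag_index is None
    print(build_revert_jobs({2, 5}, 0))   # one job per fragment index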