Fix purge for tombstone only REVERT job

When we revert a partition we normally push it off to the specific primary node for the index of the data files in the partition. However, when a partition is devoid of any data files (only tombstones) we build a REVERT job with a frag_index of None. This change updates the ECDiskFile's purge method to be robust to purging tombstones when the frag_index is None. Add probetest to validate tombstone only revert jobs will clean themselves up if they can validate they're in-sync with part-replica count nodes - even if one of the primaries is down (in which case they sync tombstones with other handoffs to fill in for the primaries) Change-Id: Ib9a42f412fb90d51959efce886c0f8952aba8d85
2015-08-27 15:09:29 -07:00 · 2015-08-27 15:09:29 -07:00 · 369447ec47
commit 369447ec47
parent a3facce53c
5 changed files with 92 additions and 35 deletions
--- a/swift/obj/diskfile.py
+++ b/swift/obj/diskfile.py
@ -2166,9 +2166,14 @@ class ECDiskFile(BaseDiskFile):

        :param timestamp: the object timestamp, an instance of
                          :class:`~swift.common.utils.Timestamp`
-        :param frag_index: a fragment archive index, must be a whole number.
+        :param frag_index: fragment archive index, must be
+                           a whole number or None.
        """
-        for ext in ('.data', '.ts'):
+        exts = ['.ts']
+        # when frag_index is None it's not possible to build a data file name
+        if frag_index is not None:
+            exts.append('.data')
+        for ext in exts:
            purge_file = self.manager.make_on_disk_filename(
                timestamp, ext=ext, frag_index=frag_index)
            remove_file(os.path.join(self._datadir, purge_file))
--- a/swift/obj/reconstructor.py
+++ b/swift/obj/reconstructor.py
@ -542,6 +542,9 @@ class ObjectReconstructor(Daemon):
                    frag_index=frag_index)
                df.purge(Timestamp(timestamp), frag_index)
            except DiskFileError:
+                self.logger.exception(
+                    'Unable to purge DiskFile (%r %r %r)',
+                    object_hash, timestamp, frag_index)
                continue

    def process_job(self, job):
--- a/test/probe/test_reconstructor_revert.py
+++ b/test/probe/test_reconstructor_revert.py
@ -15,6 +15,7 @@
 # limitations under the License.

 from hashlib import md5
+import itertools
 import unittest
 import uuid
 import random
@ -94,7 +95,7 @@ class TestReconstructorRevert(ECProbeTest):
                           self.object_name, headers=headers_post)
        del headers_post['X-Auth-Token']  # WTF, where did this come from?

-        # these primaries can't servce the data any more, we expect 507
+        # these primaries can't serve the data any more, we expect 507
        # here and not 404 because we're using mount_check to kill nodes
        for onode in (onodes[0], onodes[1]):
            try:
@ -102,7 +103,7 @@ class TestReconstructorRevert(ECProbeTest):
            except direct_client.DirectClientException as err:
                self.assertEqual(err.http_status, 507)
            else:
-                self.fail('Node data on %r was not fully destoryed!' %
+                self.fail('Node data on %r was not fully destroyed!' %
                          (onode,))

        # now take out another primary
@ -115,7 +116,7 @@ class TestReconstructorRevert(ECProbeTest):
        except direct_client.DirectClientException as err:
            self.assertEqual(err.http_status, 507)
        else:
-            self.fail('Node data on %r was not fully destoryed!' %
+            self.fail('Node data on %r was not fully destroyed!' %
                      (onode,))

        # make sure we can still GET the object and its correct
@ -152,10 +153,10 @@ class TestReconstructorRevert(ECProbeTest):
            except direct_client.DirectClientException as err:
                self.assertEqual(err.http_status, 404)
            else:
-                self.fail('Node data on %r was not fully destoryed!' %
+                self.fail('Node data on %r was not fully destroyed!' %
                          (hnode,))

-    def test_delete_propogate(self):
+    def test_delete_propagate(self):
        # create EC container
        headers = {'X-Storage-Policy': self.policy.name}
        client.put_container(self.url, self.token, self.container_name,
@ -164,56 +165,95 @@ class TestReconstructorRevert(ECProbeTest):
        # get our node lists
        opart, onodes = self.object_ring.get_nodes(
            self.account, self.container_name, self.object_name)
-        hnodes = self.object_ring.get_more_nodes(opart)
-        p_dev2 = self.device_dir('object', onodes[1])
+        hnodes = list(itertools.islice(
+            self.object_ring.get_more_nodes(opart), 2))

        # PUT object
        contents = Body()
        client.put_object(self.url, self.token, self.container_name,
                          self.object_name, contents=contents)

-        # now lets shut one down
-        self.kill_drive(p_dev2)
+        # now lets shut down a couple primaries
+        failed_nodes = random.sample(onodes, 2)
+        for node in failed_nodes:
+            self.kill_drive(self.device_dir('object', node))

-        # delete on the ones that are left
+        # Write tombstones over the nodes that are still online
        client.delete_object(self.url, self.token,
                             self.container_name,
                             self.object_name)

-        # spot check a node
+        # spot check the primary nodes that are still online
+        delete_timestamp = None
+        for node in onodes:
+            if node in failed_nodes:
+                continue
+            try:
+                self.direct_get(node, opart)
+            except direct_client.DirectClientException as err:
+                self.assertEqual(err.http_status, 404)
+                delete_timestamp = err.http_headers['X-Backend-Timestamp']
+            else:
+                self.fail('Node data on %r was not fully destroyed!' %
+                          (node,))
+
+        # repair the first primary
+        self.revive_drive(self.device_dir('object', failed_nodes[0]))
+
+        # run the reconstructor on the *second* handoff node
+        self.reconstructor.once(number=self.config_number(hnodes[1]))
+
+        # make sure it's tombstone was pushed out
        try:
-            self.direct_get(onodes[0], opart)
+            self.direct_get(hnodes[1], opart)
        except direct_client.DirectClientException as err:
            self.assertEqual(err.http_status, 404)
+            self.assertNotIn('X-Backend-Timestamp', err.http_headers)
        else:
-            self.fail('Node data on %r was not fully destoryed!' %
-                      (onodes[0],))
+            self.fail('Found obj data on %r' % hnodes[1])

-        # enable the first node again
-        self.revive_drive(p_dev2)
-
-        # propagate the delete...
-        # fire up reconstructor on handoff nodes only
-        for hnode in hnodes:
-            hnode_id = (hnode['port'] - 6000) / 10
-            self.reconstructor.once(number=hnode_id)
-
-        # check the first node to make sure its gone
+        # ... and it's on the first failed (now repaired) primary
        try:
-            self.direct_get(onodes[1], opart)
+            self.direct_get(failed_nodes[0], opart)
        except direct_client.DirectClientException as err:
            self.assertEqual(err.http_status, 404)
+            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
+                             delete_timestamp)
        else:
-            self.fail('Node data on %r was not fully destoryed!' %
-                      (onodes[0]))
+            self.fail('Found obj data on %r' % failed_nodes[0])

-        # make sure proxy get can't find it
+        # repair the second primary
+        self.revive_drive(self.device_dir('object', failed_nodes[1]))
+
+        # run the reconstructor on the *first* handoff node
+        self.reconstructor.once(number=self.config_number(hnodes[0]))
+
+        # make sure it's tombstone was pushed out
+        try:
+            self.direct_get(hnodes[0], opart)
+        except direct_client.DirectClientException as err:
+            self.assertEqual(err.http_status, 404)
+            self.assertNotIn('X-Backend-Timestamp', err.http_headers)
+        else:
+            self.fail('Found obj data on %r' % hnodes[0])
+
+        # ... and now it's on the second failed primary too!
+        try:
+            self.direct_get(failed_nodes[1], opart)
+        except direct_client.DirectClientException as err:
+            self.assertEqual(err.http_status, 404)
+            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
+                             delete_timestamp)
+        else:
+            self.fail('Found obj data on %r' % failed_nodes[1])
+
+        # sanity make sure proxy get can't find it
        try:
            self.proxy_get()
        except Exception as err:
            self.assertEqual(err.http_status, 404)
        else:
-            self.fail('Node data on %r was not fully destoryed!' %
+            self.fail('Node data on %r was not fully destroyed!' %
                      (onodes[0]))

    def test_reconstruct_from_reverted_fragment_archive(self):
--- a/test/unit/obj/test_diskfile.py
+++ b/test/unit/obj/test_diskfile.py
@ -3665,6 +3665,18 @@ class TestECDiskFile(DiskFileMixin, unittest.TestCase):
        df.purge(ts, 3)
        self.assertEqual(sorted(os.listdir(df._datadir)), [])

+    def test_purge_without_frag(self):
+        ts = self.ts()
+        df = self._simple_get_diskfile()
+        df.delete(ts)
+
+        # sanity
+        self.assertEqual(sorted(os.listdir(df._datadir)), [
+            ts.internal + '.ts',
+        ])
+        df.purge(ts, None)
+        self.assertEqual(sorted(os.listdir(df._datadir)), [])
+
    def test_purge_old_tombstone(self):
        old_ts = self.ts()
        ts = self.ts()
--- a/test/unit/obj/test_reconstructor.py
+++ b/test/unit/obj/test_reconstructor.py
@ -2415,11 +2415,8 @@ class TestObjectReconstructor(unittest.TestCase):
        self.assertFalse(os.access(df._datadir, os.F_OK))

    def test_process_job_revert_cleanup_tombstone(self):
-        replicas = self.policy.object_ring.replicas
-        frag_index = random.randint(0, replicas - 1)
        sync_to = [random.choice([n for n in self.policy.object_ring.devs
                                  if n != self.local_dev])]
-        sync_to[0]['index'] = frag_index
        partition = 0

        part_path = os.path.join(self.devices, self.local_dev['device'],
@ -2437,7 +2434,7 @@ class TestObjectReconstructor(unittest.TestCase):

        job = {
            'job_type': object_reconstructor.REVERT,
-            'frag_index': frag_index,
+            'frag_index': None,
            'suffixes': [suffix],
            'sync_to': sync_to,
            'partition': partition,