Limit number of revert tombstone SSYNC requests
Revert-tombstone-only parts used to talk to all primary nodes; this
change randomizes the selection within part_nodes instead. The
corresponding probe test is updated to reflect the change.

The main improvement of this patch is that the reconstructor on a
handoff node can now delete its local tombstones once it has synced to
fewer than all primary nodes. (Before this patch, every primary had to
acknowledge the REVERT requests.)

How many primaries the reconstructor should contact is open to further
discussion, but with this patch it is (replicas - k + 1), which is
enough to prevent stale reads.

*BONUS* - Fix a mis-set test configuration (the reconstructor ring in
the unit test had fewer replicas than ec_k + ec_m).

Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: I05ce8fe75f1c4a7971cc8995b003df818b69b3c1
Closes-Bug: #1668857
parent 1f36b5dd16
commit 188c07e12a
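A quick sketch of the sampling arithmetic described above. This is illustrative only and not part of the patch; the 10+4 policy numbers are hypothetical, but they show why syncing a tombstone to (replicas - ec_ndata + 1) primaries is enough to prevent a stale read.

    # Illustrative sketch only (not from the patch); assumes a hypothetical
    # 10+4 EC policy with no fragment duplication.
    ec_ndata = 10                 # 'k': fragments needed to serve a read
    ec_nparity = 4
    ec_duplication_factor = 1
    ec_n_unique_fragments = ec_ndata + ec_nparity

    replicas = ec_n_unique_fragments * ec_duplication_factor   # 14 primaries
    nsample = replicas - ec_ndata + 1                          # 14 - 10 + 1 = 5

    # A GET needs ec_ndata fragments, so it touches ec_ndata primaries.
    # At most replicas - nsample = ec_ndata - 1 primaries can still be
    # missing the tombstone, so every read quorum overlaps at least one
    # primary that received the REVERT.
    assert replicas - nsample < ec_ndata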
@@ -854,14 +854,18 @@ class ObjectReconstructor(Daemon):
             # push partitions off this node, but none of the suffixes
             # have any data fragments to hint at which node would be a
             # good candidate to receive the tombstones.
+            #
+            # we'll check a sample of other primaries before we delete our
+            # local tombstones, the exact number doesn't matter as long as
+            # it's enough to ensure the tombstones are not lost and less
+            # than *all the replicas*
+            nsample = (policy.ec_n_unique_fragments *
+                       policy.ec_duplication_factor) - policy.ec_ndata + 1
             jobs.append(build_job(
                 job_type=REVERT,
                 frag_index=None,
                 suffixes=non_data_fragment_suffixes,
-                # this is super safe
-                sync_to=part_nodes,
-                # something like this would be probably be better
-                # sync_to=random.sample(part_nodes, 3),
+                sync_to=random.sample(part_nodes, nsample)
             ))
         # return a list of jobs for this part
         return jobs
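For reference, random.sample draws nsample distinct primaries without replacement, so successive reconstructor passes can revert to different subsets of the part's primaries. A minimal usage sketch; the part_nodes list below is made up for illustration:

    import random

    # Hypothetical primary node list, for illustration only.
    part_nodes = [{'index': i, 'device': 'sda%d' % i} for i in range(14)]
    nsample = 5

    sync_to = random.sample(part_nodes, nsample)
    # nsample distinct primaries, drawn without replacement
    assert len(sync_to) == nsample
    assert len({n['index'] for n in sync_to}) == nsample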
@@ -173,7 +173,7 @@ class TestReconstructorRevert(ECProbeTest):
         client.put_object(self.url, self.token, self.container_name,
                           self.object_name, contents=contents)
 
-        # now lets shut down a couple primaries
+        # now lets shut down a couple of primaries
         failed_nodes = random.sample(onodes, 2)
         for node in failed_nodes:
             self.kill_drive(self.device_dir('object', node))
@@ -197,61 +197,26 @@ class TestReconstructorRevert(ECProbeTest):
                 self.fail('Node data on %r was not fully destroyed!' %
                           (node,))
 
-        # repair the first primary
-        self.revive_drive(self.device_dir('object', failed_nodes[0]))
-
-        # run the reconstructor on the handoffs nodes, if there are no data
-        # frags to hint at the node index - each hnode syncs to all primaries
-        for hnode in hnodes:
-            self.reconstructor.once(number=self.config_number(hnode))
-
-        # because not all primaries are online, the tombstones remain
-        for hnode in hnodes:
+        # run the reconstructor on the handoff node multiple times until
+        # tombstone is pushed out - each handoff node syncs to a few
+        # primaries each time
+        iterations = 0
+        while iterations < 52:
+            self.reconstructor.once(number=self.config_number(hnodes[0]))
+            iterations += 1
+            # see if the tombstone is reverted
             try:
-                self.direct_get(hnode, opart)
+                self.direct_get(hnodes[0], opart)
             except direct_client.DirectClientException as err:
                 self.assertEqual(err.http_status, 404)
-                self.assertEqual(err.http_headers['X-Backend-Timestamp'],
-                                 delete_timestamp)
-            else:
-                self.fail('Found obj data on %r' % hnode)
-
-        # ... but it's on the first failed (now repaired) primary
-        try:
-            self.direct_get(failed_nodes[0], opart)
-        except direct_client.DirectClientException as err:
-            self.assertEqual(err.http_status, 404)
-            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
-                             delete_timestamp)
+                if 'X-Backend-Timestamp' not in err.http_headers:
+                    # this means the tombstone is *gone* so it's reverted
+                    break
         else:
-            self.fail('Found obj data on %r' % failed_nodes[0])
+            self.fail('Still found tombstone on %r after %s iterations' % (
+                hnodes[0], iterations))
 
-        # repair the second primary
-        self.revive_drive(self.device_dir('object', failed_nodes[1]))
-
-        # run the reconstructor on the *first* handoff node
-        self.reconstructor.once(number=self.config_number(hnodes[0]))
-
-        # make sure it's tombstone was pushed out
-        try:
-            self.direct_get(hnodes[0], opart)
-        except direct_client.DirectClientException as err:
-            self.assertEqual(err.http_status, 404)
-            self.assertNotIn('X-Backend-Timestamp', err.http_headers)
-        else:
-            self.fail('Found obj data on %r' % hnodes[0])
-
-        # ... and now it's on the second failed primary too!
-        try:
-            self.direct_get(failed_nodes[1], opart)
-        except direct_client.DirectClientException as err:
-            self.assertEqual(err.http_status, 404)
-            self.assertEqual(err.http_headers['X-Backend-Timestamp'],
-                             delete_timestamp)
-        else:
-            self.fail('Found obj data on %r' % failed_nodes[1])
-
-        # ... but still on the second handoff node
+        # tombstone is still on the *second* handoff
         try:
             self.direct_get(hnodes[1], opart)
         except direct_client.DirectClientException as err:
@@ -261,10 +226,14 @@ class TestReconstructorRevert(ECProbeTest):
         else:
             self.fail('Found obj data on %r' % hnodes[1])
 
-        # ... until it's next sync
+        # repair the primaries
+        self.revive_drive(self.device_dir('object', failed_nodes[0]))
+        self.revive_drive(self.device_dir('object', failed_nodes[1]))
+
+        # run reconstructor on second handoff
         self.reconstructor.once(number=self.config_number(hnodes[1]))
 
-        # ... then it's tombstone is pushed off too!
+        # verify tombstone is reverted on the first pass
         try:
             self.direct_get(hnodes[1], opart)
         except direct_client.DirectClientException as err:
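The probe test above relies on Python's while/else: the else branch only runs if the loop exhausts its budget without hitting break, i.e. the tombstone was never reverted. A stand-alone sketch of that pattern; check_reverted is a hypothetical stand-in for the direct_get probe:

    def wait_for_revert(check_reverted, budget=52):
        # Retry until the handoff's tombstone disappears or we run out of tries.
        iterations = 0
        while iterations < budget:
            iterations += 1
            if check_reverted():   # hypothetical stand-in for the direct_get probe
                break
        else:
            # while/else: only reached if the loop never hit 'break'
            raise AssertionError('still not reverted after %s tries' % iterations)
        return iterations

    # e.g. wait_for_revert(lambda: True) returns after the first pass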
@@ -1160,7 +1160,7 @@ class TestObjectReconstructor(unittest.TestCase):
         self.policy.object_ring.max_more_nodes = \
             self.policy.object_ring.replicas
         self.ts_iter = make_timestamp_iter()
-        self.fabricated_ring = FabricatedRing()
+        self.fabricated_ring = FabricatedRing(replicas=14, devices=28)
 
     def _configure_reconstructor(self, **kwargs):
         self.conf.update(kwargs)
@@ -1924,7 +1924,12 @@ class TestObjectReconstructor(unittest.TestCase):
             'local_dev': self.local_dev,
             'device': self.local_dev['device'],
         }
-        self.assertEqual(ring.replica_count, len(job['sync_to']))
+        self.assertEqual(ring.replica_count, len(part_nodes))
+        expected_samples = (
+            (self.policy.ec_n_unique_fragments *
+             self.policy.ec_duplication_factor) -
+            self.policy.ec_ndata + 1)
+        self.assertEqual(len(job['sync_to']), expected_samples)
         for k, v in expected.items():
             msg = 'expected %s != %s for %s' % (
                 v, job[k], k)