From 156cdc8edfe9783a0cfd8a702da9247e39293de8 Mon Sep 17 00:00:00 2001
From: Samuel Merritt
Date: Thu, 31 Jan 2013 15:12:09 -0800
Subject: [PATCH] Deterministic, repeatable serialization for rings.

The (account|container|object).ring.gz files contain, among other
things, a JSON-encoded dictionary. This change simply makes the JSON
serializer sort the keys of that dictionary so that two
Python-identical rings will result in two bytewise-identical ring
files.

Also, to get repeatable compression, we lock down the timestamp in the
gzip output stream to a fixed value. (There's a timestamp value in a
gzip stream header; by default, gzip.GzipFile sticks time.time() in
there.)

This only works on Python 2.7; on 2.6, the 'mtime' argument to
gzip.GzipFile() is unsupported. Don't worry, serialization still works
on 2.6. It just doesn't always produce the same bytes for the same
ring.

Change-Id: Ide446413d0aeb78536883933fd0caf440b8f54ad
---
 swift/common/ring/ring.py          | 14 ++++++++++++--
 test/unit/common/ring/test_ring.py | 24 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/swift/common/ring/ring.py b/swift/common/ring/ring.py
index bc76e42e17..dc2d4aaf55 100644
--- a/swift/common/ring/ring.py
+++ b/swift/common/ring/ring.py
@@ -85,7 +85,8 @@ class RingData(object):
         # Write out new-style serialization magic and version:
         file_obj.write(struct.pack('!4sH', 'R1NG', 1))
         ring = self.to_dict()
-        json_text = json.dumps(
+        json_encoder = json.JSONEncoder(sort_keys=True)
+        json_text = json_encoder.encode(
             {'devs': ring['devs'], 'part_shift': ring['part_shift'],
              'replica_count': len(ring['replica2part2dev_id'])})
         json_len = len(json_text)
@@ -100,7 +101,16 @@ class RingData(object):
 
         :param filename: File into which this instance should be serialized.
         """
-        gz_file = GzipFile(filename, 'wb')
+        # Override the timestamp so that the same ring data creates
+        # the same bytes on disk. This makes a checksum comparison a
+        # good way to see if two rings are identical.
+        #
+        # This only works on Python 2.7; on 2.6, we always get the
+        # current time in the gzip output.
+        try:
+            gz_file = GzipFile(filename, 'wb', mtime=1300507380.0)
+        except TypeError:
+            gz_file = GzipFile(filename, 'wb')
         self.serialize_v1(gz_file)
         gz_file.close()
 
diff --git a/test/unit/common/ring/test_ring.py b/test/unit/common/ring/test_ring.py
index 9a37cc31b7..cd4be8862d 100644
--- a/test/unit/common/ring/test_ring.py
+++ b/test/unit/common/ring/test_ring.py
@@ -16,6 +16,7 @@
 import array
 import cPickle as pickle
 import os
+import sys
 import unittest
 from gzip import GzipFile
 from shutil import rmtree
@@ -67,6 +68,29 @@ class TestRingData(unittest.TestCase):
         rd2 = ring.RingData.load(ring_fname)
         self.assert_ring_data_equal(rd, rd2)
 
+    def test_deterministic_serialization(self):
+        """
+        Two identical rings should produce identical .gz files on disk.
+
+        Only true on Python 2.7 or greater.
+        """
+        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
+            return
+        os.mkdir(os.path.join(self.testdir, '1'))
+        os.mkdir(os.path.join(self.testdir, '2'))
+        # These have to have the same filename (not full path,
+        # obviously) since the filename gets encoded in the gzip data.
+        ring_fname1 = os.path.join(self.testdir, '1', 'the.ring.gz')
+        ring_fname2 = os.path.join(self.testdir, '2', 'the.ring.gz')
+        rd = ring.RingData(
+            [array.array('H', [0, 1, 0, 1]), array.array('H',[0, 1, 0, 1])],
+            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
+        rd.save(ring_fname1)
+        rd.save(ring_fname2)
+        with open(ring_fname1) as ring1:
+            with open(ring_fname2) as ring2:
+                self.assertEqual(ring1.read(), ring2.read())
+
 
 class TestRing(unittest.TestCase):
 