
This patch adds methods to increase the partition power of an existing
object ring without downtime for the users, using a 3-step process. Data
won't be moved to other nodes; objects using the new, increased partition
power will be located on the same device and are hard-linked to avoid data
movement.

1. A new setting "next_part_power" will be added to the rings, and once the
   proxy server has reloaded the rings it will send this value to the object
   servers on any write operation. Object servers will then create a hard
   link in the new location to the original DiskFile object. Existing data
   will be relinked into the new locations by a new tool, again using hard
   links.

2. The actual partition power itself will be increased. Servers will now use
   the new partition power for reads and writes. Hard links in the old
   object locations that are no longer required are removed by the relinker
   tool; it reads the next_part_power setting to find object locations that
   need to be cleaned up.

3. The "next_part_power" flag will be removed.

This mostly implements the spec in [1]; however, it does not use an "epoch"
as described there. The idea of the epoch was to store data using different
partition powers in separate namespaces, both to avoid conflicts with
auditors and replicators and to be able to abort the operation by simply
removing the new tree. This would require a major change to the on-disk data
layout, and other object-server implementations would have to adopt the
scheme as well. Instead, the object replicator is now aware that a partition
power increase is in progress and skips replication of data in that storage
policy; the relinker tool is simply run, and afterwards the partition power
is increased. This shouldn't take much time (it only walks the filesystem
and creates hard links), so the impact should be low. To reduce the required
time, the relinker should be run on all storage nodes in parallel, though
this is not mandatory. Failures during relinking do not affect cluster
operations; relinking can even be aborted manually and restarted later.

Auditors do not quarantine objects written to a path with a different
partition power and therefore work as before (though in the worst case they
read each object twice before the no-longer-needed hard links are removed).

Co-Authored-By: Alistair Coles <alistair.coles@hpe.com>
Co-Authored-By: Matthew Oliver <matt@oliver.net.au>
Co-Authored-By: Tim Burke <tim.burke@gmail.com>

[1] https://specs.openstack.org/openstack/swift-specs/specs/in_progress/increasing_partition_power.html

Change-Id: I7d6371a04f5c1c4adbb8733a71f3c177ee5448bb
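Why data stays on the same device: doubling the partition count only shifts
one more bit of the path hash into the partition number, so an object's new
partition is always 2 * old_part or 2 * old_part + 1 and maps to the same
location. A minimal sketch of that invariant (plain MD5 stands in for
Swift's salted hash_path() to keep the example self-contained):

import binascii
import hashlib
import struct

# An object's partition is the top part_power bits of the hash of its
# /account/container/object path (plain MD5 here instead of Swift's
# salted hash_path(), purely for illustration).
digest = binascii.unhexlify(hashlib.md5(b'/a/c/o').hexdigest())
top_bits = struct.unpack_from('>I', digest)[0]

part_power = 8
old_part = top_bits >> (32 - part_power)      # >> 24, as in the test below
new_part = top_bits >> (32 - part_power - 1)  # >> 23: one extra hash bit

# Increasing the partition power appends one bit, so the new partition is
# fully determined by the old one plus that bit:
assert new_part in (2 * old_part, 2 * old_part + 1)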
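Steps 1 and 2 then come down to creating and removing hard links, which is
why a relinker pass is cheap and safely restartable. A rough sketch of the
per-file work (hypothetical helper names; the real logic lives in the new
relinker tool, swift/cli/relinker.py):

import errno
import os


def relink_file(old_path, new_path):
    # Step 1: make the object reachable under the new partition. A hard
    # link points at the same inode, so no data is copied.
    try:
        os.makedirs(os.path.dirname(new_path))
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
    try:
        os.link(old_path, new_path)
    except OSError as err:
        # already relinked, e.g. by an earlier aborted run - that's fine
        if err.errno != errno.EEXIST:
            raise


def cleanup_file(old_path, new_path):
    # Step 2: once servers use the new partition power, drop the link in
    # the old location; the data survives through new_path's hard link.
    if os.path.isfile(new_path):
        os.remove(old_path)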
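The tests below drive the workflow through the ring builder. A condensed
view of the sequence they exercise (step 3, clearing the flag, is not shown
here):

from swift.common import ring

rb = ring.RingBuilder(8, 6.0, 1)  # part power 8
for i in range(6):
    rb.add_dev({'id': i, 'region': 0, 'zone': 0, 'weight': 1,
                'ip': '127.0.0.%s' % i, 'port': 10000, 'device': 'sda1'})
rb.rebalance(seed=1)

rb.prepare_increase_partition_power()  # step 1: ring carries next_part_power
# ... run the relinker on all storage nodes ...
rb.increase_partition_power()          # step 2: switch to part power 9
# ... run the relinker again to clean up the old hard links ...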
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import binascii
import os
import shutil
import struct
import tempfile
import unittest

from swift.cli import relinker
from swift.common import exceptions, ring, utils
from swift.common import storage_policy
from swift.common.storage_policy import (
    StoragePolicy, StoragePolicyCollection, POLICIES)

from swift.obj.diskfile import write_metadata

from test.unit import FakeLogger


class TestRelinker(unittest.TestCase):
    def setUp(self):
        self.logger = FakeLogger()
        self.testdir = tempfile.mkdtemp()
        self.devices = os.path.join(self.testdir, 'node')
        shutil.rmtree(self.testdir, ignore_errors=1)
        os.mkdir(self.testdir)
        os.mkdir(self.devices)

        self.rb = ring.RingBuilder(8, 6.0, 1)

        for i in range(6):
            ip = "127.0.0.%s" % i
            self.rb.add_dev({'id': i, 'region': 0, 'zone': 0, 'weight': 1,
                             'ip': ip, 'port': 10000, 'device': 'sda1'})
        self.rb.rebalance(seed=1)

        self.existing_device = 'sda1'
        os.mkdir(os.path.join(self.devices, self.existing_device))
        self.objects = os.path.join(self.devices, self.existing_device,
                                    'objects')
        os.mkdir(self.objects)
        self._hash = utils.hash_path('a/c/o')
        digest = binascii.unhexlify(self._hash)
        # The partition is the top bits of the path hash: 8 bits at the
        # current part power, 9 bits after the partition power increase
        part = struct.unpack_from('>I', digest)[0] >> 24
        self.next_part = struct.unpack_from('>I', digest)[0] >> 23
        self.objdir = os.path.join(
            self.objects, str(part), self._hash[-3:], self._hash)
        os.makedirs(self.objdir)
        self.object_fname = "1278553064.00000.data"
        self.objname = os.path.join(self.objdir, self.object_fname)
        with open(self.objname, "wb") as dummy:
            dummy.write("Hello World!")
            write_metadata(dummy, {'name': '/a/c/o', 'Content-Length': '12'})

        test_policies = [StoragePolicy(0, 'platin', True)]
        storage_policy._POLICIES = StoragePolicyCollection(test_policies)

        self.expected_dir = os.path.join(
            self.objects, str(self.next_part), self._hash[-3:], self._hash)
        self.expected_file = os.path.join(self.expected_dir, self.object_fname)

    def _save_ring(self):
        rd = self.rb.get_ring()
        for policy in POLICIES:
            rd.save(os.path.join(
                self.testdir, '%s.ring.gz' % policy.ring_name))
            # Enforce ring reloading in relinker
            policy.object_ring = None

    def tearDown(self):
        shutil.rmtree(self.testdir, ignore_errors=1)
        storage_policy.reload_storage_policies()

    def test_relink(self):
        self.rb.prepare_increase_partition_power()
        self._save_ring()
        relinker.relink(self.testdir, self.devices, True)

        self.assertTrue(os.path.isdir(self.expected_dir))
        self.assertTrue(os.path.isfile(self.expected_file))

        # Relinking must not copy data: old and new name have to point to
        # the same inode
        stat_old = os.stat(os.path.join(self.objdir, self.object_fname))
        stat_new = os.stat(self.expected_file)
        self.assertEqual(stat_old.st_ino, stat_new.st_ino)

    def _common_test_cleanup(self, relink=True):
        # Create a ring that has prev_part_power set
        self.rb.prepare_increase_partition_power()
        self.rb.increase_partition_power()
        self._save_ring()

        os.makedirs(self.expected_dir)

        if relink:
            # Create a hardlink to the original object name. This is expected
            # after a normal relinker run
            os.link(os.path.join(self.objdir, self.object_fname),
                    self.expected_file)

    def test_cleanup(self):
        self._common_test_cleanup()
        self.assertEqual(0, relinker.cleanup(self.testdir, self.devices, True))

        # Old objectname should be removed, new should still exist
        self.assertTrue(os.path.isdir(self.expected_dir))
        self.assertTrue(os.path.isfile(self.expected_file))
        self.assertFalse(os.path.isfile(
            os.path.join(self.objdir, self.object_fname)))

    def test_cleanup_not_yet_relinked(self):
        self._common_test_cleanup(relink=False)
        self.assertEqual(1, relinker.cleanup(self.testdir, self.devices, True))

        self.assertTrue(os.path.isfile(
            os.path.join(self.objdir, self.object_fname)))

    def test_cleanup_deleted(self):
        self._common_test_cleanup()

        # Pretend the object got deleted in between and there is a tombstone
        fname_ts = self.expected_file[:-4] + "ts"
        os.rename(self.expected_file, fname_ts)

        self.assertEqual(0, relinker.cleanup(self.testdir, self.devices, True))

    def test_cleanup_doesnotexist(self):
        self._common_test_cleanup()

        # Pretend the file in the new place got deleted in between
        os.remove(self.expected_file)

        self.assertEqual(
            1, relinker.cleanup(self.testdir, self.devices, True, self.logger))
        self.assertEqual(self.logger.get_lines_for_level('warning'),
                         ['Error cleaning up %s: %s' % (self.objname,
                          repr(exceptions.DiskFileNotExist()))])

    def test_cleanup_non_durable_fragment(self):
        self._common_test_cleanup()

        # Actually all fragments are non-durable and raise a DiskFileNotExist
        # in EC in this test. However, if the counterpart exists in the new
        # location, this is ok - it will be fixed by the reconstructor later
        storage_policy._POLICIES[0].policy_type = 'erasure_coding'

        self.assertEqual(
            0, relinker.cleanup(self.testdir, self.devices, True, self.logger))
        self.assertEqual(self.logger.get_lines_for_level('warning'), [])

    def test_cleanup_quarantined(self):
        self._common_test_cleanup()
        # Pretend the object in the new place got corrupted
        with open(self.expected_file, "wb") as obj:
            obj.write('trash')

        self.assertEqual(
            1, relinker.cleanup(self.testdir, self.devices, True, self.logger))

        self.assertIn('failed audit and was quarantined',
                      self.logger.get_lines_for_level('warning')[0])