From ba046b2a14459132ca10d30a1d2cc0a01126bb1d Mon Sep 17 00:00:00 2001 From: Lingxian Kong Date: Wed, 22 Jul 2020 15:41:21 +1200 Subject: [PATCH] Support online volume resize Trove now supports resizing volumes without downtime. To use this feature, Nova and Cinder need to be at least the Pike release, and the config option ``cinder_service_type`` needs to be set to ``volumev3``. The cloud admin can disable this feature by setting ``online_volume_resize=False``; it is enabled by default. Change-Id: I000a4e90800454972dd39f2f82d286571bc0b96c --- .../notes/victoria-support-online-resize.yaml | 7 ++ trove/common/cfg.py | 7 +- trove/common/clients_admin.py | 8 +- trove/common/notification.py | 17 --- trove/guestagent/api.py | 5 +- trove/guestagent/datastore/manager.py | 7 +- trove/guestagent/volume.py | 17 +-- trove/taskmanager/models.py | 111 +++++++++--------- trove/tests/api/instances_actions.py | 4 +- trove/tests/fakes/guestagent.py | 2 +- .../unittests/common/test_notification.py | 34 +----- .../unittests/taskmanager/test_models.py | 22 +--- 12 files changed, 103 insertions(+), 138 deletions(-) create mode 100644 releasenotes/notes/victoria-support-online-resize.yaml diff --git a/releasenotes/notes/victoria-support-online-resize.yaml b/releasenotes/notes/victoria-support-online-resize.yaml new file mode 100644 index 0000000000..f5218013cf --- /dev/null +++ b/releasenotes/notes/victoria-support-online-resize.yaml @@ -0,0 +1,7 @@ +--- +features: + - Trove now supports to resize volume without downtime. To use this feature, + the version of Nova and Cinder needs to be at least Pike, the config option + ``cinder_service_type`` needs to be set to ``volumev3``. The cloud admin + can disable this feature by setting ``online_volume_resize=False``, default + is enabled. 
\ No newline at end of file diff --git a/trove/common/cfg.py b/trove/common/cfg.py index 2be700561e..ed099961ba 100644 --- a/trove/common/cfg.py +++ b/trove/common/cfg.py @@ -93,7 +93,7 @@ common_opts = [ cfg.BoolOpt('neutron_api_insecure', default=False, help="Allow to perform insecure SSL requests to neutron."), cfg.URIOpt('cinder_url', help='URL without the tenant segment.'), - cfg.StrOpt('cinder_service_type', default='volumev2', + cfg.StrOpt('cinder_service_type', default='volumev3', help='Service type to use when searching catalog.'), cfg.StrOpt('cinder_endpoint_type', default='publicURL', help='Service endpoint type to use when searching catalog.'), @@ -475,7 +475,10 @@ common_opts = [ help='The docker image used for backup and restore.'), cfg.ListOpt('reserved_network_cidrs', default=[], help='Network CIDRs reserved for Trove guest instance ' - 'management.') + 'management.'), + cfg.BoolOpt( + 'online_volume_resize', default=True, + help='If online volume resize is supported.') ] diff --git a/trove/common/clients_admin.py b/trove/common/clients_admin.py index 9f89d97fee..af0bb1fb97 100644 --- a/trove/common/clients_admin.py +++ b/trove/common/clients_admin.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cinderclient.v2 import client as CinderClient +from cinderclient import client as CinderClient import glanceclient from keystoneauth1 import loading from keystoneauth1 import session @@ -95,13 +95,17 @@ def cinder_client_trove_admin(context, region_name=None): LOG.debug('Re-use admin cinder client') return ADMIN_CINDER_CLIENT + version = CONF.cinder_service_type.split('v')[-1] or '3' + ks_session = get_keystone_session() ADMIN_CINDER_CLIENT = CinderClient.Client( + version, session=ks_session, service_type=CONF.cinder_service_type, region_name=region_name or CONF.service_credentials.region_name, insecure=CONF.cinder_api_insecure, - endpoint_type=CONF.cinder_endpoint_type) + endpoint_type=CONF.cinder_endpoint_type, + additional_headers={'OpenStack-API-Version': 'volumev3 latest'}) if CONF.cinder_url and CONF.service_credentials.project_id: ADMIN_CINDER_CLIENT.client.management_url = "%s/%s/" % ( diff --git a/trove/common/notification.py b/trove/common/notification.py index c5fa57e27b..a050e0a787 100644 --- a/trove/common/notification.py +++ b/trove/common/notification.py @@ -205,23 +205,6 @@ class TroveInstanceCreate(TroveCommonTraits): super(TroveInstanceCreate, self).notify('create') -class TroveInstanceModifyVolume(TroveCommonTraits): - - ''' - Additional traits for trove.instance.create notifications that describe - instance action events - - This class should correspond to trove_instance_modify_volume in - ceilometer/event_definitions.yaml - ''' - - def __init__(self, **kwargs): - super(TroveInstanceModifyVolume, self).__init__(**kwargs) - - def notify(self): - super(TroveInstanceModifyVolume, self).notify('modify_volume') - - class TroveInstanceModifyFlavor(TroveCommonTraits): ''' diff --git a/trove/guestagent/api.py b/trove/guestagent/api.py index 67c3baa544..329980b48d 100644 --- a/trove/guestagent/api.py +++ b/trove/guestagent/api.py @@ -473,7 +473,7 @@ class API(object): self.agent_low_timeout, version=version, device_path=device_path, 
mount_point=mount_point) - def resize_fs(self, device_path=None, mount_point=None): + def resize_fs(self, device_path=None, mount_point=None, online=False): """Resize the filesystem.""" LOG.debug("Resize device %(device)s on instance %(id)s.", { 'device': device_path, 'id': self.id}) @@ -481,7 +481,8 @@ class API(object): self._call("resize_fs", self.agent_high_timeout, version=version, - device_path=device_path, mount_point=mount_point) + device_path=device_path, mount_point=mount_point, + online=online) def update_overrides(self, overrides, remove=False): """Update the overrides.""" diff --git a/trove/guestagent/datastore/manager.py b/trove/guestagent/datastore/manager.py index a7e126f124..f1bec02e07 100644 --- a/trove/guestagent/datastore/manager.py +++ b/trove/guestagent/datastore/manager.py @@ -364,10 +364,11 @@ class Manager(periodic_task.PeriodicTasks): device = volume.VolumeDevice(device_path) device.unmount(mount_point) - def resize_fs(self, context, device_path=None, mount_point=None): - LOG.debug("Resizing the filesystem at %s.", mount_point) + def resize_fs(self, context, device_path=None, mount_point=None, + online=False): + LOG.info(f"Resizing the filesystem at {mount_point}, online: {online}") device = volume.VolumeDevice(device_path) - device.resize_fs(mount_point) + device.resize_fs(mount_point, online=online) ############### # Configuration diff --git a/trove/guestagent/volume.py b/trove/guestagent/volume.py index 47b75726d6..0239607345 100644 --- a/trove/guestagent/volume.py +++ b/trove/guestagent/volume.py @@ -71,7 +71,7 @@ class FSBase(object): """ @abc.abstractmethod - def resize(self, device_path): + def resize(self, device_path, online=False): """ Resize the filesystem on device """ @@ -113,9 +113,10 @@ class FSExt(FSBase): exc_fmt = _("Volume '%s' was not formatted.") log_and_raise(log_fmt, exc_fmt, device_path) - def resize(self, device_path): - utils.execute("e2fsck", "-f", "-p", device_path, - run_as_root=True, root_helper="sudo") + def 
resize(self, device_path, online=False): + if not online: + utils.execute("e2fsck", "-f", "-p", device_path, + run_as_root=True, root_helper="sudo") utils.execute("resize2fs", device_path, run_as_root=True, root_helper="sudo") @@ -158,7 +159,7 @@ class FSXFS(FSBase): device_path) raise exception.GuestError(original_message=msg) - def resize(self, device_path): + def resize(self, device_path, online=False): utils.execute("xfs_repair", device_path, run_as_root=True, root_helper="sudo") utils.execute("mount", device_path, @@ -263,18 +264,18 @@ class VolumeDevice(object): return True - def resize_fs(self, mount_point): + def resize_fs(self, mount_point, online=False): """Resize the filesystem on the specified device.""" self._check_device_exists() # Some OS's will mount a file systems after it's attached if # an entry is put in the fstab file (like Trove does). # Thus it may be necessary to wait for the mount and then unmount # the fs again (since the volume was just attached). - if self._wait_for_mount(mount_point, timeout=2): + if not online and self._wait_for_mount(mount_point, timeout=2): LOG.debug("Unmounting '%s' before resizing.", mount_point) self.unmount(mount_point) try: - self.volume_fs.resize(self.device_path) + self.volume_fs.resize(self.device_path, online=online) except exception.ProcessExecutionError: log_fmt = "Error resizing the filesystem with device '%s'." 
exc_fmt = _("Error resizing the filesystem with device '%s'.") diff --git a/trove/taskmanager/models.py b/trove/taskmanager/models.py index 7adfb3377e..44e6577a38 100755 --- a/trove/taskmanager/models.py +++ b/trove/taskmanager/models.py @@ -17,7 +17,6 @@ import os.path import time import traceback -from cinderclient import exceptions as cinder_exceptions from eventlet import greenthread from eventlet.timeout import Timeout from oslo_log import log as logging @@ -55,7 +54,6 @@ from trove.common.notification import EndNotification from trove.common.notification import StartNotification from trove.common.notification import TroveInstanceCreate from trove.common.notification import TroveInstanceModifyFlavor -from trove.common.notification import TroveInstanceModifyVolume from trove.common.strategies.cluster import strategy from trove.common.utils import try_recover from trove.extensions.mysql import models as mysql_models @@ -1512,11 +1510,11 @@ class ResizeVolumeAction(object): return self.instance.device_path def _fail(self, orig_func): - LOG.exception("%(func)s encountered an error when " - "attempting to resize the volume for " - "instance %(id)s. Setting service " - "status to failed.", {'func': orig_func.__name__, - 'id': self.instance.id}) + LOG.error("%(func)s encountered an error when " + "attempting to resize the volume for " + "instance %(id)s. Setting service " + "status to failed.", {'func': orig_func.__name__, + 'id': self.instance.id}) service = InstanceServiceStatus.find_by(instance_id=self.instance.id) service.set_status(srvstatus.ServiceStatuses.FAILED) service.save() @@ -1539,12 +1537,12 @@ class ResizeVolumeAction(object): self.instance.restart() def _recover_full(self, orig_func): - LOG.exception("%(func)s encountered an error when attempting to " - "resize the volume for instance %(id)s. 
Trying to " - "recover by attaching and" - " mounting the volume and then restarting the " - "guest.", {'func': orig_func.__name__, - 'id': self.instance.id}) + LOG.error("%(func)s encountered an error when attempting to " + "resize the volume for instance %(id)s. Trying to " + "recover by attaching and" + " mounting the volume and then restarting the " + "guest.", {'func': orig_func.__name__, + 'id': self.instance.id}) self._attach_volume() self._mount_volume() self.instance.restart() @@ -1609,16 +1607,16 @@ class ResizeVolumeAction(object): 'id': self.instance.id}) @try_recover - def _resize_fs(self): - LOG.debug("Resizing the filesystem for instance %(id)s", { - 'id': self.instance.id}) + def _resize_fs(self, online=False): + LOG.info(f"Resizing the filesystem for instance {self.instance.id}, " + f"online: {online}") mount_point = self.get_mount_point() device_path = self.get_device_path() self.instance.guest.resize_fs(device_path=device_path, - mount_point=mount_point) - LOG.debug("Successfully resized volume %(vol_id)s filesystem for " - "instance %(id)s", {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) + mount_point=mount_point, + online=online) + LOG.debug(f"Successfully resized volume {self.instance.volume_id} " + f"filesystem for instance {self.instance.id}") @try_recover def _mount_volume(self): @@ -1634,10 +1632,8 @@ class ResizeVolumeAction(object): @try_recover def _extend(self): - LOG.debug("Extending volume %(vol_id)s for instance %(id)s to " - "size %(size)s", {'vol_id': self.instance.volume_id, - 'id': self.instance.id, - 'size': self.new_size}) + LOG.info(f"Calling Cinder to extend volume {self.instance.volume_id} " + f"for instance {self.instance.id} to size {self.new_size}") self.instance.volume_client.volumes.extend(self.instance.volume_id, self.new_size) LOG.debug("Successfully extended the volume %(vol_id)s for instance " @@ -1649,9 +1645,8 @@ class ResizeVolumeAction(object): volume = self.instance.volume_client.volumes.get( 
self.instance.volume_id) if not volume: - msg = (_('Failed to get volume %(vol_id)s') % { - 'vol_id': self.instance.volume_id}) - raise cinder_exceptions.ClientException(msg) + msg = f'Failed to get volume {self.instance.volume_id}' + raise exception.TroveError(msg) def volume_is_new_size(): volume = self.instance.volume_client.volumes.get( @@ -1659,34 +1654,46 @@ class ResizeVolumeAction(object): return volume.size == self.new_size utils.poll_until(volume_is_new_size, - sleep_time=2, + sleep_time=5, time_out=CONF.volume_time_out) self.instance.update_db(volume_size=self.new_size) except PollTimeOut: - LOG.exception("Timeout trying to extend the volume %(vol_id)s " - "for instance %(id)s", - {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) + LOG.error("Timeout trying to extend the volume %(vol_id)s " + "for instance %(id)s", + {'vol_id': self.instance.volume_id, + 'id': self.instance.id}) volume = self.instance.volume_client.volumes.get( self.instance.volume_id) if volume.status == 'extending': self._fail(self._verify_extend) elif volume.size != self.new_size: self.instance.update_db(volume_size=volume.size) - self._recover_full(self._verify_extend) + if not CONF.online_volume_resize: + self._recover_full(self._verify_extend) raise - except Exception: - LOG.exception("Error encountered trying to verify extend for " - "the volume %(vol_id)s for instance %(id)s", - {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) - self._recover_full(self._verify_extend) + except Exception as e: + LOG.error("Error encountered trying to verify extend for " + "the volume %(vol_id)s for instance %(id)s, " + "error: %(error)s", + {'vol_id': self.instance.volume_id, + 'id': self.instance.id, + 'error': str(e)}) + if not CONF.online_volume_resize: + self._recover_full(self._verify_extend) raise def _resize_active_volume(self): - LOG.debug("Begin _resize_active_volume for id: %(id)s", { - 'id': self.instance.id}) + if CONF.online_volume_resize: + try: + 
self._extend() + except Exception as e: + LOG.error(f'Failed to extend volume, error: {str(e)}') + + self._verify_extend() + self._resize_fs(recover_func=self._fail, online=True) + return + self._stop_db() self._unmount_volume(recover_func=self._recover_restart) self._detach_volume(recover_func=self._recover_mount_restart) @@ -1694,11 +1701,9 @@ class ResizeVolumeAction(object): self._verify_extend() # if anything fails after this point, recovery is futile self._attach_volume(recover_func=self._fail) - self._resize_fs(recover_func=self._fail) + self._resize_fs(recover_func=self._fail, online=False) self._mount_volume(recover_func=self._fail) self.instance.restart() - LOG.debug("End _resize_active_volume for id: %(id)s", { - 'id': self.instance.id}) def execute(self): LOG.debug("%(gt)s: Resizing instance %(id)s volume for server " @@ -1711,19 +1716,11 @@ class ResizeVolumeAction(object): if self.instance.server.status in [InstanceStatus.ACTIVE, InstanceStatus.HEALTHY]: - self._resize_active_volume() - self.instance.reset_task_status() - # send usage event for size reported by cinder - volume = self.instance.volume_client.volumes.get( - self.instance.volume_id) - launched_time = timeutils.isotime(self.instance.updated) - modified_time = timeutils.isotime(self.instance.updated) - TroveInstanceModifyVolume(instance=self.instance, - old_volume_size=self.old_size, - launched_at=launched_time, - modify_at=modified_time, - volume_size=volume.size, - ).notify() + try: + self._resize_active_volume() + finally: + self.instance.reset_task_status() + else: self.instance.reset_task_status() msg = ( diff --git a/trove/tests/api/instances_actions.py b/trove/tests/api/instances_actions.py index 608e827e1c..4d8d2b25c3 100644 --- a/trove/tests/api/instances_actions.py +++ b/trove/tests/api/instances_actions.py @@ -546,7 +546,6 @@ class ResizeInstanceVolumeTest(ActionTestBase): self.new_volume_size) @test(depends_on=[test_volume_resize]) - @time_out(300) def 
test_volume_resize_success(self): """test_volume_resize_success""" @@ -559,7 +558,8 @@ class ResizeInstanceVolumeTest(ActionTestBase): else: asserts.fail("Status should not be %s" % instance.status) - poll_until(check_resize_status, sleep_time=2, time_out=300) + poll_until(check_resize_status, sleep_time=5, time_out=300, + initial_delay=5) instance = instance_info.dbaas.instances.get(instance_info.id) asserts.assert_equal(instance.volume['size'], self.new_volume_size) diff --git a/trove/tests/fakes/guestagent.py b/trove/tests/fakes/guestagent.py index a711058069..dc22bb536d 100644 --- a/trove/tests/fakes/guestagent.py +++ b/trove/tests/fakes/guestagent.py @@ -330,7 +330,7 @@ class FakeGuest(object): def unmount_volume(self, device_path=None, mount_point=None): pass - def resize_fs(self, device_path=None, mount_point=None): + def resize_fs(self, device_path=None, mount_point=None, online=False): pass def update_overrides(self, overrides, remove=False): diff --git a/trove/tests/unittests/common/test_notification.py b/trove/tests/unittests/common/test_notification.py index 3f27c71ec0..dd914a2386 100644 --- a/trove/tests/unittests/common/test_notification.py +++ b/trove/tests/unittests/common/test_notification.py @@ -13,17 +13,19 @@ # License for the specific language governing permissions and limitations # under the License. 
# -from unittest.mock import Mock, patch +from unittest.mock import Mock +from unittest.mock import patch from oslo_utils import timeutils +from trove import rpc from trove.common import cfg -from trove.common.context import TroveContext from trove.common import exception from trove.common import notification -from trove.common.notification import EndNotification, StartNotification +from trove.common.context import TroveContext +from trove.common.notification import EndNotification +from trove.common.notification import StartNotification from trove.conductor import api as conductor_api -from trove import rpc from trove.tests.unittests import trove_testtools @@ -227,30 +229,6 @@ class TestTroveInstanceDelete(trove_testtools.TestCase): self.assertTrue(notifier().info.called) -class TestTroveInstanceModifyVolume(trove_testtools.TestCase): - - def setUp(self): - super(TestTroveInstanceModifyVolume, self).setUp() - self.instance = Mock(db_info=Mock(created=timeutils.utcnow())) - - @patch.object(cfg.CONF, 'get', Mock()) - @patch.object(rpc, 'get_notifier') - def test_notification(self, notifier): - notification.TroveInstanceModifyVolume(instance=self.instance).notify() - self.assertTrue(notifier().info.called) - - @patch.object(cfg.CONF, 'get', Mock()) - @patch.object(rpc, 'get_notifier') - def test_notification_after_serialization(self, notifier): - orig_notify = notification.TroveInstanceModifyVolume( - instance=self.instance) - serialized = orig_notify.serialize(None) - new_notify = notification.TroveInstanceModifyVolume().deserialize( - None, serialized) - new_notify.notify() - self.assertTrue(notifier().info.called) - - class TestTroveInstanceModifyFlavor(trove_testtools.TestCase): def setUp(self): diff --git a/trove/tests/unittests/taskmanager/test_models.py b/trove/tests/unittests/taskmanager/test_models.py index e9dad0d626..d542b34bec 100644 --- a/trove/tests/unittests/taskmanager/test_models.py +++ b/trove/tests/unittests/taskmanager/test_models.py @@ -39,10 
+39,10 @@ import trove.backup.models from trove.common import timeutils from trove.common import utils import trove.common.context +from trove.common import exception from trove.common.exception import GuestError from trove.common.exception import PollTimeOut from trove.common.exception import TroveError -from trove.common.notification import TroveInstanceModifyVolume import trove.common.template as template from trove.datastore import models as datastore_models import trove.db.models @@ -627,11 +627,10 @@ class ResizeVolumeTest(trove_testtools.TestCase): self.instance.volume_client.volumes.extend.side_effect = None self.instance.reset_mock() - @patch('trove.taskmanager.models.LOG') - def test_resize_volume_verify_extend_no_volume(self, mock_logging): + def test_resize_volume_verify_extend_no_volume(self): self.instance.volume_client.volumes.get = Mock( return_value=None) - self.assertRaises(cinder_exceptions.ClientException, + self.assertRaises(exception.TroveError, self.action._verify_extend) self.instance.reset_mock() @@ -643,29 +642,20 @@ class ResizeVolumeTest(trove_testtools.TestCase): utils.poll_until.side_effect = None self.instance.reset_mock() - @patch.object(TroveInstanceModifyVolume, 'notify') def test_resize_volume_active_server_succeeds(self, *args): server = Mock(status=InstanceStatus.ACTIVE) self.instance.attach_mock(server, 'server') + self.action.execute() - self.assertEqual(1, self.instance.guest.stop_db.call_count) - self.assertEqual(1, self.instance.guest.unmount_volume.call_count) - detach_count = ( - self.instance.nova_client.volumes.delete_server_volume.call_count) - self.assertEqual(1, detach_count) + extend_count = self.instance.volume_client.volumes.extend.call_count self.assertEqual(1, extend_count) - attach_count = ( - self.instance.nova_client.volumes.create_server_volume.call_count) - self.assertEqual(1, attach_count) - self.assertEqual(1, self.instance.guest.resize_fs.call_count) - self.assertEqual(1, 
self.instance.guest.mount_volume.call_count) - self.assertEqual(1, self.instance.restart.call_count) self.instance.reset_mock() def test_resize_volume_server_error_fails(self): server = Mock(status=InstanceStatus.ERROR) self.instance.attach_mock(server, 'server') + self.assertRaises(TroveError, self.action.execute) self.instance.reset_mock()