From c85e5260e836c0f4365aca602b7877aaf88eedd3 Mon Sep 17 00:00:00 2001 From: Tim Simpson Date: Tue, 11 Feb 2014 09:41:25 -0600 Subject: [PATCH] Fixes a race condition in resize flavor When resizing an instance's flavor, Trove asked the guest to start the datastore via an RPC call and then, to be safe, checked the service_status table in the database once to make sure it had been updated to RUNNING. That second check was not only superfluous, it was causing resizes to fail in the post-conductor world: although the guest sends a message to conductor describing the datastore's current state before it informs taskmanager that it has finished the call, conductor itself might not have updated the database yet. This commit changes things so taskmanager polls until the service_status is RUNNING after calling the guest. If there is an error the call to the guest will fail anyway. Change-Id: I4e1ca75a150ed58233c21372d21c6337596e43d0 Closes-Bug: 1278282 --- trove/taskmanager/models.py | 16 ++++++++++------ trove/tests/api/instances_resize.py | 4 ++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/trove/taskmanager/models.py b/trove/taskmanager/models.py index 15db54286c..588a105cd9 100644 --- a/trove/taskmanager/models.py +++ b/trove/taskmanager/models.py @@ -1181,13 +1181,12 @@ class ResizeActionBase(ConfigurationMixin): def _assert_mysql_is_ok(self): # Tell the guest to turn on MySQL, and ensure the status becomes - # ACTIVE. + # RUNNING. self._start_mysql() - # The guest should do this for us... but sometimes it walks funny. - self.instance._refresh_compute_service_status() - if self.instance.service_status != rd_instance.ServiceStatuses.RUNNING: - raise Exception("Migration failed! Service status was %s." 
- % self.instance.service_status) + utils.poll_until( + self._datastore_is_online, + sleep_time=2, + time_out=RESIZE_TIME_OUT) def _assert_processes_are_ok(self): """Checks the procs; if anything is wrong, reverts the operation.""" @@ -1202,6 +1201,11 @@ class ResizeActionBase(ConfigurationMixin): % self.instance.id) self.instance.server.confirm_resize() + def _datastore_is_online(self): + self.instance._refresh_compute_service_status() + return (self.instance.service_status == + rd_instance.ServiceStatuses.RUNNING) + def _revert_nova_action(self): LOG.debug(_("Instance %s calling Compute revert resize...") % self.instance.id) diff --git a/trove/tests/api/instances_resize.py b/trove/tests/api/instances_resize.py index 94c6fa47f9..84516e5907 100644 --- a/trove/tests/api/instances_resize.py +++ b/trove/tests/api/instances_resize.py @@ -178,6 +178,8 @@ class ResizeTests(ResizeTestBase): self.instance.service_status = rd_instance.ServiceStatuses.SHUTDOWN utils.poll_until(mox.IgnoreArg(), sleep_time=2, time_out=120) self._start_mysql() + utils.poll_until(mox.IgnoreArg(), sleep_time=2, + time_out=120).AndRaise(PollTimeOut) self.instance.guest.reset_configuration(mox.IgnoreArg()) self.instance.server.revert_resize() self._server_changes_to("ACTIVE", OLD_FLAVOR_ID) @@ -190,6 +192,7 @@ class ResizeTests(ResizeTestBase): self.instance.service_status = rd_instance.ServiceStatuses.RUNNING utils.poll_until(mox.IgnoreArg(), sleep_time=2, time_out=120) self._start_mysql() + utils.poll_until(mox.IgnoreArg(), sleep_time=2, time_out=120) self.server.status = "SHUTDOWN" self.instance.server.confirm_resize() @@ -236,4 +239,5 @@ class MigrateTests(ResizeTestBase): self.instance.service_status = rd_instance.ServiceStatuses.RUNNING utils.poll_until(mox.IgnoreArg(), sleep_time=2, time_out=120) self._start_mysql() + utils.poll_until(mox.IgnoreArg(), sleep_time=2, time_out=120) self.instance.server.confirm_resize()