pxc grow/shrink cluster implementation
* This adds support for pxc to grow a cluster.
* api and taskmanager support for shrinking a cluster
* validate that the networks given are the same for each instance in the cluster
* make sure to add the existing networks on an instance in the cluster
* add new Error task for grow and shrink
* nova client version configuration changed to a string option rather than an int option, because the nova microversions change nova api output. This was needed for the network interfaces on existing instances.
* testing for grow and shrink cluster

Change-Id: I8602cad1a98970fcd99ee9793a89e1abdceb0982
Partially-Implements-Blueprint: pxc-grow-shrink-cluster
This commit is contained in:
parent fac6e76b54
commit 9e96241d12

tox.ini
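For reference, the request bodies the new cluster actions accept look roughly
like the following (a sketch inferred from _action_grow_cluster and
_action_shrink_cluster in the diff below; the field names come from that code,
the values are made up):

    # grow: a list of new member definitions; 'name' and 'volume' are optional
    grow_body = {
        "grow": [
            {"flavorRef": "7", "name": "cluster1-member-4",
             "volume": {"size": 2}},
            {"flavorRef": "7", "volume": {"size": 2}},
        ]
    }

    # shrink: a list of instance ids to remove from the cluster
    shrink_body = {
        "shrink": [
            {"id": "416b0b16-ba55-4302-bbd3-ff566032e1c1"},
        ]
    }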
@@ -9,7 +9,7 @@ usedevelop = True
 install_command = pip install -U {opts} {packages}
 deps = -r{toxinidir}/requirements.txt
        -r{toxinidir}/test-requirements.txt
-commands = find . -type f -name "*.pyc" -delete
+commands = find ./trove -type f -name "*.pyc" -delete
            pip install pymongo===3.0.3
            {envpython} run_tests.py
            python setup.py testr --slowest
@@ -33,7 +33,6 @@ commands = oslo_debug_helper {posargs}
 [testenv:cover]
 basepython = python2.7
 commands =
     {envpython} run_tests.py --group=does_not_exist
     coverage erase
     python setup.py testr --coverage
     coverage run -a run_tests.py

@@ -55,7 +55,7 @@ common_opts = [
                help='Service type to use when searching catalog.'),
     cfg.StrOpt('nova_compute_endpoint_type', default='publicURL',
                help='Service endpoint type to use when searching catalog.'),
-    cfg.IntOpt('nova_client_version', default=2,
+    cfg.StrOpt('nova_client_version', default='2.12',
                help="The version of of the compute service client."),
     cfg.StrOpt('neutron_url', help='URL without the tenant segment.'),
     cfg.StrOpt('neutron_service_type', default='network',

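A string-valued option allows pinning a nova API microversion such as '2.12'
(needed here because microversions change the shape of the network-interface
data nova returns). A minimal sketch of how such a value is passed to
python-novaclient (the session wiring is assumed, not part of this diff):

    from novaclient import client as nova_client

    def build_nova_client(session, version='2.12'):
        # novaclient accepts microversion strings like '2.12'; an IntOpt
        # could only express whole major versions such as 2.
        return nova_client.Client(version, session=session)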
@@ -504,6 +504,14 @@ class ClusterFlavorsNotEqual(TroveError):
     message = _("The flavor for each instance in a cluster must be the same.")


+class ClusterNetworksNotEqual(TroveError):
+    message = _("The network for each instance in a cluster must be the same.")
+
+
+class NetworkNotFound(TroveError):
+    message = _("Network Resource %(uuid)s cannot be found.")
+
+
 class ClusterVolumeSizeRequired(TroveError):
     message = _("A volume size is required for each instance in the cluster.")

@@ -523,6 +531,11 @@ class ClusterNumInstancesNotLargeEnough(TroveError):
                 "be at least %(num_instances)s.")


+class ClusterShrinkMustNotLeaveClusterEmpty(TroveError):
+    message = _("Must leave at least one instance in the cluster when "
+                "shrinking.")
+
+
 class ClusterInstanceOperationNotSupported(TroveError):
     message = _("Operation not supported for instances that are part of a "
                 "cluster.")

@@ -22,8 +22,10 @@ from trove.common import cfg
 from trove.common import exception
 from trove.common import remote
 from trove.common.strategies.cluster import base
 from trove.common import utils
 from trove.extensions.mgmt.clusters.views import MgmtClusterView
 from trove.instance import models as inst_models
+from trove.instance.models import DBInstance
+from trove.instance.models import Instance
 from trove.quota.quota import check_quotas
 from trove.taskmanager import api as task_api

@@ -40,7 +42,29 @@ class PXCAPIStrategy(base.BaseAPIStrategy):

     @property
     def cluster_controller_actions(self):
-        return {}
+        return {
+            'grow': self._action_grow_cluster,
+            'shrink': self._action_shrink_cluster,
+        }
+
+    def _action_grow_cluster(self, cluster, body):
+        nodes = body['grow']
+        instances = []
+        for node in nodes:
+            instance = {
+                'flavor_id': utils.get_id_from_href(node['flavorRef'])
+            }
+            if 'name' in node:
+                instance['name'] = node['name']
+            if 'volume' in node:
+                instance['volume_size'] = int(node['volume']['size'])
+            instances.append(instance)
+        return cluster.grow(instances)
+
+    def _action_shrink_cluster(self, cluster, body):
+        instances = body['shrink']
+        instance_ids = [instance['id'] for instance in instances]
+        return cluster.shrink(instance_ids)

     @property
     def cluster_view_class(self):
@@ -53,10 +77,10 @@ class PXCAPIStrategy(base.BaseAPIStrategy):

 class PXCCluster(models.Cluster):

-    @classmethod
-    def create(cls, context, name, datastore, datastore_version,
-               instances, extended_properties):
-        LOG.debug("Initiating PXC cluster creation.")
+    @staticmethod
+    def _validate_cluster_instances(context, instances, datastore,
+                                    datastore_version):
+        """Validate the flavor and volume"""
         pxc_conf = CONF.get(datastore_version.manager)
         num_instances = len(instances)

@@ -65,7 +89,7 @@ class PXCCluster(models.Cluster):
             raise exception.ClusterNumInstancesNotLargeEnough(
                 num_instances=pxc_conf.min_cluster_member_count)

-        # Checking flavors
+        # Checking flavors and get delta for quota check
         flavor_ids = [instance['flavor_id'] for instance in instances]
         if len(set(flavor_ids)) != 1:
             raise exception.ClusterFlavorsNotEqual()
@@ -77,7 +101,7 @@ class PXCCluster(models.Cluster):
             raise exception.FlavorNotFound(uuid=flavor_id)
         deltas = {'instances': num_instances}

-        # Checking volumes
+        # Checking volumes and get delta for quota check
         volume_sizes = [instance['volume_size'] for instance in instances
                         if instance.get('volume_size', None)]
         volume_size = None
@@ -96,35 +120,64 @@ class PXCCluster(models.Cluster):
         if ephemeral_support and flavor.ephemeral == 0:
             raise exception.LocalStorageNotSpecified(flavor=flavor_id)

         # quota check
         check_quotas(context.tenant, deltas)

-        nics = [instance.get('nics', None) for instance in instances]
+        # Checking networks are same for the cluster
+        instance_nics = [instance.get('nics', None) for instance in instances]
+        if len(set(instance_nics)) != 1:
+            raise exception.ClusterNetworksNotEqual()
+        instance_nic = instance_nics[0]
+        if instance_nic is None:
+            return
+        try:
+            nova_client.networks.get(instance_nic)
+        except nova_exceptions.NotFound:
+            raise exception.NetworkNotFound(uuid=instance_nic)

-        azs = [instance.get('availability_zone', None)
-               for instance in instances]
+    @staticmethod
+    def _create_instances(context, db_info, datastore, datastore_version,
+                          instances):
+        member_config = {"id": db_info.id,
+                         "instance_type": "member"}
+        name_index = 1
+        for instance in instances:
+            if not instance.get("name"):
+                instance['name'] = "%s-member-%s" % (db_info.name,
+                                                     str(name_index))
+                name_index += 1
+
+        return map(lambda instance:
+                   Instance.create(context,
+                                   instance['name'],
+                                   instance['flavor_id'],
+                                   datastore_version.image_id,
+                                   [], [],
+                                   datastore, datastore_version,
+                                   instance.get('volume_size', None),
+                                   None,
+                                   availability_zone=instance.get(
+                                       'availability_zone', None),
+                                   nics=instance.get('nics', None),
+                                   configuration_id=None,
+                                   cluster_config=member_config
+                                   ),
+                   instances)
+
+    @classmethod
+    def create(cls, context, name, datastore, datastore_version,
+               instances, extended_properties):
+        LOG.debug("Initiating PXC cluster creation.")
+        cls._validate_cluster_instances(context, instances, datastore,
+                                        datastore_version)
         # Updating Cluster Task
         db_info = models.DBCluster.create(
             name=name, tenant_id=context.tenant,
             datastore_version_id=datastore_version.id,
             task_status=ClusterTasks.BUILDING_INITIAL)

-        member_config = {"id": db_info.id,
-                         "instance_type": "member"}
-
-        # Creating member instances
-        for i in range(0, num_instances):
-            instance_name = "%s-member-%s" % (name, str(i + 1))
-            inst_models.Instance.create(context, instance_name,
-                                        flavor_id,
-                                        datastore_version.image_id,
-                                        [], [], datastore,
-                                        datastore_version,
-                                        volume_size, None,
-                                        nics=nics[i],
-                                        availability_zone=azs[i],
-                                        configuration_id=None,
-                                        cluster_config=member_config)
+        cls._create_instances(context, db_info, datastore, datastore_version,
+                              instances)

         # Calling taskmanager to further proceed for cluster-configuration
         task_api.load(context, datastore_version.manager).create_cluster(
@@ -132,6 +185,57 @@ class PXCCluster(models.Cluster):

         return PXCCluster(context, db_info, datastore, datastore_version)

+    def _get_cluster_network_interfaces(self):
+        nova_client = remote.create_nova_client(self.context)
+        nova_instance_id = self.db_instances[0].compute_instance_id
+        interfaces = nova_client.virtual_interfaces.list(nova_instance_id)
+        ret = [{"net-id": getattr(interface, 'net_id')}
+               for interface in interfaces]
+        return ret
+
+    def grow(self, instances):
+        LOG.debug("Growing cluster %s." % self.id)
+
+        self.validate_cluster_available()
+
+        context = self.context
+        db_info = self.db_info
+        datastore = self.ds
+        datastore_version = self.ds_version
+
+        # Get the network of the existing cluster instances.
+        interface_ids = self._get_cluster_network_interfaces()
+        for instance in instances:
+            instance["nics"] = interface_ids
+
+        db_info.update(task_status=ClusterTasks.GROWING_CLUSTER)
+
+        new_instances = self._create_instances(context, db_info,
+                                               datastore, datastore_version,
+                                               instances)
+
+        task_api.load(context, datastore_version.manager).grow_cluster(
+            db_info.id, [instance.id for instance in new_instances])
+
+        return PXCCluster(context, db_info, datastore, datastore_version)
+
+    def shrink(self, instances):
+        """Removes instances from a cluster."""
+        LOG.debug("Shrinking cluster %s." % self.id)
+
+        self.validate_cluster_available()
+        removal_instances = [Instance.load(self.context, inst_id)
+                             for inst_id in instances]
+        db_instances = DBInstance.find_all(cluster_id=self.db_info.id).all()
+        if len(db_instances) - len(removal_instances) < 1:
+            raise exception.ClusterShrinkMustNotLeaveClusterEmpty()
+
+        self.db_info.update(task_status=ClusterTasks.SHRINKING_CLUSTER)
+        task_api.load(self.context, self.ds_version.manager).shrink_cluster(
+            self.db_info.id, [instance.id for instance in removal_instances])
+
+        return PXCCluster(self.context, self.db_info, self.ds, self.ds_version)


 class PXCClusterView(ClusterView):

@@ -33,6 +33,7 @@ class PXCGuestAgentAPI(guest_api.API):

     def install_cluster(self, replication_user, cluster_configuration,
                         bootstrap):
+        """Install the cluster."""
         LOG.debug("Installing PXC cluster.")
         self._call("install_cluster", CONF.cluster_usage_timeout,
                    self.version_cap,
@@ -41,13 +42,27 @@ class PXCGuestAgentAPI(guest_api.API):
                    bootstrap=bootstrap)

     def reset_admin_password(self, admin_password):
-        """Store this password on the instance as the admin password"""
+        """Store this password on the instance as the admin password."""
         self._call("reset_admin_password", CONF.cluster_usage_timeout,
                    self.version_cap,
                    admin_password=admin_password)

     def cluster_complete(self):
-        """Set the status that the cluster is build is complete"""
+        """Set the status that the cluster is build is complete."""
         LOG.debug("Notifying cluster install completion.")
         return self._call("cluster_complete", guest_api.AGENT_HIGH_TIMEOUT,
                           self.version_cap)

+    def get_cluster_context(self):
+        """Get the context of the cluster."""
+        LOG.debug("Getting the cluster context.")
+        return self._call("get_cluster_context", guest_api.AGENT_HIGH_TIMEOUT,
+                          self.version_cap)
+
+    def write_cluster_configuration_overrides(self, cluster_configuration):
+        """Write an updated the cluster configuration."""
+        LOG.debug("Writing an updated the cluster configuration.")
+        self._call("write_cluster_configuration_overrides",
+                   guest_api.AGENT_HIGH_TIMEOUT,
+                   self.version_cap,
+                   cluster_configuration=cluster_configuration)

@@ -15,6 +15,8 @@ from eventlet.timeout import Timeout
 from oslo_log import log as logging

 from trove.common import cfg
+from trove.common.exception import PollTimeOut
 from trove.common.exception import TroveError
 from trove.common.i18n import _
+from trove.common.remote import create_nova_client
 from trove.common.strategies.cluster import base
@@ -22,6 +24,7 @@ from trove.common.template import ClusterConfigTemplate
 from trove.common import utils
 from trove.instance.models import DBInstance
 from trove.instance.models import Instance
+from trove.instance import tasks as inst_tasks
 from trove.taskmanager import api as task_api
 import trove.taskmanager.models as task_models

@@ -74,7 +77,7 @@ class PXCClusterTasks(task_models.ClusterTasks):
             LOG.debug("Waiting for instances to get to cluster-ready status.")
             # Wait for cluster members to get to cluster-ready status.
             if not self._all_instances_ready(instance_ids, cluster_id):
-                return
+                raise TroveError("Instances in cluster did not report ACTIVE")

             LOG.debug("All members ready, proceeding for cluster setup.")
             instances = [Instance.load(context, instance_id) for instance_id
@@ -113,7 +116,8 @@ class PXCClusterTasks(task_models.ClusterTasks):
                 # render the conf.d/cluster.cnf configuration
                 cluster_configuration = self._render_cluster_config(
                     context,
-                    instance, ",".join(cluster_ips),
+                    instance,
+                    ",".join(cluster_ips),
                     cluster_name,
                     replication_user)

@@ -139,7 +143,169 @@ class PXCClusterTasks(task_models.ClusterTasks):
                 raise  # not my timeout
             LOG.exception(_("Timeout for building cluster."))
             self.update_statuses_on_failure(cluster_id)
         except TroveError:
             LOG.exception(_("Error creating cluster %s.") % cluster_id)
             self.update_statuses_on_failure(cluster_id)
         finally:
             timeout.cancel()

         LOG.debug("End create_cluster for id: %s." % cluster_id)

+    def grow_cluster(self, context, cluster_id, new_instance_ids):
+        LOG.debug("Begin pxc grow_cluster for id: %s." % cluster_id)
+
+        def _grow_cluster():
+
+            db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
+            existing_instances = [Instance.load(context, db_inst.id)
+                                  for db_inst in db_instances
+                                  if db_inst.id not in new_instance_ids]
+            if not existing_instances:
+                raise TroveError("Unable to determine existing cluster "
+                                 "member(s)")
+
+            # get list of ips of existing cluster members
+            existing_cluster_ips = [self.get_ip(instance) for instance in
+                                    existing_instances]
+            existing_instance_guests = [self.get_guest(instance)
+                                        for instance in existing_instances]
+
+            # get the cluster context to setup new members
+            cluster_context = existing_instance_guests[0].get_cluster_context()
+
+            # Wait for cluster members to get to cluster-ready status.
+            if not self._all_instances_ready(new_instance_ids, cluster_id):
+                raise TroveError("Instances in cluster did not report ACTIVE")
+
+            LOG.debug("All members ready, proceeding for cluster setup.")
+
+            # Get the new instances to join the cluster
+            new_instances = [Instance.load(context, instance_id)
+                             for instance_id in new_instance_ids]
+            new_cluster_ips = [self.get_ip(instance) for instance in
+                               new_instances]
+            for instance in new_instances:
+                guest = self.get_guest(instance)
+
+                guest.reset_admin_password(cluster_context['admin_password'])
+
+                # render the conf.d/cluster.cnf configuration
+                cluster_configuration = self._render_cluster_config(
+                    context,
+                    instance,
+                    ",".join(existing_cluster_ips),
+                    cluster_context['cluster_name'],
+                    cluster_context['replication_user'])
+
+                # push the cluster config and bootstrap the first instance
+                bootstrap = False
+                guest.install_cluster(cluster_context['replication_user'],
+                                      cluster_configuration,
+                                      bootstrap)
+
+            # apply the new config to all instances
+            for instance in existing_instances + new_instances:
+                guest = self.get_guest(instance)
+                # render the conf.d/cluster.cnf configuration
+                cluster_configuration = self._render_cluster_config(
+                    context,
+                    instance,
+                    ",".join(existing_cluster_ips + new_cluster_ips),
+                    cluster_context['cluster_name'],
+                    cluster_context['replication_user'])
+                guest.write_cluster_configuration_overrides(
+                    cluster_configuration)
+
+            for instance in new_instances:
+                guest = self.get_guest(instance)
+                guest.cluster_complete()
+
+        timeout = Timeout(CONF.cluster_usage_timeout)
+        try:
+            _grow_cluster()
+            self.reset_task()
+        except Timeout as t:
+            if t is not timeout:
+                raise  # not my timeout
+            LOG.exception(_("Timeout for growing cluster."))
+            self.update_statuses_on_failure(
+                cluster_id, status=inst_tasks.InstanceTasks.GROWING_ERROR)
+        except Exception:
+            LOG.exception(_("Error growing cluster %s.") % cluster_id)
+            self.update_statuses_on_failure(
+                cluster_id, status=inst_tasks.InstanceTasks.GROWING_ERROR)
+        finally:
+            timeout.cancel()
+
+        LOG.debug("End grow_cluster for id: %s." % cluster_id)
+
+    def shrink_cluster(self, context, cluster_id, removal_instance_ids):
+        LOG.debug("Begin pxc shrink_cluster for id: %s." % cluster_id)
+
+        def _shrink_cluster():
+            removal_instances = [Instance.load(context, instance_id)
+                                 for instance_id in removal_instance_ids]
+            for instance in removal_instances:
+                Instance.delete(instance)
+
+            # wait for instances to be deleted
+            def all_instances_marked_deleted():
+                non_deleted_instances = DBInstance.find_all(
+                    cluster_id=cluster_id, deleted=False).all()
+                non_deleted_ids = [db_instance.id for db_instance
+                                   in non_deleted_instances]
+                return not bool(
+                    set(removal_instance_ids).intersection(
+                        set(non_deleted_ids))
+                )
+            try:
+                LOG.info(_("Deleting instances (%s)") % removal_instance_ids)
+                utils.poll_until(all_instances_marked_deleted,
+                                 sleep_time=2,
+                                 time_out=CONF.cluster_delete_time_out)
+            except PollTimeOut:
+                LOG.error(_("timeout for instances to be marked as deleted."))
+                return
+
+            db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
+            leftover_instances = [Instance.load(context, db_inst.id)
+                                  for db_inst in db_instances
+                                  if db_inst.id not in removal_instance_ids]
+            leftover_cluster_ips = [self.get_ip(instance) for instance in
+                                    leftover_instances]
+
+            # Get config changes for left over instances
+            rnd_cluster_guest = self.get_guest(leftover_instances[0])
+            cluster_context = rnd_cluster_guest.get_cluster_context()
+
+            # apply the new config to all leftover instances
+            for instance in leftover_instances:
+                guest = self.get_guest(instance)
+                # render the conf.d/cluster.cnf configuration
+                cluster_configuration = self._render_cluster_config(
+                    context,
+                    instance,
+                    ",".join(leftover_cluster_ips),
+                    cluster_context['cluster_name'],
+                    cluster_context['replication_user'])
+                guest.write_cluster_configuration_overrides(
+                    cluster_configuration)
+
+        timeout = Timeout(CONF.cluster_usage_timeout)
+        try:
+            _shrink_cluster()
+            self.reset_task()
+        except Timeout as t:
+            if t is not timeout:
+                raise  # not my timeout
+            LOG.exception(_("Timeout for shrinking cluster."))
+            self.update_statuses_on_failure(
+                cluster_id, status=inst_tasks.InstanceTasks.SHRINKING_ERROR)
+        except Exception:
+            LOG.exception(_("Error shrinking cluster %s.") % cluster_id)
+            self.update_statuses_on_failure(
+                cluster_id, status=inst_tasks.InstanceTasks.SHRINKING_ERROR)
+        finally:
+            timeout.cancel()
+
+        LOG.debug("End shrink_cluster for id: %s." % cluster_id)

@@ -21,6 +21,7 @@ from trove.cluster.views import ClusterView
 from trove.common import cfg
 from trove.common import exception
 from trove.common.exception import TroveError
 from trove.common.i18n import _
 from trove.common import remote
 from trove.common.strategies.cluster import base
 from trove.common import utils

@@ -69,3 +69,14 @@ class Manager(manager.MySqlManager):
         LOG.debug("Storing the admin password on the instance.")
         app = self.mysql_app(self.mysql_app_status.get())
         app.reset_admin_password(admin_password)
+
+    def get_cluster_context(self, context):
+        LOG.debug("Getting the cluster context.")
+        app = self.mysql_app(self.mysql_app_status.get())
+        return app.get_cluster_context()
+
+    def write_cluster_configuration_overrides(self, context,
+                                              cluster_configuration):
+        LOG.debug("Apply the updated cluster configuration.")
+        app = self.mysql_app(self.mysql_app_status.get())
+        app.write_cluster_configuration_overrides(cluster_configuration)

@@ -63,7 +63,7 @@ class PXCApp(service.BaseMySqlApp):
     def _wait_for_mysql_to_be_really_alive(self, max_time):
         utils.poll_until(self._test_mysql, sleep_time=3, time_out=max_time)

-    def secure(self, config_contents, overrides):
+    def secure(self, config_contents):
         LOG.info(_("Generating admin password."))
         admin_password = utils.generate_random_password()
         service.clear_expired_password()
@@ -74,7 +74,6 @@ class PXCApp(service.BaseMySqlApp):
             self._create_admin_user(client, admin_password)
         self.stop_db()
         self._reset_configuration(config_contents, admin_password)
-        self._apply_user_overrides(overrides)
         self.start_mysql()
         # TODO(cp16net) figure out reason for PXC not updating the password
         try:
@@ -93,7 +92,6 @@ class PXCApp(service.BaseMySqlApp):
         self.stop_db()

         self._reset_configuration(config_contents, admin_password)
-        self._apply_user_overrides(overrides)
         self.start_mysql()
         self._wait_for_mysql_to_be_really_alive(
             CONF.timeout_wait_for_service)
@@ -121,13 +119,16 @@ class PXCApp(service.BaseMySqlApp):
             LOG.exception(_("Error bootstrapping cluster."))
             raise RuntimeError(_("Service is not discovered."))

+    def write_cluster_configuration_overrides(self, cluster_configuration):
+        self.configuration_manager.apply_system_override(
+            cluster_configuration, CNF_CLUSTER)
+
     def install_cluster(self, replication_user, cluster_configuration,
                         bootstrap=False):
         LOG.info(_("Installing cluster configuration."))
         self._grant_cluster_replication_privilege(replication_user)
         self.stop_db()
-        self.configuration_manager.apply_system_override(cluster_configuration,
-                                                         CNF_CLUSTER)
+        self.write_cluster_configuration_overrides(cluster_configuration)
         self.wipe_ib_logfiles()
         LOG.debug("bootstrap the instance? : %s" % bootstrap)
         # Have to wait to sync up the joiner instances with the donor instance.
@@ -136,6 +137,20 @@ class PXCApp(service.BaseMySqlApp):
         else:
             self.start_mysql(timeout=CONF.restore_usage_timeout)

+    def get_cluster_context(self):
+        auth = self.configuration_manager.get_value('mysqld').get(
+            "wsrep_sst_auth").replace('"', '')
+        cluster_name = self.configuration_manager.get_value(
+            'mysqld').get("wsrep_cluster_name")
+        return {
+            'replication_user': {
+                'name': auth.split(":")[0],
+                'password': auth.split(":")[1],
+            },
+            'cluster_name': cluster_name,
+            'admin_password': self.get_auth_password()
+        }


 class PXCRootAccess(service.BaseMySqlRootAccess):
     def __init__(self):

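For illustration, given mysqld options such as wsrep_sst_auth =
"sstuser:sstpass" and wsrep_cluster_name = cluster-1 (sample values, not from
this diff), the get_cluster_context() above would return:

    {
        'replication_user': {
            'name': 'sstuser',       # left side of wsrep_sst_auth
            'password': 'sstpass',   # right side of wsrep_sst_auth
        },
        'cluster_name': 'cluster-1',
        'admin_password': '<whatever get_auth_password() returns>',
    }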
@@ -106,6 +106,12 @@ class InstanceTasks(object):
     EJECTION_ERROR = InstanceTask(0x56, 'EJECTING',
                                   'Replica Source Ejection Error.',
                                   is_error=True)
+    GROWING_ERROR = InstanceTask(0x57, 'GROWING',
+                                 'Growing Cluster Error.',
+                                 is_error=True)
+    SHRINKING_ERROR = InstanceTask(0x58, 'SHRINKING',
+                                   'Shrinking Cluster Error.',
+                                   is_error=True)

     # Dissuade further additions at run-time.
     InstanceTask.__init__ = None

@@ -181,7 +181,8 @@ class ConfigurationMixin(object):

 class ClusterTasks(Cluster):

-    def update_statuses_on_failure(self, cluster_id, shard_id=None):
+    def update_statuses_on_failure(self, cluster_id, shard_id=None,
+                                   status=None):

         if CONF.update_status_on_fail:
             if shard_id:
@@ -193,7 +194,7 @@ class ClusterTasks(Cluster):

             for db_instance in db_instances:
                 db_instance.set_task_status(
-                    InstanceTasks.BUILDING_ERROR_SERVER)
+                    status or InstanceTasks.BUILDING_ERROR_SERVER)
                 db_instance.save()

     @classmethod

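The new optional status argument lets cluster task code record an
operation-specific error state instead of the generic build error; for
example, the grow path in this change calls:

    self.update_statuses_on_failure(
        cluster_id, status=inst_tasks.InstanceTasks.GROWING_ERROR)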
@@ -191,6 +191,8 @@ class ClusterActionsRunner(TestRunner):
         self._assert_cluster_action(cluster_id, expected_task_name,
                                     expected_http_code)

+        self._assert_cluster_states(cluster_id, ['NONE'])
         cluster = self.auth_client.clusters.get(cluster_id)
         self.assert_equal(
             len(removed_instance_names),
             initial_instance_count - len(cluster.instances),
@@ -199,7 +201,6 @@ class ClusterActionsRunner(TestRunner):
         cluster_instances = self._get_cluster_instances(cluster_id)
         self.assert_all_instance_states(cluster_instances, ['ACTIVE'])

-        self._assert_cluster_states(cluster_id, ['NONE'])
         self._assert_cluster_response(cluster_id, 'NONE')

     def _find_cluster_instances_by_name(self, cluster, instance_names):
@@ -322,9 +323,3 @@ class PxcClusterActionsRunner(ClusterActionsRunner):
             num_nodes=num_nodes, expected_task_name=expected_task_name,
             expected_instance_states=expected_instance_states,
             expected_http_code=expected_http_code)
-
-    def run_cluster_shrink(self):
-        raise SkipTest("Operation not supported by the datastore.")
-
-    def run_cluster_grow(self):
-        raise SkipTest("Operation not supported by the datastore.")

@@ -196,16 +196,19 @@ class ClusterTest(trove_testtools.TestCase):
             instances, {}
         )

+    @patch.object(inst_models.DBInstance, 'find_all')
     @patch.object(inst_models.Instance, 'create')
     @patch.object(DBCluster, 'create')
     @patch.object(task_api, 'load')
     @patch.object(QUOTAS, 'check_quotas')
     @patch.object(remote, 'create_nova_client')
     def test_create(self, mock_client, mock_check_quotas, mock_task_api,
-                    mock_db_create, mock_ins_create):
+                    mock_db_create, mock_ins_create, mock_find_all):
         instances = self.instances
         flavors = Mock()
+        networks = Mock()
         mock_client.return_value.flavors = flavors
+        mock_client.return_value.networks = networks
         self.cluster.create(Mock(),
                             self.cluster_name,
                             self.datastore,
@@ -215,28 +218,7 @@ class ClusterTest(trove_testtools.TestCase):
             mock_db_create.return_value.id)
         self.assertEqual(3, mock_ins_create.call_count)

-    @patch.object(inst_models.Instance, 'create')
-    @patch.object(DBCluster, 'create')
-    @patch.object(task_api, 'load')
-    @patch.object(QUOTAS, 'check_quotas')
-    @patch.object(remote, 'create_nova_client')
-    def test_create_over_limit(self, mock_client, mock_check_quotas,
-                               mock_task_api, mock_db_create, mock_ins_create):
-        instances = [{'volume_size': 1, 'flavor_id': '1234'},
-                     {'volume_size': 1, 'flavor_id': '1234'},
-                     {'volume_size': 1, 'flavor_id': '1234'},
-                     {'volume_size': 1, 'flavor_id': '1234'}]
-        flavors = Mock()
-        mock_client.return_value.flavors = flavors
-        self.cluster.create(Mock(),
-                            self.cluster_name,
-                            self.datastore,
-                            self.datastore_version,
-                            instances, {})
-        mock_task_api.return_value.create_cluster.assert_called_with(
-            mock_db_create.return_value.id)
-        self.assertEqual(4, mock_ins_create.call_count)
-
+    @patch.object(inst_models.DBInstance, 'find_all')
     @patch.object(pxc_api, 'CONF')
     @patch.object(inst_models.Instance, 'create')
     @patch.object(DBCluster, 'create')
@@ -245,7 +227,8 @@ class ClusterTest(trove_testtools.TestCase):
     @patch.object(remote, 'create_nova_client')
     def test_create_with_ephemeral_flavor(self, mock_client, mock_check_quotas,
                                           mock_task_api, mock_db_create,
-                                          mock_ins_create, mock_conf):
+                                          mock_ins_create, mock_conf,
+                                          mock_find_all):
         class FakeFlavor:
             def __init__(self, flavor_id):
                 self.flavor_id = flavor_id
@@ -300,3 +283,53 @@ class ClusterTest(trove_testtools.TestCase):
         self.cluster.db_info.task_status = ClusterTasks.DELETING
         self.cluster.delete()
         mock_update_db.assert_called_with(task_status=ClusterTasks.DELETING)
+
+    @patch.object(pxc_api.PXCCluster, '_get_cluster_network_interfaces')
+    @patch.object(DBCluster, 'update')
+    @patch.object(pxc_api, 'CONF')
+    @patch.object(inst_models.Instance, 'create')
+    @patch.object(task_api, 'load')
+    @patch.object(QUOTAS, 'check_quotas')
+    @patch.object(remote, 'create_nova_client')
+    def test_grow(self, mock_client, mock_check_quotas, mock_task_api,
+                  mock_inst_create, mock_conf, mock_update, mock_interfaces):
+        mock_client.return_value.flavors = Mock()
+        mock_interfaces.return_value = [Mock()]
+        self.cluster.grow(self.instances)
+        mock_update.assert_called_with(
+            task_status=ClusterTasks.GROWING_CLUSTER)
+        mock_task_api.return_value.grow_cluster.assert_called_with(
+            self.db_info.id,
+            [mock_inst_create.return_value.id] * 3)
+        self.assertEqual(mock_inst_create.call_count, 3)
+        self.assertEqual(mock_interfaces.call_count, 1)
+
+    @patch.object(inst_models.DBInstance, 'find_all')
+    @patch.object(inst_models.Instance, 'load')
+    @patch.object(Cluster, 'validate_cluster_available')
+    def test_shrink_empty(self, mock_validate, mock_load, mock_find_all):
+        instance = Mock()
+        self.assertRaises(
+            exception.ClusterShrinkMustNotLeaveClusterEmpty,
+            self.cluster.shrink, [instance])
+
+    @patch.object(pxc_api.PXCCluster, '__init__')
+    @patch.object(task_api, 'load')
+    @patch.object(DBCluster, 'update')
+    @patch.object(inst_models.DBInstance, 'find_all')
+    @patch.object(inst_models.Instance, 'load')
+    @patch.object(Cluster, 'validate_cluster_available')
+    def test_shrink(self, mock_validate, mock_load, mock_find_all,
+                    mock_update, mock_task_api, mock_init):
+        mock_init.return_value = None
+        existing_instances = [Mock(), Mock()]
+        mock_find_all.return_value.all.return_value = existing_instances
+        instance = Mock()
+        self.cluster.shrink([instance])
+        mock_validate.assert_called_with()
+        mock_update.assert_called_with(
+            task_status=ClusterTasks.SHRINKING_CLUSTER)
+        mock_task_api.return_value.shrink_cluster.assert_called_with(
+            self.db_info.id, [mock_load.return_value.id])
+        mock_init.assert_called_with(self.context, self.db_info,
+                                     self.datastore, self.datastore_version)

@@ -122,3 +122,25 @@ class ApiTest(trove_testtools.TestCase):
         self._verify_rpc_prepare_before_call()
         self._verify_call('cluster_complete')
         self.assertEqual(exp_resp, resp)
+
+    def test_get_cluster_context(self):
+        exp_resp = None
+        self.call_context.call.return_value = exp_resp
+
+        resp = self.api.get_cluster_context()
+
+        self._verify_rpc_prepare_before_call()
+        self._verify_call('get_cluster_context')
+        self.assertEqual(exp_resp, resp)
+
+    def test_write_cluster_configuration_overrides(self):
+        exp_resp = None
+        self.call_context.call.return_value = exp_resp
+
+        resp = self.api.write_cluster_configuration_overrides(
+            cluster_configuration="cluster-configuration")
+
+        self._verify_rpc_prepare_before_call()
+        self._verify_call('write_cluster_configuration_overrides',
+                          cluster_configuration="cluster-configuration",)
+        self.assertEqual(exp_resp, resp)

@@ -57,3 +57,18 @@ class GuestAgentManagerTest(trove_testtools.TestCase):
         self.manager.reset_admin_password(self.context, admin_password)
         self.status_get_mock.assert_any_call()
         reset_admin_pwd.assert_called_with(admin_password)
+
+    @patch.object(dbaas.PXCApp, 'get_cluster_context')
+    def test_get_cluster_context(self, get_cluster_ctxt):
+        get_cluster_ctxt.return_value = {'cluster': 'info'}
+        self.manager.get_cluster_context(self.context)
+        self.status_get_mock.assert_any_call()
+        get_cluster_ctxt.assert_any_call()
+
+    @patch.object(dbaas.PXCApp, 'write_cluster_configuration_overrides')
+    def test_write_cluster_configuration_overrides(self, conf_overries):
+        cluster_configuration = "cluster_configuration"
+        self.manager.write_cluster_configuration_overrides(
+            self.context, cluster_configuration)
+        self.status_get_mock.assert_any_call()
+        conf_overries.assert_called_with(cluster_configuration)

@@ -18,7 +18,7 @@ from mock import patch

 from trove.cluster.models import ClusterTasks as ClusterTaskStatus
 from trove.cluster.models import DBCluster
-from trove.common.exception import GuestError
+from trove.common import exception
 from trove.common.strategies.cluster.experimental.pxc.taskmanager import (
     PXCClusterTasks as ClusterTasks)
 from trove.common.strategies.cluster.experimental.pxc.taskmanager import (
@@ -82,6 +82,14 @@ class PXCClusterTasksTest(trove_testtools.TestCase):
             self.db_cluster,
             datastore=mock_ds1,
             datastore_version=mock_dv1)
+        self.cluster_context = {
+            'replication_user': {
+                'name': "name",
+                'password': "password",
+            },
+            'cluster_name': self.cluster_name,
+            'admin_password': "admin_password"
+        }

     @patch.object(ClusterTasks, 'update_statuses_on_failure')
     @patch.object(InstanceServiceStatus, 'find_by')
@@ -103,7 +111,8 @@ class PXCClusterTasksTest(trove_testtools.TestCase):
                                                 self.cluster_id)
         self.assertTrue(ret_val)

-    @patch.object(ClusterTasks, 'reset_task')
+    @patch('trove.common.strategies.cluster.experimental.pxc.taskmanager.LOG')
+    @patch.object(ClusterTasks, 'update_statuses_on_failure')
     @patch.object(ClusterTasks, '_all_instances_ready', return_value=False)
     @patch.object(Instance, 'load')
     @patch.object(DBInstance, 'find_all')
@@ -111,14 +120,15 @@ class PXCClusterTasksTest(trove_testtools.TestCase):
     @patch.object(datastore_models.DatastoreVersion, 'load_by_uuid')
     def test_create_cluster_instance_not_ready(self, mock_dv, mock_ds,
                                                mock_find_all, mock_load,
-                                               mock_ready, mock_reset_task):
+                                               mock_ready, mock_update,
+                                               mock_logging):
         mock_find_all.return_value.all.return_value = [self.dbinst1]
         mock_load.return_value = BaseInstance(Mock(),
                                               self.dbinst1, Mock(),
                                               InstanceServiceStatus(
                                                   ServiceStatuses.NEW))
         self.clustertasks.create_cluster(Mock(), self.cluster_id)
-        mock_reset_task.assert_called_with()
+        mock_update.assert_called_with(self.cluster_id)

     @patch.object(ClusterTasks, 'update_statuses_on_failure')
     @patch.object(ClusterTasks, 'reset_task')
@@ -139,13 +149,83 @@ class PXCClusterTasksTest(trove_testtools.TestCase):
                                                   ServiceStatuses.NEW))
         mock_ip.return_value = "10.0.0.2"
         guest_client = Mock()
-        guest_client.install_cluster = Mock(side_effect=GuestError("Error"))
+        guest_client.install_cluster = Mock(
+            side_effect=exception.GuestError("Error"))
         with patch.object(ClusterTasks, 'get_guest',
                           return_value=guest_client):
             self.clustertasks.create_cluster(Mock(), self.cluster_id)
             mock_update_status.assert_called_with('1232')
             mock_reset_task.assert_called_with()

+    @patch.object(ClusterTasks, 'update_statuses_on_failure')
+    @patch('trove.common.strategies.cluster.experimental.pxc.taskmanager.LOG')
+    def test_grow_cluster_does_not_exist(self, mock_logging,
+                                         mock_update_status):
+        context = Mock()
+        bad_cluster_id = '1234'
+        new_instances = [Mock(), Mock()]
+        self.clustertasks.grow_cluster(context, bad_cluster_id, new_instances)
+        mock_update_status.assert_called_with(
+            '1234',
+            status=InstanceTasks.GROWING_ERROR)
+
+    @patch.object(ClusterTasks, 'reset_task')
+    @patch.object(ClusterTasks, '_render_cluster_config')
+    @patch.object(ClusterTasks, 'get_ip')
+    @patch.object(ClusterTasks, 'get_guest')
+    @patch.object(ClusterTasks, '_all_instances_ready', return_value=True)
+    @patch.object(Instance, 'load')
+    @patch.object(DBInstance, 'find_all')
+    @patch.object(datastore_models.Datastore, 'load')
+    @patch.object(datastore_models.DatastoreVersion, 'load_by_uuid')
+    def test_grow_cluster_successs(self, mock_dv, mock_ds, mock_find_all,
+                                   mock_load, mock_ready, mock_guest, mock_ip,
+                                   mock_render, mock_reset_task):
+        mock_find_all.return_value.all.return_value = [self.dbinst1]
+
+        mock_ip.return_value = "10.0.0.2"
+        context = Mock()
+        new_instances = [Mock(), Mock()]
+        mock_guest.get_cluster_context = Mock(
+            return_value=self.cluster_context)
+        mock_guest.reset_admin_password = Mock()
+        self.clustertasks.grow_cluster(context, self.cluster_id,
+                                       new_instances)
+        mock_reset_task.assert_called_with()
+
+    @patch.object(ClusterTasks, 'reset_task')
+    @patch.object(Instance, 'load')
+    @patch.object(Instance, 'delete')
+    @patch.object(DBInstance, 'find_all')
+    @patch.object(ClusterTasks, 'get_guest')
+    @patch.object(ClusterTasks, 'get_ip')
+    @patch.object(ClusterTasks, '_render_cluster_config')
+    def test_shrink_cluster_success(self, mock_render, mock_ip, mock_guest,
+                                    mock_find_all, mock_delete, mock_load,
+                                    mock_reset_task):
+        mock_find_all.return_value.all.return_value = [self.dbinst1]
+        context = Mock()
+        remove_instances = [Mock()]
+        mock_ip.return_value = "10.0.0.2"
+        mock_guest.get_cluster_context = Mock(
+            return_value=self.cluster_context)
+        self.clustertasks.shrink_cluster(context, self.cluster_id,
+                                         remove_instances)
+        mock_reset_task.assert_called_with()
+
+    @patch.object(ClusterTasks, 'update_statuses_on_failure')
+    @patch('trove.common.strategies.cluster.experimental.pxc.taskmanager.LOG')
+    def test_shrink_cluster_does_not_exist(self, mock_logging,
+                                           mock_update_status):
+        context = Mock()
+        bad_cluster_id = '1234'
+        remove_instances = [Mock()]
+        self.clustertasks.shrink_cluster(context, bad_cluster_id,
+                                         remove_instances)
+        mock_update_status.assert_called_with(
+            '1234',
+            status=InstanceTasks.SHRINKING_ERROR)


 class PXCTaskManagerStrategyTest(trove_testtools.TestCase):

@@ -12,8 +12,13 @@
 # License for the specific language governing permissions and limitations
 # under the License.

+DB_SETUP = None
+
+
 def init_db():
+    global DB_SETUP
+    if DB_SETUP:
+        return
     from trove.common import cfg
     from trove.db import get_db_api
     from trove.db.sqlalchemy import session
@@ -21,3 +26,4 @@ def init_db():
     db_api = get_db_api()
     db_api.db_sync(CONF)
     session.configure_db(CONF)
+    DB_SETUP = True