diff --git a/trove/conductor/manager.py b/trove/conductor/manager.py index 22fb75c578..d5738e198c 100644 --- a/trove/conductor/manager.py +++ b/trove/conductor/manager.py @@ -24,7 +24,7 @@ from trove.common.serializable_notification import SerializableNotification from trove.conductor.models import LastSeen from trove.extensions.mysql import models as mysql_models from trove.instance import models as inst_models -from trove.instance.service_status import ServiceStatus +from trove.instance import service_status as svc_status LOG = logging.getLogger(__name__) CONF = cfg.CONF @@ -86,11 +86,20 @@ class Manager(periodic_task.PeriodicTasks): "payload": str(payload)}) status = inst_models.InstanceServiceStatus.find_by( instance_id=instance_id) + if self._message_too_old(instance_id, 'heartbeat', sent): return + + if status.get_status() == svc_status.ServiceStatuses.RESTART_REQUIRED: + LOG.debug("Instance %s service status is RESTART_REQUIRED, " + "skip heartbeat", instance_id) + return + if payload.get('service_status') is not None: status.set_status( - ServiceStatus.from_description(payload['service_status'])) + svc_status.ServiceStatus.from_description( + payload['service_status']) + ) status.save() def update_backup(self, context, instance_id, backup_id, diff --git a/trove/guestagent/guest_log.py b/trove/guestagent/guest_log.py index 1b7c85d6f6..570a08edbe 100644 --- a/trove/guestagent/guest_log.py +++ b/trove/guestagent/guest_log.py @@ -15,6 +15,7 @@ import enum import hashlib import os +from pathlib import Path from requests.exceptions import ConnectionError from oslo_log import log as logging @@ -245,34 +246,36 @@ class GuestLog(object): 'published': self._published_size}) def _update_details(self): - # Make sure we can read the file - if not self._file_readable or not os.access(self._file, os.R_OK): - if not os.access(self._file, os.R_OK): - if operating_system.exists(self._file, as_root=True): - operating_system.chmod( - self._file, FileMode.ADD_ALL_R, as_root=True) - self._file_readable = True + if operating_system.exists(self._file, as_root=True): + file_path = Path(self._file) - if os.path.isfile(self._file): - logstat = os.stat(self._file) - self._size = logstat.st_size + # Make sure guest agent can read the log file. + if not os.access(self._file, os.R_OK): + operating_system.chmod(self._file, FileMode.ADD_ALL_R, + as_root=True) + operating_system.chmod(str(file_path.parent), + FileMode.ADD_GRP_RX_OTH_RX, + as_root=True) + + self._size = file_path.stat().st_size self._update_log_header_digest(self._file) if self.status != LogStatus.Disabled: if self._log_rotated(): self.status = LogStatus.Rotated # See if we have stuff to publish - elif logstat.st_size > self._published_size: + elif self._size > self._published_size: self._set_status(self._published_size, LogStatus.Partial, LogStatus.Ready) # We've published everything so far - elif logstat.st_size == self._published_size: + elif self._size == self._published_size: self._set_status(self._published_size, LogStatus.Published, LogStatus.Enabled) # We've already handled this case (log rotated) so what gives? else: raise Exception(_("Bug in _log_rotated ?")) else: + LOG.warning(f"File {self._file} does not exist") self._published_size = 0 self._size = 0 diff --git a/trove/instance/models.py b/trove/instance/models.py index cbd6933a4f..b04e3ae68a 100644 --- a/trove/instance/models.py +++ b/trove/instance/models.py @@ -607,23 +607,30 @@ def load_instance(cls, context, id, needs_server=False, return cls(context, db_info, server, service_status) -def load_instance_with_info(cls, context, id, cluster_id=None): - db_info = get_db_info(context, id, cluster_id) - - service_status = InstanceServiceStatus.find_by(instance_id=id) - if (db_info.task_status == InstanceTasks.NONE and - not service_status.is_uptodate()): - LOG.warning('Guest agent heartbeat for instance %s has expried', id) +def update_service_status(task_status, service_status, ins_id): + """Update service status as needed.""" + if (task_status == InstanceTasks.NONE and + service_status.status != srvstatus.ServiceStatuses.RESTART_REQUIRED and + not service_status.is_uptodate()): + LOG.warning('Guest agent heartbeat for instance %s has expried', + ins_id) service_status.status = \ srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT + +def load_instance_with_info(cls, context, ins_id, cluster_id=None): + db_info = get_db_info(context, ins_id, cluster_id) + service_status = InstanceServiceStatus.find_by(instance_id=ins_id) + + update_service_status(db_info.task_status, service_status, ins_id) + load_simple_instance_server_status(context, db_info) load_simple_instance_addresses(context, db_info) instance = cls(context, db_info, service_status) - load_guest_info(instance, context, id) + load_guest_info(instance, context, ins_id) load_server_group_info(instance, context) @@ -901,6 +908,11 @@ class BaseInstance(SimpleInstance): del_instance.set_status(srvstatus.ServiceStatuses.DELETED) del_instance.save() + def set_servicestatus_restart(self): + del_instance = InstanceServiceStatus.find_by(instance_id=self.id) + del_instance.set_status(srvstatus.ServiceStatuses.RESTARTING) + del_instance.save() + def set_instance_fault_deleted(self): try: del_fault = DBInstanceFault.find_by(instance_id=self.id) @@ -1442,7 +1454,10 @@ class Instance(BuiltInstance): LOG.info("Rebooting instance %s.", self.id) if self.db_info.cluster_id is not None and not self.context.is_admin: raise exception.ClusterInstanceOperationNotSupported() + self.update_db(task_status=InstanceTasks.REBOOTING) + self.set_servicestatus_restart() + task_api.API(self.context).reboot(self.id) def restart(self): @@ -1450,13 +1465,10 @@ class Instance(BuiltInstance): LOG.info("Restarting datastore on instance %s.", self.id) if self.db_info.cluster_id is not None and not self.context.is_admin: raise exception.ClusterInstanceOperationNotSupported() - # Set our local status since Nova might not change it quick enough. - # TODO(tim.simpson): Possible bad stuff can happen if this service - # shuts down before it can set status to NONE. - # We need a last updated time to mitigate this; - # after some period of tolerance, we'll assume the - # status is no longer in effect. + self.update_db(task_status=InstanceTasks.REBOOTING) + self.set_servicestatus_restart() + task_api.API(self.context).restart(self.id) def detach_replica(self): @@ -1829,9 +1841,9 @@ class Instances(object): db.server_status = "SHUTDOWN" # Fake it... db.addresses = [] - datastore_status = InstanceServiceStatus.find_by( + service_status = InstanceServiceStatus.find_by( instance_id=db.id) - if not datastore_status.status: # This should never happen. + if not service_status.status: # This should never happen. LOG.error("Server status could not be read for " "instance id(%s).", db.id) continue @@ -1839,24 +1851,15 @@ class Instances(object): # Get the real-time service status. LOG.debug('Task status for instance %s: %s', db.id, db.task_status) - if db.task_status == InstanceTasks.NONE: - last_heartbeat_delta = ( - timeutils.utcnow() - datastore_status.updated_at) - agent_expiry_interval = timedelta( - seconds=CONF.agent_heartbeat_expiry) - if last_heartbeat_delta > agent_expiry_interval: - LOG.warning( - 'Guest agent heartbeat for instance %s has ' - 'expried', id) - datastore_status.status = \ - srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT + update_service_status(db.task_status, service_status, db.id) except exception.ModelNotFoundError: LOG.error("Server status could not be read for " "instance id(%s).", db.id) continue - ret.append(load_instance(context, db, datastore_status, - server=server)) + ret.append( + load_instance(context, db, service_status, server=server) + ) return ret diff --git a/trove/instance/service_status.py b/trove/instance/service_status.py index 913ecb5a89..6c1c3fa99c 100644 --- a/trove/instance/service_status.py +++ b/trove/instance/service_status.py @@ -36,6 +36,7 @@ class ServiceStatus(object): ServiceStatuses.CRASHED._code, ServiceStatuses.BLOCKED._code, ServiceStatuses.HEALTHY._code, + ServiceStatuses.RESTART_REQUIRED._code, ] return self._code in allowed_statuses diff --git a/trove/tests/unittests/instance/test_service.py b/trove/tests/unittests/instance/test_service.py index a38c62f57b..807cb161ad 100644 --- a/trove/tests/unittests/instance/test_service.py +++ b/trove/tests/unittests/instance/test_service.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from datetime import timedelta from unittest import mock from trove.common import cfg from trove.common import clients from trove.common import exception +from trove.common import timeutils from trove.datastore import models as ds_models from trove.instance import models as ins_models from trove.instance import service @@ -174,3 +176,80 @@ class TestInstanceController(trove_testtools.TestCase): mock.ANY, "upgrade", instance_id=instance.id, datastore_version_id=new_ds_version.id) + + @mock.patch('trove.instance.models.load_server_group_info') + @mock.patch('trove.instance.models.load_guest_info') + @mock.patch('trove.instance.models.load_simple_instance_addresses') + @mock.patch('trove.instance.models.load_simple_instance_server_status') + def test_show_with_restart_required(self, load_server_mock, + load_addr_mock, load_guest_mock, + load_server_grp_mock): + # Create an instance in db. + instance = ins_models.DBInstance.create( + name=self.random_name('instance'), + flavor_id=self.random_uuid(), + tenant_id=self.random_uuid(), + volume_size=1, + datastore_version_id=self.ds_version_imageid.id, + task_status=ins_models.InstanceTasks.NONE, + compute_instance_id=self.random_uuid(), + server_status='ACTIVE' + ) + ins_models.InstanceServiceStatus.create( + instance_id=instance.id, + status=srvstatus.ServiceStatuses.RESTART_REQUIRED, + ) + + # workaround to reset updated_at field. + service_status = ins_models.InstanceServiceStatus.find_by( + instance_id=instance.id) + service_status.updated_at = timeutils.utcnow() - timedelta( + seconds=(CONF.agent_heartbeat_expiry + 60)) + ins_models.get_db_api().save(service_status) + + ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id) + self.assertEqual(200, ret.status) + + ret_instance = ret.data(None)['instance'] + + self.assertEqual('ACTIVE', ret_instance.get('status')) + self.assertEqual('RESTART_REQUIRED', + ret_instance.get('operating_status')) + + @mock.patch('trove.instance.models.load_server_group_info') + @mock.patch('trove.instance.models.load_guest_info') + @mock.patch('trove.instance.models.load_simple_instance_addresses') + @mock.patch('trove.instance.models.load_simple_instance_server_status') + def test_show_without_restart_required(self, load_server_mock, + load_addr_mock, load_guest_mock, + load_server_grp_mock): + # Create an instance in db. + instance = ins_models.DBInstance.create( + name=self.random_name('instance'), + flavor_id=self.random_uuid(), + tenant_id=self.random_uuid(), + volume_size=1, + datastore_version_id=self.ds_version_imageid.id, + task_status=ins_models.InstanceTasks.NONE, + compute_instance_id=self.random_uuid(), + server_status='ACTIVE' + ) + ins_models.InstanceServiceStatus.create( + instance_id=instance.id, + status=srvstatus.ServiceStatuses.HEALTHY, + ) + + # workaround to reset updated_at field. + service_status = ins_models.InstanceServiceStatus.find_by( + instance_id=instance.id) + service_status.updated_at = timeutils.utcnow() - timedelta( + seconds=(CONF.agent_heartbeat_expiry + 60)) + ins_models.get_db_api().save(service_status) + + ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id) + self.assertEqual(200, ret.status) + + ret_instance = ret.data(None)['instance'] + + self.assertEqual('ACTIVE', ret_instance.get('status')) + self.assertEqual('ERROR', ret_instance.get('operating_status'))