Split conductor-specific RPCService

The current implementation in common makes many assumptions that the
manager is a conductor manager. To be able to reuse the same base RPC
service for the PXE filter, split the conductor-specific part away from
the common one.

Change-Id: I4d24cf82d62cb034840435ef15b5373748b65f09
Dmitry Tantsur 2024-02-26 18:16:17 +01:00
parent 0cfe290d15
commit a9397f49d5
6 changed files with 138 additions and 103 deletions
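
For context on the intended reuse, here is a minimal sketch of how a
non-conductor service could build on the split-out base class. The PXE
filter module path and manager class name below are hypothetical
placeholders; the PXE filter itself is not part of this commit.

    # Hypothetical sketch only: 'ironic.pxe_filter.service' and
    # 'PXEFilterManager' are assumed names. BaseRPCService resolves the
    # module/class strings itself and expects the manager to accept
    # (host) and expose a .topic attribute.
    from ironic.common import rpc_service


    class PXEFilterService(rpc_service.BaseRPCService):
        """RPC service without conductor-specific shutdown or signal logic."""

        def __init__(self, host):
            super().__init__(host,
                             'ironic.pxe_filter.service',
                             'PXEFilterManager')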


@@ -25,9 +25,9 @@ from oslo_config import cfg
 from oslo_log import log
 from oslo_service import service
 
-from ironic.common import rpc_service
 from ironic.common import service as ironic_service
 from ironic.common import utils
+from ironic.conductor import rpc_service
 
 CONF = cfg.CONF


@@ -17,9 +17,9 @@ from oslo_log import log
 from oslo_service import service
 
 from ironic.cmd import conductor as conductor_cmd
-from ironic.common import rpc_service
 from ironic.common import service as ironic_service
 from ironic.common import wsgi_service
+from ironic.conductor import rpc_service
 
 CONF = cfg.CONF


@@ -14,8 +14,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-import datetime
-import signal
 import sys
 import time
@@ -25,7 +23,6 @@ from oslo_log import log
 import oslo_messaging as messaging
 from oslo_service import service
 from oslo_utils import importutils
-from oslo_utils import timeutils
 
 from ironic.common import context
 from ironic.common import rpc
@@ -35,20 +32,18 @@ LOG = log.getLogger(__name__)
 CONF = cfg.CONF
 
 
-class RPCService(service.Service):
+class BaseRPCService(service.Service):
 
     def __init__(self, host, manager_module, manager_class):
-        super(RPCService, self).__init__()
+        super().__init__()
         self.host = host
         manager_module = importutils.try_import(manager_module)
         manager_class = getattr(manager_module, manager_class)
-        self.manager = manager_class(host, rpc.MANAGER_TOPIC)
+        self.manager = manager_class(host)
         self.topic = self.manager.topic
         self.rpcserver = None
-        self.deregister = True
-        self._failure = None
         self._started = False
-        self.draining = False
+        self._failure = None
 
     def wait_for_start(self):
         while not self._started and not self._failure:
@@ -60,7 +55,7 @@ class RPCService(service.Service):
     def start(self):
         self._failure = None
         self._started = False
-        super(RPCService, self).start()
+        super().start()
         try:
             self._real_start()
         except Exception as exc:
@@ -69,6 +64,9 @@
         else:
             self._started = True
 
+    def handle_signal(self):
+        pass
+
     def _real_start(self):
         admin_context = context.get_admin_context()
@@ -88,97 +86,8 @@
         self.handle_signal()
         self.manager.init_host(admin_context)
-        rpc.set_global_manager(self.manager)
 
         LOG.info('Created RPC server with %(transport)s transport for service '
                  '%(service)s on host %(host)s.',
                  {'service': self.topic, 'host': self.host,
                   'transport': CONF.rpc_transport})
-
-    def stop(self):
-        initial_time = timeutils.utcnow()
-        extend_time = initial_time + datetime.timedelta(
-            seconds=CONF.hash_ring_reset_interval)
-
-        try:
-            self.manager.del_host(deregister=self.deregister,
-                                  clear_node_reservations=False)
-        except Exception as e:
-            LOG.exception('Service error occurred when cleaning up '
-                          'the RPC manager. Error: %s', e)
-
-        if self.manager.get_online_conductor_count() > 1:
-            # Delay stopping the server until the hash ring has been
-            # reset on the cluster
-            stop_time = timeutils.utcnow()
-            if stop_time < extend_time:
-                stop_wait = max(0, (extend_time - stop_time).seconds)
-                LOG.info('Waiting %(stop_wait)s seconds for hash ring reset.',
-                         {'stop_wait': stop_wait})
-                time.sleep(stop_wait)
-
-        try:
-            if self.rpcserver is not None:
-                self.rpcserver.stop()
-                self.rpcserver.wait()
-        except Exception as e:
-            LOG.exception('Service error occurred when stopping the '
-                          'RPC server. Error: %s', e)
-
-        super(RPCService, self).stop(graceful=True)
-        LOG.info('Stopped RPC server for service %(service)s on host '
-                 '%(host)s.',
-                 {'service': self.topic, 'host': self.host})
-
-        # Wait for reservation locks held by this conductor.
-        # The conductor process will end when one of the following occurs:
-        # - All reservations for this conductor are released
-        # - shutdown_timeout has elapsed
-        # - The process manager (systemd, kubernetes) sends SIGKILL after the
-        #   configured timeout period
-        while (self.manager.has_reserved()
-               and not self._shutdown_timeout_reached(initial_time)):
-            LOG.info('Waiting for reserved nodes to clear on host %(host)s',
-                     {'host': self.host})
-            time.sleep(1)
-
-        # Stop the keepalive heartbeat greenthread sending touch(online=False)
-        self.manager.keepalive_halt()
-        rpc.set_global_manager(None)
-
-    def _shutdown_timeout_reached(self, initial_time):
-        if self.draining:
-            shutdown_timeout = CONF.drain_shutdown_timeout
-        else:
-            shutdown_timeout = CONF.graceful_shutdown_timeout
-        if shutdown_timeout == 0:
-            # No timeout, run until no nodes are reserved
-            return False
-        shutdown_time = initial_time + datetime.timedelta(
-            seconds=shutdown_timeout)
-        return shutdown_time < timeutils.utcnow()
-
-    def _handle_no_deregister(self, signo, frame):
-        LOG.info('Got signal SIGUSR1. Not deregistering on next shutdown '
-                 'of service %(service)s on host %(host)s.',
-                 {'service': self.topic, 'host': self.host})
-        self.deregister = False
-
-    def _handle_drain(self, signo, frame):
-        LOG.info('Got signal SIGUSR2. Starting drain shutdown'
-                 'of service %(service)s on host %(host)s.',
-                 {'service': self.topic, 'host': self.host})
-        self.draining = True
-        self.stop()
-
-    def handle_signal(self):
-        """Add a signal handler for SIGUSR1, SIGUSR2.
-
-        The SIGUSR1 handler ensures that the manager is not deregistered when
-        it is shutdown.
-
-        The SIGUSR2 handler starts a drain shutdown.
-        """
-        signal.signal(signal.SIGUSR1, self._handle_no_deregister)
-        signal.signal(signal.SIGUSR2, self._handle_drain)


@@ -59,6 +59,7 @@ from ironic.common import faults
 from ironic.common.i18n import _
 from ironic.common import network
 from ironic.common import nova
+from ironic.common import rpc
 from ironic.common import states
 from ironic.conductor import allocations
 from ironic.conductor import base_manager
@@ -100,7 +101,7 @@ class ConductorManager(base_manager.BaseConductorManager):
     target = messaging.Target(version=RPC_API_VERSION)
 
-    def __init__(self, host, topic):
+    def __init__(self, host, topic=rpc.MANAGER_TOPIC):
         super(ConductorManager, self).__init__(host, topic)
         # NOTE(TheJulia): This is less a metric-able count, but a means to
         # sort out nodes and prioritise a subset (of non-responding nodes).
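
Why the default matters: BaseRPCService now instantiates the manager as
manager_class(host) with no topic argument, so the conductor manager has to
supply the standard topic itself. A short illustration, not part of the
diff ('conductor-host' is a placeholder):

    # Both forms now resolve to the same conductor topic:
    ConductorManager('conductor-host')                      # defaults to rpc.MANAGER_TOPIC
    ConductorManager('conductor-host', rpc.MANAGER_TOPIC)   # explicit, as before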


@@ -0,0 +1,125 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import signal
+import time
+
+from oslo_config import cfg
+from oslo_log import log
+from oslo_utils import timeutils
+
+from ironic.common import rpc
+from ironic.common import rpc_service
+
+LOG = log.getLogger(__name__)
+
+CONF = cfg.CONF
+
+
+class RPCService(rpc_service.BaseRPCService):
+
+    def __init__(self, host, manager_module, manager_class):
+        super().__init__(host, manager_module, manager_class)
+        self.deregister = True
+        self.draining = False
+
+    def _real_start(self):
+        super()._real_start()
+        rpc.set_global_manager(self.manager)
+
+    def stop(self):
+        initial_time = timeutils.utcnow()
+        extend_time = initial_time + datetime.timedelta(
+            seconds=CONF.hash_ring_reset_interval)
+
+        try:
+            self.manager.del_host(deregister=self.deregister,
+                                  clear_node_reservations=False)
+        except Exception as e:
+            LOG.exception('Service error occurred when cleaning up '
+                          'the RPC manager. Error: %s', e)
+
+        if self.manager.get_online_conductor_count() > 1:
+            # Delay stopping the server until the hash ring has been
+            # reset on the cluster
+            stop_time = timeutils.utcnow()
+            if stop_time < extend_time:
+                stop_wait = max(0, (extend_time - stop_time).seconds)
+                LOG.info('Waiting %(stop_wait)s seconds for hash ring reset.',
+                         {'stop_wait': stop_wait})
+                time.sleep(stop_wait)
+
+        try:
+            if self.rpcserver is not None:
+                self.rpcserver.stop()
+                self.rpcserver.wait()
+        except Exception as e:
+            LOG.exception('Service error occurred when stopping the '
+                          'RPC server. Error: %s', e)
+
+        super().stop(graceful=True)
+        LOG.info('Stopped RPC server for service %(service)s on host '
+                 '%(host)s.',
+                 {'service': self.topic, 'host': self.host})
+
+        # Wait for reservation locks held by this conductor.
+        # The conductor process will end when one of the following occurs:
+        # - All reservations for this conductor are released
+        # - shutdown_timeout has elapsed
+        # - The process manager (systemd, kubernetes) sends SIGKILL after the
+        #   configured timeout period
+        while (self.manager.has_reserved()
+               and not self._shutdown_timeout_reached(initial_time)):
+            LOG.info('Waiting for reserved nodes to clear on host %(host)s',
+                     {'host': self.host})
+            time.sleep(1)
+
+        # Stop the keepalive heartbeat greenthread sending touch(online=False)
+        self.manager.keepalive_halt()
+        rpc.set_global_manager(None)
+
+    def _shutdown_timeout_reached(self, initial_time):
+        if self.draining:
+            shutdown_timeout = CONF.drain_shutdown_timeout
+        else:
+            shutdown_timeout = CONF.graceful_shutdown_timeout
+        if shutdown_timeout == 0:
+            # No timeout, run until no nodes are reserved
+            return False
+        shutdown_time = initial_time + datetime.timedelta(
+            seconds=shutdown_timeout)
+        return shutdown_time < timeutils.utcnow()
+
+    def _handle_no_deregister(self, signo, frame):
+        LOG.info('Got signal SIGUSR1. Not deregistering on next shutdown '
+                 'of service %(service)s on host %(host)s.',
+                 {'service': self.topic, 'host': self.host})
+        self.deregister = False
+
+    def _handle_drain(self, signo, frame):
+        LOG.info('Got signal SIGUSR2. Starting drain shutdown'
+                 'of service %(service)s on host %(host)s.',
+                 {'service': self.topic, 'host': self.host})
+        self.draining = True
+        self.stop()
+
+    def handle_signal(self):
+        """Add a signal handler for SIGUSR1, SIGUSR2.
+
+        The SIGUSR1 handler ensures that the manager is not deregistered when
+        it is shutdown.
+
+        The SIGUSR2 handler starts a drain shutdown.
+        """
+        signal.signal(signal.SIGUSR1, self._handle_no_deregister)
+        signal.signal(signal.SIGUSR2, self._handle_drain)
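
For completeness, a hedged sketch of the call site that consumes this new
module after the import change in the first hunk. The constructor arguments
are assumed from the pre-existing ironic.cmd.conductor code and are not
shown in this diff:

    from ironic.conductor import rpc_service

    # The conductor command builds the conductor-specific service; the
    # manager module/class strings are resolved by BaseRPCService.
    mgr = rpc_service.RPCService(CONF.host,
                                 'ironic.conductor.manager',
                                 'ConductorManager')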


@@ -21,9 +21,9 @@ from oslo_utils import timeutils
 from ironic.common import context
 from ironic.common import rpc
-from ironic.common import rpc_service
 from ironic.common import service as ironic_service
 from ironic.conductor import manager
+from ironic.conductor import rpc_service
 from ironic.objects import base as objects_base
 from ironic.tests.unit.db import base as db_base
 from ironic.tests.unit.db import utils as db_utils