diff --git a/etc/neutron.conf b/etc/neutron.conf index c3cf25a4a7..805bdd608f 100644 --- a/etc/neutron.conf +++ b/etc/neutron.conf @@ -162,6 +162,10 @@ lock_path = $state_path/lock # routers to first L3 agent which sends sync_routers message to neutron server # router_auto_schedule = True +# Allow automatic rescheduling of routers from dead L3 agents with +# admin_state_up set to True to alive agents. +# allow_automatic_l3agent_failover = False + # Number of DHCP agents scheduled to host a network. This enables redundant # DHCP agents for configured networks. # dhcp_agents_per_network = 1 diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index 3296845a8c..478c235712 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -12,6 +12,9 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. +import datetime +import random +import time from oslo.config import cfg import sqlalchemy as sa @@ -21,11 +24,19 @@ from sqlalchemy.orm import exc from sqlalchemy.orm import joinedload from neutron.common import constants +from neutron import context as n_ctx from neutron.db import agents_db from neutron.db import agentschedulers_db from neutron.db import model_base from neutron.extensions import l3agentscheduler from neutron import manager +from neutron.openstack.common.gettextutils import _LI, _LW +from neutron.openstack.common import log as logging +from neutron.openstack.common import loopingcall +from neutron.openstack.common import timeutils + + +LOG = logging.getLogger(__name__) L3_AGENTS_SCHEDULER_OPTS = [ cfg.StrOpt('router_scheduler_driver', @@ -34,6 +45,9 @@ L3_AGENTS_SCHEDULER_OPTS = [ 'router to a default L3 agent')), cfg.BoolOpt('router_auto_schedule', default=True, help=_('Allow auto scheduling of routers to L3 agent.')), + cfg.BoolOpt('allow_automatic_l3agent_failover', 
def reschedule_routers_from_down_agents(self):
    """Reschedule routers from down l3 agents if admin state is up.

    An agent is treated as dead once its last heartbeat is older than
    twice ``cfg.CONF.agent_down_time``.  Every router binding that points
    at such an agent, provided the agent's ``admin_state_up`` flag is
    set, is passed to ``self.reschedule_router``.

    If the time elapsed since the previous run exceeds
    ``cfg.CONF.agent_down_time``, the method first sleeps for the dead
    limit so that agents get a chance to report in before being judged.
    """
    # give agents extra time to handle transient failures
    agent_dead_limit = cfg.CONF.agent_down_time * 2

    # check for an abrupt clock change since last check. if a change is
    # detected, sleep for a while to let the agents check in.
    # The clock is read once so the very first run (no canary stored yet)
    # yields a delta of exactly zero.
    now = timeutils.utcnow()
    tdelta = now - getattr(self, '_clock_jump_canary', now)
    if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
        LOG.warn(_LW("Time since last L3 agent reschedule check has "
                     "exceeded the interval between checks. Waiting "
                     "before check to allow agents to send a heartbeat "
                     "in case there was a clock adjustment."))
        time.sleep(agent_dead_limit)
    self._clock_jump_canary = timeutils.utcnow()

    context = n_ctx.get_admin_context()
    cutoff = timeutils.utcnow() - datetime.timedelta(
        seconds=agent_dead_limit)
    # The join ties each binding to its own agent row.  Without it the
    # filter applies to the cross product of bindings and agents, so one
    # dead agent would make every binding in the table match.
    down_bindings = (
        context.session.query(RouterL3AgentBinding).
        join(agents_db.Agent).
        filter(agents_db.Agent.heartbeat_timestamp < cutoff,
               agents_db.Agent.admin_state_up))
    for binding in down_bindings:
        LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
                     "because the agent did not report to the server in "
                     "the last %(dead_time)s seconds."),
                 {'router': binding.router_id,
                  'agent': binding.l3_agent_id,
                  'dead_time': agent_dead_limit})
        self.reschedule_router(context, binding.router_id)
def test_router_no_reschedule_from_dead_admin_down_agent(self):
    """A dead agent with admin_state_up False keeps its router binding."""
    with self.router() as router:
        rpc_callbacks = l3_rpc_base.L3RpcCallbackMixin()
        self._register_agent_states()

        # Bind the router to host A, then mark A admin-down and let its
        # heartbeat go stale before running the reschedule pass.
        rpc_callbacks.sync_routers(self.adminContext, host=L3_HOSTA)
        self._set_agent_admin_state_up(L3_HOSTA, False)
        self._take_down_agent_and_run_reschedule(L3_HOSTA)

        # The binding must still point at host A: agents that are
        # administratively down are excluded from rescheduling.
        binding_model = l3_agentschedulers_db.RouterL3AgentBinding
        router_id = router['router']['id']
        binding_query = self.adminContext.session.query(binding_model)
        binding = binding_query.filter(
            binding_model.router_id == router_id).first()
        self.assertEqual(binding.l3_agent.host, L3_HOSTA)

        # Host B must not have been handed the router.
        routers_on_b = rpc_callbacks.sync_routers(self.adminContext,
                                                  host=L3_HOSTB)
        self.assertFalse(routers_on_b)