diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index da82789507..bef3119765 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload from sqlalchemy import sql from neutron.common import constants +from neutron.common import rpc as n_rpc from neutron.common import utils as n_utils from neutron import context as n_ctx from neutron.db import agents_db @@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db from neutron.db import model_base from neutron.extensions import l3agentscheduler from neutron import manager -from neutron.openstack.common.gettextutils import _LI, _LW +from neutron.openstack.common.gettextutils import _LE, _LI, _LW from neutron.openstack.common import log as logging from neutron.openstack.common import loopingcall from neutron.openstack.common import timeutils @@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, RouterL3AgentBinding.router_id). filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(), l3_attrs_db.RouterExtraAttributes.ha == sql.null()))) - - for binding in down_bindings: - LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s " - "because the agent did not report to the server in " - "the last %(dead_time)s seconds."), - {'router': binding.router_id, - 'agent': binding.l3_agent_id, - 'dead_time': agent_dead_limit}) - self.reschedule_router(context, binding.router_id) + try: + for binding in down_bindings: + LOG.warn(_LW( + "Rescheduling router %(router)s from agent %(agent)s " + "because the agent did not report to the server in " + "the last %(dead_time)s seconds."), + {'router': binding.router_id, + 'agent': binding.l3_agent_id, + 'dead_time': agent_dead_limit}) + try: + self.reschedule_router(context, binding.router_id) + except (l3agentscheduler.RouterReschedulingFailed, + n_rpc.RemoteError): + # Catch individual router rescheduling errors here + # so one broken one doesn't stop the iteration. + LOG.exception(_LE("Failed to reschedule router %s"), + binding.router_id) + except db_exc.DBError: + # Catch DB errors here so a transient DB connectivity issue + # doesn't stop the loopingcall. + LOG.exception(_LE("Exception encountered during router " + "rescheduling.")) def validate_agent_router_combination(self, context, agent, router): """Validate if the router can be correctly assigned to the agent. diff --git a/neutron/tests/unit/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/openvswitch/test_agent_scheduler.py index f73866f4e6..0fdf9fdbe1 100644 --- a/neutron/tests/unit/openvswitch/test_agent_scheduler.py +++ b/neutron/tests/unit/openvswitch/test_agent_scheduler.py @@ -19,6 +19,7 @@ import datetime import mock from oslo.config import cfg +from oslo.db import exception as db_exc from webob import exc from neutron.api import extensions @@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc from neutron.api.rpc.handlers import l3_rpc from neutron.api.v2 import attributes from neutron.common import constants +from neutron.common import rpc as n_rpc from neutron import context from neutron.db import agents_db from neutron.db import l3_agentschedulers_db @@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase): agt_db.admin_state_up = state self.adminContext.session.commit() + def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self): + with self.router(): + l3_rpc_cb = l3_rpc.L3RpcCallback() + self._register_agent_states() + # schedule the router to host A + l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA) + + plugin = manager.NeutronManager.get_service_plugins().get( + service_constants.L3_ROUTER_NAT) + mock.patch.object( + plugin, 'reschedule_router', + side_effect=[ + db_exc.DBError(), n_rpc.RemoteError(), + l3agentscheduler.RouterReschedulingFailed(router_id='f', + agent_id='f'), + ValueError('this raises') + ]).start() + # these first three should not raise any errors + self._take_down_agent_and_run_reschedule(L3_HOSTA) # DBError + self._take_down_agent_and_run_reschedule(L3_HOSTA) # RemoteError + self._take_down_agent_and_run_reschedule(L3_HOSTA) # schedule err + + # ValueError is not caught so it should raise + self.assertRaises(ValueError, + self._take_down_agent_and_run_reschedule, + L3_HOSTA) + + def test_router_rescheduler_iterates_after_reschedule_failure(self): + plugin = manager.NeutronManager.get_service_plugins().get( + service_constants.L3_ROUTER_NAT) + l3_rpc_cb = l3_rpc.L3RpcCallback() + self._register_agent_states() + with contextlib.nested(self.router(), self.router()) as (r1, r2): + # schedule the routers to host A + l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA) + + rs_mock = mock.patch.object( + plugin, 'reschedule_router', + side_effect=l3agentscheduler.RouterReschedulingFailed( + router_id='f', agent_id='f'), + ).start() + self._take_down_agent_and_run_reschedule(L3_HOSTA) + # make sure both had a reschedule attempt even though first failed + rs_mock.assert_has_calls([mock.call(mock.ANY, r1['router']['id']), + mock.call(mock.ANY, r2['router']['id'])], + any_order=True) + def test_router_is_not_rescheduled_from_alive_agent(self): with self.router(): l3_rpc_cb = l3_rpc.L3RpcCallback()