Merge "Catch exceptions in router rescheduler"
This commit is contained in:
commit
712b9c3160
@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload
|
||||
from sqlalchemy import sql
|
||||
|
||||
from neutron.common import constants
|
||||
from neutron.common import rpc as n_rpc
|
||||
from neutron.common import utils as n_utils
|
||||
from neutron import context as n_ctx
|
||||
from neutron.db import agents_db
|
||||
@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db
|
||||
from neutron.db import model_base
|
||||
from neutron.extensions import l3agentscheduler
|
||||
from neutron import manager
|
||||
from neutron.openstack.common.gettextutils import _LI, _LW
|
||||
from neutron.openstack.common.gettextutils import _LE, _LI, _LW
|
||||
from neutron.openstack.common import log as logging
|
||||
from neutron.openstack.common import loopingcall
|
||||
from neutron.openstack.common import timeutils
|
||||
@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
|
||||
RouterL3AgentBinding.router_id).
|
||||
filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
|
||||
l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
|
||||
|
||||
try:
|
||||
for binding in down_bindings:
|
||||
LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
|
||||
LOG.warn(_LW(
|
||||
"Rescheduling router %(router)s from agent %(agent)s "
|
||||
"because the agent did not report to the server in "
|
||||
"the last %(dead_time)s seconds."),
|
||||
{'router': binding.router_id,
|
||||
'agent': binding.l3_agent_id,
|
||||
'dead_time': agent_dead_limit})
|
||||
try:
|
||||
self.reschedule_router(context, binding.router_id)
|
||||
except (l3agentscheduler.RouterReschedulingFailed,
|
||||
n_rpc.RemoteError):
|
||||
# Catch individual router rescheduling errors here
|
||||
# so one broken one doesn't stop the iteration.
|
||||
LOG.exception(_LE("Failed to reschedule router %s"),
|
||||
binding.router_id)
|
||||
except db_exc.DBError:
|
||||
# Catch DB errors here so a transient DB connectivity issue
|
||||
# doesn't stop the loopingcall.
|
||||
LOG.exception(_LE("Exception encountered during router "
|
||||
"rescheduling."))
|
||||
|
||||
def validate_agent_router_combination(self, context, agent, router):
|
||||
"""Validate if the router can be correctly assigned to the agent.
|
||||
|
@ -19,6 +19,7 @@ import datetime
|
||||
|
||||
import mock
|
||||
from oslo.config import cfg
|
||||
from oslo.db import exception as db_exc
|
||||
from webob import exc
|
||||
|
||||
from neutron.api import extensions
|
||||
@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc
|
||||
from neutron.api.rpc.handlers import l3_rpc
|
||||
from neutron.api.v2 import attributes
|
||||
from neutron.common import constants
|
||||
from neutron.common import rpc as n_rpc
|
||||
from neutron import context
|
||||
from neutron.db import agents_db
|
||||
from neutron.db import l3_agentschedulers_db
|
||||
@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
|
||||
agt_db.admin_state_up = state
|
||||
self.adminContext.session.commit()
|
||||
|
||||
def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self):
|
||||
with self.router():
|
||||
l3_rpc_cb = l3_rpc.L3RpcCallback()
|
||||
self._register_agent_states()
|
||||
# schedule the router to host A
|
||||
l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
|
||||
|
||||
plugin = manager.NeutronManager.get_service_plugins().get(
|
||||
service_constants.L3_ROUTER_NAT)
|
||||
mock.patch.object(
|
||||
plugin, 'reschedule_router',
|
||||
side_effect=[
|
||||
db_exc.DBError(), n_rpc.RemoteError(),
|
||||
l3agentscheduler.RouterReschedulingFailed(router_id='f',
|
||||
agent_id='f'),
|
||||
ValueError('this raises')
|
||||
]).start()
|
||||
# these first three should not raise any errors
|
||||
self._take_down_agent_and_run_reschedule(L3_HOSTA) # DBError
|
||||
self._take_down_agent_and_run_reschedule(L3_HOSTA) # RemoteError
|
||||
self._take_down_agent_and_run_reschedule(L3_HOSTA) # schedule err
|
||||
|
||||
# ValueError is not caught so it should raise
|
||||
self.assertRaises(ValueError,
|
||||
self._take_down_agent_and_run_reschedule,
|
||||
L3_HOSTA)
|
||||
|
||||
def test_router_rescheduler_iterates_after_reschedule_failure(self):
|
||||
plugin = manager.NeutronManager.get_service_plugins().get(
|
||||
service_constants.L3_ROUTER_NAT)
|
||||
l3_rpc_cb = l3_rpc.L3RpcCallback()
|
||||
self._register_agent_states()
|
||||
with contextlib.nested(self.router(), self.router()) as (r1, r2):
|
||||
# schedule the routers to host A
|
||||
l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
|
||||
|
||||
rs_mock = mock.patch.object(
|
||||
plugin, 'reschedule_router',
|
||||
side_effect=l3agentscheduler.RouterReschedulingFailed(
|
||||
router_id='f', agent_id='f'),
|
||||
).start()
|
||||
self._take_down_agent_and_run_reschedule(L3_HOSTA)
|
||||
# make sure both had a reschedule attempt even though first failed
|
||||
rs_mock.assert_has_calls([mock.call(mock.ANY, r1['router']['id']),
|
||||
mock.call(mock.ANY, r2['router']['id'])],
|
||||
any_order=True)
|
||||
|
||||
def test_router_is_not_rescheduled_from_alive_agent(self):
|
||||
with self.router():
|
||||
l3_rpc_cb = l3_rpc.L3RpcCallback()
|
||||
|
Loading…
x
Reference in New Issue
Block a user