Merge "Catch exceptions in router rescheduler"

This commit is contained in:
Jenkins 2014-10-14 16:11:36 +00:00 committed by Gerrit Code Review
commit 712b9c3160
2 changed files with 73 additions and 10 deletions

View File

@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload
from sqlalchemy import sql
from neutron.common import constants
from neutron.common import rpc as n_rpc
from neutron.common import utils as n_utils
from neutron import context as n_ctx
from neutron.db import agents_db
@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db
from neutron.db import model_base
from neutron.extensions import l3agentscheduler
from neutron import manager
from neutron.openstack.common.gettextutils import _LI, _LW
from neutron.openstack.common.gettextutils import _LE, _LI, _LW
from neutron.openstack.common import log as logging
from neutron.openstack.common import loopingcall
from neutron.openstack.common import timeutils
@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
RouterL3AgentBinding.router_id).
filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
for binding in down_bindings:
LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
"because the agent did not report to the server in "
"the last %(dead_time)s seconds."),
{'router': binding.router_id,
'agent': binding.l3_agent_id,
'dead_time': agent_dead_limit})
self.reschedule_router(context, binding.router_id)
try:
for binding in down_bindings:
LOG.warn(_LW(
"Rescheduling router %(router)s from agent %(agent)s "
"because the agent did not report to the server in "
"the last %(dead_time)s seconds."),
{'router': binding.router_id,
'agent': binding.l3_agent_id,
'dead_time': agent_dead_limit})
try:
self.reschedule_router(context, binding.router_id)
except (l3agentscheduler.RouterReschedulingFailed,
n_rpc.RemoteError):
# Catch individual router rescheduling errors here
# so one broken one doesn't stop the iteration.
LOG.exception(_LE("Failed to reschedule router %s"),
binding.router_id)
except db_exc.DBError:
# Catch DB errors here so a transient DB connectivity issue
# doesn't stop the loopingcall.
LOG.exception(_LE("Exception encountered during router "
"rescheduling."))
def validate_agent_router_combination(self, context, agent, router):
"""Validate if the router can be correctly assigned to the agent.

View File

@ -19,6 +19,7 @@ import datetime
import mock
from oslo.config import cfg
from oslo.db import exception as db_exc
from webob import exc
from neutron.api import extensions
@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc
from neutron.api.rpc.handlers import l3_rpc
from neutron.api.v2 import attributes
from neutron.common import constants
from neutron.common import rpc as n_rpc
from neutron import context
from neutron.db import agents_db
from neutron.db import l3_agentschedulers_db
@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
agt_db.admin_state_up = state
self.adminContext.session.commit()
def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self):
    # reschedule_router is patched to raise a different exception on each
    # successive call: DBError, RemoteError, RouterReschedulingFailed and
    # finally ValueError. The first three rescheduling passes are expected
    # to finish without raising; the ValueError has to reach the caller.
    with self.router():
        rpc_callback = l3_rpc.L3RpcCallback()
        self._register_agent_states()
        # syncing routers for host A schedules the router onto it
        rpc_callback.sync_routers(self.adminContext, host=L3_HOSTA)

        service_plugins = manager.NeutronManager.get_service_plugins()
        l3_plugin = service_plugins.get(service_constants.L3_ROUTER_NAT)
        # one failure per rescheduling pass, consumed in list order
        failures = [
            db_exc.DBError(),
            n_rpc.RemoteError(),
            l3agentscheduler.RouterReschedulingFailed(router_id='f',
                                                      agent_id='f'),
            ValueError('this raises'),
        ]
        mock.patch.object(l3_plugin, 'reschedule_router',
                          side_effect=failures).start()

        # passes 1-3 hit DBError, RemoteError and the scheduling error in
        # turn; none of them may escape the rescheduler
        for _ in range(3):
            self._take_down_agent_and_run_reschedule(L3_HOSTA)

        # pass 4 hits ValueError, which is not handled and must propagate
        self.assertRaises(ValueError,
                          self._take_down_agent_and_run_reschedule,
                          L3_HOSTA)
def test_router_rescheduler_iterates_after_reschedule_failure(self):
    # With reschedule_router failing on every call, the rescheduler is
    # still expected to attempt each router bound to the downed agent
    # instead of stopping at the first failure.
    service_plugins = manager.NeutronManager.get_service_plugins()
    l3_plugin = service_plugins.get(service_constants.L3_ROUTER_NAT)
    rpc_callback = l3_rpc.L3RpcCallback()
    self._register_agent_states()
    with contextlib.nested(self.router(), self.router()) as routers:
        # syncing routers for host A schedules both routers onto it
        rpc_callback.sync_routers(self.adminContext, host=L3_HOSTA)

        always_fail = l3agentscheduler.RouterReschedulingFailed(
            router_id='f', agent_id='f')
        reschedule_mock = mock.patch.object(
            l3_plugin, 'reschedule_router',
            side_effect=always_fail).start()
        self._take_down_agent_and_run_reschedule(L3_HOSTA)

        # every router must have had a reschedule attempt, in any order,
        # even though the first attempt already failed
        expected_calls = [mock.call(mock.ANY, router['router']['id'])
                          for router in routers]
        reschedule_mock.assert_has_calls(expected_calls, any_order=True)
def test_router_is_not_rescheduled_from_alive_agent(self):
with self.router():
l3_rpc_cb = l3_rpc.L3RpcCallback()