Catch exceptions in router rescheduler

Catch and log exceptions in router rescheduling loop
rather than just dying which would stop all future
router rescheduling attempts.

This prevents transient DB connectivity issues from
permanently breaking the rescheduler until the process
restarts.

Closes-Bug: #1375597
Change-Id: I2ab37847074fa6bbdd2b13fd03b8742996dcfc78
This commit is contained in:
Kevin Benton 2014-09-29 19:33:06 -07:00
parent bb37895c2f
commit 2c0a32e756
2 changed files with 73 additions and 10 deletions

View File

@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload
from sqlalchemy import sql
from neutron.common import constants
from neutron.common import rpc as n_rpc
from neutron.common import utils as n_utils
from neutron import context as n_ctx
from neutron.db import agents_db
@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db
from neutron.db import model_base
from neutron.extensions import l3agentscheduler
from neutron import manager
from neutron.openstack.common.gettextutils import _LI, _LW
from neutron.openstack.common.gettextutils import _LE, _LI, _LW
from neutron.openstack.common import log as logging
from neutron.openstack.common import loopingcall
from neutron.openstack.common import timeutils
@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
RouterL3AgentBinding.router_id).
filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
for binding in down_bindings:
LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
"because the agent did not report to the server in "
"the last %(dead_time)s seconds."),
{'router': binding.router_id,
'agent': binding.l3_agent_id,
'dead_time': agent_dead_limit})
self.reschedule_router(context, binding.router_id)
try:
for binding in down_bindings:
LOG.warn(_LW(
"Rescheduling router %(router)s from agent %(agent)s "
"because the agent did not report to the server in "
"the last %(dead_time)s seconds."),
{'router': binding.router_id,
'agent': binding.l3_agent_id,
'dead_time': agent_dead_limit})
try:
self.reschedule_router(context, binding.router_id)
except (l3agentscheduler.RouterReschedulingFailed,
n_rpc.RemoteError):
# Catch individual router rescheduling errors here
# so one broken one doesn't stop the iteration.
LOG.exception(_LE("Failed to reschedule router %s"),
binding.router_id)
except db_exc.DBError:
# Catch DB errors here so a transient DB connectivity issue
# doesn't stop the loopingcall.
LOG.exception(_LE("Exception encountered during router "
"rescheduling."))
def validate_agent_router_combination(self, context, agent, router):
"""Validate if the router can be correctly assigned to the agent.

View File

@ -19,6 +19,7 @@ import datetime
import mock
from oslo.config import cfg
from oslo.db import exception as db_exc
from webob import exc
from neutron.api import extensions
@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc
from neutron.api.rpc.handlers import l3_rpc
from neutron.api.v2 import attributes
from neutron.common import constants
from neutron.common import rpc as n_rpc
from neutron import context
from neutron.db import agents_db
from neutron.db import l3_agentschedulers_db
@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
agt_db.admin_state_up = state
self.adminContext.session.commit()
def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self):
with self.router():
l3_rpc_cb = l3_rpc.L3RpcCallback()
self._register_agent_states()
# schedule the router to host A
l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
plugin = manager.NeutronManager.get_service_plugins().get(
service_constants.L3_ROUTER_NAT)
mock.patch.object(
plugin, 'reschedule_router',
side_effect=[
db_exc.DBError(), n_rpc.RemoteError(),
l3agentscheduler.RouterReschedulingFailed(router_id='f',
agent_id='f'),
ValueError('this raises')
]).start()
# these first three should not raise any errors
self._take_down_agent_and_run_reschedule(L3_HOSTA) # DBError
self._take_down_agent_and_run_reschedule(L3_HOSTA) # RemoteError
self._take_down_agent_and_run_reschedule(L3_HOSTA) # schedule err
# ValueError is not caught so it should raise
self.assertRaises(ValueError,
self._take_down_agent_and_run_reschedule,
L3_HOSTA)
def test_router_rescheduler_iterates_after_reschedule_failure(self):
plugin = manager.NeutronManager.get_service_plugins().get(
service_constants.L3_ROUTER_NAT)
l3_rpc_cb = l3_rpc.L3RpcCallback()
self._register_agent_states()
with contextlib.nested(self.router(), self.router()) as (r1, r2):
# schedule the routers to host A
l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
rs_mock = mock.patch.object(
plugin, 'reschedule_router',
side_effect=l3agentscheduler.RouterReschedulingFailed(
router_id='f', agent_id='f'),
).start()
self._take_down_agent_and_run_reschedule(L3_HOSTA)
# make sure both had a reschedule attempt even though first failed
rs_mock.assert_has_calls([mock.call(mock.ANY, r1['router']['id']),
mock.call(mock.ANY, r2['router']['id'])],
any_order=True)
def test_router_is_not_rescheduled_from_alive_agent(self):
with self.router():
l3_rpc_cb = l3_rpc.L3RpcCallback()