From 6a6322957a31e8e0f8678474bbd5a5d1d5c774f9 Mon Sep 17 00:00:00 2001 From: Li Ma Date: Fri, 21 Feb 2014 00:57:25 -0800 Subject: [PATCH] Race condition of L3-agent to add/remove routers This race condition happens when repeatedly calling l3-agent-router-add and l3-agent-router-remove by different neutron-servers at the same time. The primary key constraint is added for the pair of (router_id and l3_agent_id). During migration, verification is done if the current records violate the PK constraint defined in this bug fix, and sanitize the data before schema modification. Due to different dialects of database engines, different sql statements are executed correspondingly to do the verification. Change-Id: Ia541e023b757b2e77c4eec9bb1670632c7a271fa Closes-Bug: #1230323 --- neutron/db/l3_agentschedulers_db.py | 10 +- ...1d7f831a591_add_constraint_for_routerid.py | 127 ++++++++++++++++++ .../alembic_migrations/versions/HEAD | 2 +- neutron/scheduler/l3_agent_scheduler.py | 26 ++-- neutron/tests/unit/test_l3_schedulers.py | 35 +++++ 5 files changed, 185 insertions(+), 15 deletions(-) create mode 100644 neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index e9a63d28d4..4c64b7b867 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -24,7 +24,6 @@ from neutron.common import constants from neutron.db import agents_db from neutron.db import agentschedulers_db from neutron.db import model_base -from neutron.db import models_v2 from neutron.extensions import l3agentscheduler @@ -40,15 +39,16 @@ L3_AGENTS_SCHEDULER_OPTS = [ cfg.CONF.register_opts(L3_AGENTS_SCHEDULER_OPTS) -class RouterL3AgentBinding(model_base.BASEV2, models_v2.HasId): +class RouterL3AgentBinding(model_base.BASEV2): """Represents binding between neutron routers and L3 agents.""" router_id = sa.Column(sa.String(36), - sa.ForeignKey("routers.id", ondelete='CASCADE')) + sa.ForeignKey("routers.id", ondelete='CASCADE'), + primary_key=True) l3_agent = orm.relation(agents_db.Agent) l3_agent_id = sa.Column(sa.String(36), - sa.ForeignKey("agents.id", - ondelete='CASCADE')) + sa.ForeignKey("agents.id", ondelete='CASCADE'), + primary_key=True) class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, diff --git a/neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py b/neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py new file mode 100644 index 0000000000..8d79d38e95 --- /dev/null +++ b/neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py @@ -0,0 +1,127 @@ +# Copyright 2014 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +"""add constraint for routerid + +Revision ID: 31d7f831a591 +Revises: 37f322991f59 +Create Date: 2014-02-26 06:47:16.494393 + +""" + +# revision identifiers, used by Alembic. +revision = '31d7f831a591' +down_revision = '37f322991f59' + +from alembic import op +import sqlalchemy as sa + +TABLE_NAME = 'routerl3agentbindings' +PK_NAME = 'pk_routerl3agentbindings' + +fk_names = {'postgresql': + {'router_id': + 'routerl3agentbindings_router_id_fkey', + 'l3_agent_id': + 'routerl3agentbindings_l3_agent_id_fkey'}, + 'mysql': + {'router_id': + 'routerl3agentbindings_ibfk_2', + 'l3_agent_id': + 'routerl3agentbindings_ibfk_1'}} + + +def upgrade(active_plugins=None, options=None): + # In order to sanitize the data during migration, + # the current records in the table need to be verified + # and all the duplicate records which violate the PK + # constraint need to be removed. + context = op.get_context() + if context.bind.dialect.name == 'postgresql': + op.execute('DELETE FROM %(table)s WHERE id in (' + 'SELECT %(table)s.id FROM %(table)s LEFT OUTER JOIN ' + '(SELECT MIN(id) as id, router_id, l3_agent_id ' + ' FROM %(table)s GROUP BY router_id, l3_agent_id) AS temp ' + 'ON %(table)s.id = temp.id WHERE temp.id is NULL);' + % {'table': TABLE_NAME}) + else: + op.execute('DELETE %(table)s FROM %(table)s LEFT OUTER JOIN ' + '(SELECT MIN(id) as id, router_id, l3_agent_id ' + ' FROM %(table)s GROUP BY router_id, l3_agent_id) AS temp ' + 'ON %(table)s.id = temp.id WHERE temp.id is NULL;' + % {'table': TABLE_NAME}) + + op.drop_column(TABLE_NAME, 'id') + + op.create_primary_key( + name=PK_NAME, + table_name=TABLE_NAME, + cols=['router_id', 'l3_agent_id'] + ) + + +def downgrade(active_plugins=None, options=None): + + context = op.get_context() + dialect = context.bind.dialect.name + + # Drop the existed foreign key constraints + # In order to perform primary key changes + op.drop_constraint( + name=fk_names[dialect]['l3_agent_id'], + table_name=TABLE_NAME, + type_='foreignkey' + ) + op.drop_constraint( + name=fk_names[dialect]['router_id'], + table_name=TABLE_NAME, + type_='foreignkey' + ) + + op.drop_constraint( + name=PK_NAME, + table_name=TABLE_NAME, + type_='primary' + ) + + op.add_column( + TABLE_NAME, + sa.Column('id', sa.String(32)) + ) + + # Restore the foreign key constraints + op.create_foreign_key( + name=fk_names[dialect]['router_id'], + source=TABLE_NAME, + referent='routers', + local_cols=['router_id'], + remote_cols=['id'], + ondelete='CASCADE' + ) + + op.create_foreign_key( + name=fk_names[dialect]['l3_agent_id'], + source=TABLE_NAME, + referent='agents', + local_cols=['l3_agent_id'], + remote_cols=['id'], + ondelete='CASCADE' + ) + + op.create_primary_key( + name=PK_NAME, + table_name=TABLE_NAME, + cols=['id'] + ) diff --git a/neutron/db/migration/alembic_migrations/versions/HEAD b/neutron/db/migration/alembic_migrations/versions/HEAD index 4bd668eab4..2478aa4333 100644 --- a/neutron/db/migration/alembic_migrations/versions/HEAD +++ b/neutron/db/migration/alembic_migrations/versions/HEAD @@ -1 +1 @@ -37f322991f59 \ No newline at end of file +31d7f831a591 diff --git a/neutron/scheduler/l3_agent_scheduler.py b/neutron/scheduler/l3_agent_scheduler.py index 8c0beca08b..d6b4dc7d4d 100644 --- a/neutron/scheduler/l3_agent_scheduler.py +++ b/neutron/scheduler/l3_agent_scheduler.py @@ -16,6 +16,7 @@ import abc import random +from oslo.db import exception as db_exc import six from sqlalchemy.orm import exc from sqlalchemy import sql @@ -145,15 +146,22 @@ class L3Scheduler(object): def bind_router(self, context, router_id, chosen_agent): """Bind the router to the l3 agent which has been chosen.""" - with context.session.begin(subtransactions=True): - binding = l3_agentschedulers_db.RouterL3AgentBinding() - binding.l3_agent = chosen_agent - binding.router_id = router_id - context.session.add(binding) - LOG.debug(_('Router %(router_id)s is scheduled to ' - 'L3 agent %(agent_id)s'), - {'router_id': router_id, - 'agent_id': chosen_agent.id}) + try: + with context.session.begin(subtransactions=True): + binding = l3_agentschedulers_db.RouterL3AgentBinding() + binding.l3_agent = chosen_agent + binding.router_id = router_id + context.session.add(binding) + except db_exc.DBDuplicateEntry: + LOG.debug('Router %(router_id)s has already been scheduled ' + 'to L3 agent %(agent_id)s.', + {'agent_id': chosen_agent.id, + 'router_id': router_id}) + return + + LOG.debug('Router %(router_id)s is scheduled to L3 agent ' + '%(agent_id)s', {'router_id': router_id, + 'agent_id': chosen_agent.id}) class ChanceScheduler(L3Scheduler): diff --git a/neutron/tests/unit/test_l3_schedulers.py b/neutron/tests/unit/test_l3_schedulers.py index 99bf8d83ea..5e619db8d8 100644 --- a/neutron/tests/unit/test_l3_schedulers.py +++ b/neutron/tests/unit/test_l3_schedulers.py @@ -31,6 +31,7 @@ from neutron.db import l3_agentschedulers_db from neutron.extensions import l3 as ext_l3 from neutron import manager from neutron.openstack.common import timeutils +from neutron.scheduler import l3_agent_scheduler from neutron.tests.unit import test_db_plugin from neutron.tests.unit import test_l3_plugin @@ -93,6 +94,7 @@ class L3SchedulerTestCase(l3_agentschedulers_db.L3AgentSchedulerDbMixin, agent_db = self.plugin.get_agents_db(self.adminContext, filters={'host': [HOST]}) self.agent_id1 = agent_db[0].id + self.agent1 = agent_db[0] callback.report_state(self.adminContext, agent_state={'agent_state': SECOND_L3_AGENT}, @@ -124,6 +126,39 @@ class L3SchedulerTestCase(l3_agentschedulers_db.L3AgentSchedulerDbMixin, router['router']['id'], subnet['subnet']['network_id']) self._delete('routers', router['router']['id']) + def _test_schedule_bind_router(self, agent, router): + ctx = self.adminContext + session = ctx.session + db = l3_agentschedulers_db.RouterL3AgentBinding + scheduler = l3_agent_scheduler.ChanceScheduler() + + rid = router['router']['id'] + scheduler.bind_router(ctx, rid, agent) + results = (session.query(db).filter_by(router_id=rid).all()) + self.assertTrue(len(results) > 0) + self.assertIn(agent.id, [bind.l3_agent_id for bind in results]) + + def test_bind_new_router(self): + router = self._make_router(self.fmt, + tenant_id=str(uuid.uuid4()), + name='r1') + with mock.patch.object(l3_agent_scheduler.LOG, 'debug') as flog: + self._test_schedule_bind_router(self.agent1, router) + self.assertEqual(1, flog.call_count) + args, kwargs = flog.call_args + self.assertIn('is scheduled', args[0]) + + def test_bind_existing_router(self): + router = self._make_router(self.fmt, + tenant_id=str(uuid.uuid4()), + name='r2') + self._test_schedule_bind_router(self.agent1, router) + with mock.patch.object(l3_agent_scheduler.LOG, 'debug') as flog: + self._test_schedule_bind_router(self.agent1, router) + self.assertEqual(1, flog.call_count) + args, kwargs = flog.call_args + self.assertIn('has already been scheduled', args[0]) + class L3AgentChanceSchedulerTestCase(L3SchedulerTestCase):