Record and log reason for dhcp agent resync

A dhcp resync can be triggered at a number of points, but the actual
resync is done asynchronously by a helper thread.  This means by the
time the resync happens, it's hard to establish what actually caused
it.

There have been a number of problems in production systems that cause
excessive resyncs.  One is a ipv6/dnsmasq issue (rhbz#1077487) and
another is db corruption with duplicate entries [1].  The resync
triggers a whole lot of logs itself, so it becomes very unclear how to
establish any causality.

This change keeps track of what triggered the resync with some helpful
information.  We add a schedule_resync() function to to replace a
explicit set of needs_resync which tracks why it was called.

[1] The logs will contain output like "DBDuplicateEntry
(IntegrityError) (1062, "Duplicate entry
'6d799c6a-7a09-4c1e-bb63-7d30fd052c8a-d3e3ac5b-9962-428a-a9f8-6b2' for
key 'PRIMARY'") ..." in this case

Closes-Bug: #1294445
Change-Id: I9b1c6202f5a6bbad8589a8b64b2a38c9d9edb43b
This commit is contained in:
Ian Wienand 2014-03-18 16:04:49 +11:00
parent 4e49095156
commit f199a7b1f0
2 changed files with 45 additions and 29 deletions

View File

@ -67,7 +67,7 @@ class DhcpAgent(manager.Manager):
def __init__(self, host=None):
super(DhcpAgent, self).__init__(host=host)
self.needs_resync = False
self.needs_resync_reasons = []
self.conf = cfg.CONF
self.cache = NetworkCache()
self.root_helper = config.get_root_helper(self.conf)
@ -135,7 +135,7 @@ class DhcpAgent(manager.Manager):
'that the network and/or its subnet(s) still exist.')
% {'net_id': network.id, 'action': action})
except Exception as e:
self.needs_resync = True
self.schedule_resync(e)
if (isinstance(e, common.RemoteError)
and e.exc_type == 'NetworkNotFound'
or isinstance(e, exceptions.NetworkNotFound)):
@ -144,6 +144,10 @@ class DhcpAgent(manager.Manager):
LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
% {'net_id': network.id, 'action': action})
def schedule_resync(self, reason):
"""Schedule a resync for a given reason."""
self.needs_resync_reasons.append(reason)
@utils.synchronized('dhcp-agent')
def sync_state(self):
"""Sync the local DHCP state with Neutron."""
@ -157,8 +161,8 @@ class DhcpAgent(manager.Manager):
for deleted_id in known_network_ids - active_network_ids:
try:
self.disable_dhcp_helper(deleted_id)
except Exception:
self.needs_resync = True
except Exception as e:
self.schedule_resync(e)
LOG.exception(_('Unable to sync network state on deleted '
'network %s'), deleted_id)
@ -167,16 +171,22 @@ class DhcpAgent(manager.Manager):
pool.waitall()
LOG.info(_('Synchronizing state complete'))
except Exception:
self.needs_resync = True
except Exception as e:
self.schedule_resync(e)
LOG.exception(_('Unable to sync network state.'))
def _periodic_resync_helper(self):
"""Resync the dhcp state at the configured interval."""
while True:
eventlet.sleep(self.conf.resync_interval)
if self.needs_resync:
self.needs_resync = False
if self.needs_resync_reasons:
# be careful to avoid a race with additions to list
# from other threads
reasons = self.needs_resync_reasons
self.needs_resync_reasons = []
for r in reasons:
LOG.debug(_("resync: %(reason)s"),
{"reason": r})
self.sync_state()
def periodic_resync(self):
@ -189,8 +199,8 @@ class DhcpAgent(manager.Manager):
if not network:
LOG.warn(_('Network %s has been deleted.'), network_id)
return network
except Exception:
self.needs_resync = True
except Exception as e:
self.schedule_resync(e)
LOG.exception(_('Network %s info call failed.'), network_id)
def enable_dhcp_helper(self, network_id):
@ -579,7 +589,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
def agent_updated(self, context, payload):
"""Handle the agent_updated notification event."""
self.needs_resync = True
self.schedule_resync(_("Agent updated: %(payload)s") %
{"payload": payload})
LOG.info(_("agent_updated by server side %s!"), payload)
def after_start(self):

View File

@ -211,14 +211,16 @@ class TestDhcpAgent(base.BaseTestCase):
self.driver.return_value.foo.side_effect = exc or Exception
with mock.patch.object(dhcp_agent.LOG, trace_level) as log:
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
self.assertIsNone(dhcp.call_driver('foo', network))
self.driver.assert_called_once_with(cfg.CONF,
mock.ANY,
'sudo',
mock.ANY,
mock.ANY)
self.assertEqual(log.call_count, 1)
self.assertEqual(expected_sync, dhcp.needs_resync)
with mock.patch.object(dhcp,
'schedule_resync') as schedule_resync:
self.assertIsNone(dhcp.call_driver('foo', network))
self.driver.assert_called_once_with(cfg.CONF,
mock.ANY,
'sudo',
mock.ANY,
mock.ANY)
self.assertEqual(log.call_count, 1)
self.assertEqual(expected_sync, schedule_resync.called)
def test_call_driver_failure(self):
self._test_call_driver_failure()
@ -298,10 +300,12 @@ class TestDhcpAgent(base.BaseTestCase):
with mock.patch.object(dhcp_agent.LOG, 'exception') as log:
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
dhcp.sync_state()
with mock.patch.object(dhcp,
'schedule_resync') as schedule_resync:
dhcp.sync_state()
self.assertTrue(log.called)
self.assertTrue(dhcp.needs_resync)
self.assertTrue(log.called)
self.assertTrue(schedule_resync.called)
def test_periodic_resync(self):
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
@ -312,14 +316,14 @@ class TestDhcpAgent(base.BaseTestCase):
def test_periodoc_resync_helper(self):
with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
dhcp.needs_resync = True
dhcp.needs_resync_reasons = ['reason1', 'reason2']
with mock.patch.object(dhcp, 'sync_state') as sync_state:
sync_state.side_effect = RuntimeError
with testtools.ExpectedException(RuntimeError):
dhcp._periodic_resync_helper()
sync_state.assert_called_once_with()
sleep.assert_called_once_with(dhcp.conf.resync_interval)
self.assertFalse(dhcp.needs_resync)
self.assertEqual(len(dhcp.needs_resync_reasons), 0)
def test_populate_cache_on_start_without_active_networks_support(self):
# emul dhcp driver that doesn't support retrieving of active networks
@ -468,14 +472,15 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.'
'DhcpAgent._populate_networks_cache')
self.mock_init = self.mock_init_p.start()
with mock.patch.object(dhcp.Dnsmasq,
'check_version') as check_v:
check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION
self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver')
self.call_driver = self.call_driver_p.start()
self.schedule_resync_p = mock.patch.object(self.dhcp,
'schedule_resync')
self.schedule_resync = self.schedule_resync_p.start()
self.external_process_p = mock.patch(
'neutron.agent.linux.external_process.ProcessManager'
)
@ -525,7 +530,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
[mock.call.get_network_info('fake_id')])
self.assertFalse(self.call_driver.called)
self.assertTrue(log.called)
self.assertFalse(self.dhcp.needs_resync)
self.assertFalse(self.dhcp.schedule_resync.called)
def test_enable_dhcp_helper_exception_during_rpc(self):
self.plugin.get_network_info.side_effect = Exception
@ -535,7 +540,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
[mock.call.get_network_info(fake_network.id)])
self.assertFalse(self.call_driver.called)
self.assertTrue(log.called)
self.assertTrue(self.dhcp.needs_resync)
self.assertTrue(self.schedule_resync.called)
self.assertFalse(self.cache.called)
self.assertFalse(self.external_process.called)
@ -723,7 +728,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
self.cache.assert_has_calls(
[mock.call.get_network_by_id('net-id')])
self.assertTrue(log.called)
self.assertTrue(self.dhcp.needs_resync)
self.assertTrue(self.dhcp.schedule_resync.called)
def test_subnet_update_end(self):
payload = dict(subnet=dict(network_id=fake_network.id))