Record and log reason for dhcp agent resync
A dhcp resync can be triggered at a number of points, but the actual resync is done asynchronously by a helper thread. This means by the time the resync happens, it's hard to establish what actually caused it. There have been a number of problems in production systems that cause excessive resyncs. One is a ipv6/dnsmasq issue (rhbz#1077487) and another is db corruption with duplicate entries [1]. The resync triggers a whole lot of logs itself, so it becomes very unclear how to establish any causality. This change keeps track of what triggered the resync with some helpful information. We add a schedule_resync() function to to replace a explicit set of needs_resync which tracks why it was called. [1] The logs will contain output like "DBDuplicateEntry (IntegrityError) (1062, "Duplicate entry '6d799c6a-7a09-4c1e-bb63-7d30fd052c8a-d3e3ac5b-9962-428a-a9f8-6b2' for key 'PRIMARY'") ..." in this case Closes-Bug: #1294445 Change-Id: I9b1c6202f5a6bbad8589a8b64b2a38c9d9edb43b
This commit is contained in:
parent
4e49095156
commit
f199a7b1f0
@ -67,7 +67,7 @@ class DhcpAgent(manager.Manager):
|
||||
|
||||
def __init__(self, host=None):
|
||||
super(DhcpAgent, self).__init__(host=host)
|
||||
self.needs_resync = False
|
||||
self.needs_resync_reasons = []
|
||||
self.conf = cfg.CONF
|
||||
self.cache = NetworkCache()
|
||||
self.root_helper = config.get_root_helper(self.conf)
|
||||
@ -135,7 +135,7 @@ class DhcpAgent(manager.Manager):
|
||||
'that the network and/or its subnet(s) still exist.')
|
||||
% {'net_id': network.id, 'action': action})
|
||||
except Exception as e:
|
||||
self.needs_resync = True
|
||||
self.schedule_resync(e)
|
||||
if (isinstance(e, common.RemoteError)
|
||||
and e.exc_type == 'NetworkNotFound'
|
||||
or isinstance(e, exceptions.NetworkNotFound)):
|
||||
@ -144,6 +144,10 @@ class DhcpAgent(manager.Manager):
|
||||
LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
|
||||
% {'net_id': network.id, 'action': action})
|
||||
|
||||
def schedule_resync(self, reason):
|
||||
"""Schedule a resync for a given reason."""
|
||||
self.needs_resync_reasons.append(reason)
|
||||
|
||||
@utils.synchronized('dhcp-agent')
|
||||
def sync_state(self):
|
||||
"""Sync the local DHCP state with Neutron."""
|
||||
@ -157,8 +161,8 @@ class DhcpAgent(manager.Manager):
|
||||
for deleted_id in known_network_ids - active_network_ids:
|
||||
try:
|
||||
self.disable_dhcp_helper(deleted_id)
|
||||
except Exception:
|
||||
self.needs_resync = True
|
||||
except Exception as e:
|
||||
self.schedule_resync(e)
|
||||
LOG.exception(_('Unable to sync network state on deleted '
|
||||
'network %s'), deleted_id)
|
||||
|
||||
@ -167,16 +171,22 @@ class DhcpAgent(manager.Manager):
|
||||
pool.waitall()
|
||||
LOG.info(_('Synchronizing state complete'))
|
||||
|
||||
except Exception:
|
||||
self.needs_resync = True
|
||||
except Exception as e:
|
||||
self.schedule_resync(e)
|
||||
LOG.exception(_('Unable to sync network state.'))
|
||||
|
||||
def _periodic_resync_helper(self):
|
||||
"""Resync the dhcp state at the configured interval."""
|
||||
while True:
|
||||
eventlet.sleep(self.conf.resync_interval)
|
||||
if self.needs_resync:
|
||||
self.needs_resync = False
|
||||
if self.needs_resync_reasons:
|
||||
# be careful to avoid a race with additions to list
|
||||
# from other threads
|
||||
reasons = self.needs_resync_reasons
|
||||
self.needs_resync_reasons = []
|
||||
for r in reasons:
|
||||
LOG.debug(_("resync: %(reason)s"),
|
||||
{"reason": r})
|
||||
self.sync_state()
|
||||
|
||||
def periodic_resync(self):
|
||||
@ -189,8 +199,8 @@ class DhcpAgent(manager.Manager):
|
||||
if not network:
|
||||
LOG.warn(_('Network %s has been deleted.'), network_id)
|
||||
return network
|
||||
except Exception:
|
||||
self.needs_resync = True
|
||||
except Exception as e:
|
||||
self.schedule_resync(e)
|
||||
LOG.exception(_('Network %s info call failed.'), network_id)
|
||||
|
||||
def enable_dhcp_helper(self, network_id):
|
||||
@ -579,7 +589,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
|
||||
|
||||
def agent_updated(self, context, payload):
|
||||
"""Handle the agent_updated notification event."""
|
||||
self.needs_resync = True
|
||||
self.schedule_resync(_("Agent updated: %(payload)s") %
|
||||
{"payload": payload})
|
||||
LOG.info(_("agent_updated by server side %s!"), payload)
|
||||
|
||||
def after_start(self):
|
||||
|
@ -211,14 +211,16 @@ class TestDhcpAgent(base.BaseTestCase):
|
||||
self.driver.return_value.foo.side_effect = exc or Exception
|
||||
with mock.patch.object(dhcp_agent.LOG, trace_level) as log:
|
||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||
self.assertIsNone(dhcp.call_driver('foo', network))
|
||||
self.driver.assert_called_once_with(cfg.CONF,
|
||||
mock.ANY,
|
||||
'sudo',
|
||||
mock.ANY,
|
||||
mock.ANY)
|
||||
self.assertEqual(log.call_count, 1)
|
||||
self.assertEqual(expected_sync, dhcp.needs_resync)
|
||||
with mock.patch.object(dhcp,
|
||||
'schedule_resync') as schedule_resync:
|
||||
self.assertIsNone(dhcp.call_driver('foo', network))
|
||||
self.driver.assert_called_once_with(cfg.CONF,
|
||||
mock.ANY,
|
||||
'sudo',
|
||||
mock.ANY,
|
||||
mock.ANY)
|
||||
self.assertEqual(log.call_count, 1)
|
||||
self.assertEqual(expected_sync, schedule_resync.called)
|
||||
|
||||
def test_call_driver_failure(self):
|
||||
self._test_call_driver_failure()
|
||||
@ -298,10 +300,12 @@ class TestDhcpAgent(base.BaseTestCase):
|
||||
|
||||
with mock.patch.object(dhcp_agent.LOG, 'exception') as log:
|
||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||
dhcp.sync_state()
|
||||
with mock.patch.object(dhcp,
|
||||
'schedule_resync') as schedule_resync:
|
||||
dhcp.sync_state()
|
||||
|
||||
self.assertTrue(log.called)
|
||||
self.assertTrue(dhcp.needs_resync)
|
||||
self.assertTrue(log.called)
|
||||
self.assertTrue(schedule_resync.called)
|
||||
|
||||
def test_periodic_resync(self):
|
||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||
@ -312,14 +316,14 @@ class TestDhcpAgent(base.BaseTestCase):
|
||||
def test_periodoc_resync_helper(self):
|
||||
with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
|
||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||
dhcp.needs_resync = True
|
||||
dhcp.needs_resync_reasons = ['reason1', 'reason2']
|
||||
with mock.patch.object(dhcp, 'sync_state') as sync_state:
|
||||
sync_state.side_effect = RuntimeError
|
||||
with testtools.ExpectedException(RuntimeError):
|
||||
dhcp._periodic_resync_helper()
|
||||
sync_state.assert_called_once_with()
|
||||
sleep.assert_called_once_with(dhcp.conf.resync_interval)
|
||||
self.assertFalse(dhcp.needs_resync)
|
||||
self.assertEqual(len(dhcp.needs_resync_reasons), 0)
|
||||
|
||||
def test_populate_cache_on_start_without_active_networks_support(self):
|
||||
# emul dhcp driver that doesn't support retrieving of active networks
|
||||
@ -468,14 +472,15 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
||||
self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.'
|
||||
'DhcpAgent._populate_networks_cache')
|
||||
self.mock_init = self.mock_init_p.start()
|
||||
|
||||
with mock.patch.object(dhcp.Dnsmasq,
|
||||
'check_version') as check_v:
|
||||
check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION
|
||||
self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||
self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver')
|
||||
|
||||
self.call_driver = self.call_driver_p.start()
|
||||
self.schedule_resync_p = mock.patch.object(self.dhcp,
|
||||
'schedule_resync')
|
||||
self.schedule_resync = self.schedule_resync_p.start()
|
||||
self.external_process_p = mock.patch(
|
||||
'neutron.agent.linux.external_process.ProcessManager'
|
||||
)
|
||||
@ -525,7 +530,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
||||
[mock.call.get_network_info('fake_id')])
|
||||
self.assertFalse(self.call_driver.called)
|
||||
self.assertTrue(log.called)
|
||||
self.assertFalse(self.dhcp.needs_resync)
|
||||
self.assertFalse(self.dhcp.schedule_resync.called)
|
||||
|
||||
def test_enable_dhcp_helper_exception_during_rpc(self):
|
||||
self.plugin.get_network_info.side_effect = Exception
|
||||
@ -535,7 +540,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
||||
[mock.call.get_network_info(fake_network.id)])
|
||||
self.assertFalse(self.call_driver.called)
|
||||
self.assertTrue(log.called)
|
||||
self.assertTrue(self.dhcp.needs_resync)
|
||||
self.assertTrue(self.schedule_resync.called)
|
||||
self.assertFalse(self.cache.called)
|
||||
self.assertFalse(self.external_process.called)
|
||||
|
||||
@ -723,7 +728,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
||||
self.cache.assert_has_calls(
|
||||
[mock.call.get_network_by_id('net-id')])
|
||||
self.assertTrue(log.called)
|
||||
self.assertTrue(self.dhcp.needs_resync)
|
||||
self.assertTrue(self.dhcp.schedule_resync.called)
|
||||
|
||||
def test_subnet_update_end(self):
|
||||
payload = dict(subnet=dict(network_id=fake_network.id))
|
||||
|
Loading…
Reference in New Issue
Block a user