Record and log reason for dhcp agent resync
A dhcp resync can be triggered at a number of points, but the actual resync is done asynchronously by a helper thread. This means by the time the resync happens, it's hard to establish what actually caused it. There have been a number of problems in production systems that cause excessive resyncs. One is a ipv6/dnsmasq issue (rhbz#1077487) and another is db corruption with duplicate entries [1]. The resync triggers a whole lot of logs itself, so it becomes very unclear how to establish any causality. This change keeps track of what triggered the resync with some helpful information. We add a schedule_resync() function to to replace a explicit set of needs_resync which tracks why it was called. [1] The logs will contain output like "DBDuplicateEntry (IntegrityError) (1062, "Duplicate entry '6d799c6a-7a09-4c1e-bb63-7d30fd052c8a-d3e3ac5b-9962-428a-a9f8-6b2' for key 'PRIMARY'") ..." in this case Closes-Bug: #1294445 Change-Id: I9b1c6202f5a6bbad8589a8b64b2a38c9d9edb43b
This commit is contained in:
parent
4e49095156
commit
f199a7b1f0
@ -67,7 +67,7 @@ class DhcpAgent(manager.Manager):
|
|||||||
|
|
||||||
def __init__(self, host=None):
|
def __init__(self, host=None):
|
||||||
super(DhcpAgent, self).__init__(host=host)
|
super(DhcpAgent, self).__init__(host=host)
|
||||||
self.needs_resync = False
|
self.needs_resync_reasons = []
|
||||||
self.conf = cfg.CONF
|
self.conf = cfg.CONF
|
||||||
self.cache = NetworkCache()
|
self.cache = NetworkCache()
|
||||||
self.root_helper = config.get_root_helper(self.conf)
|
self.root_helper = config.get_root_helper(self.conf)
|
||||||
@ -135,7 +135,7 @@ class DhcpAgent(manager.Manager):
|
|||||||
'that the network and/or its subnet(s) still exist.')
|
'that the network and/or its subnet(s) still exist.')
|
||||||
% {'net_id': network.id, 'action': action})
|
% {'net_id': network.id, 'action': action})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.needs_resync = True
|
self.schedule_resync(e)
|
||||||
if (isinstance(e, common.RemoteError)
|
if (isinstance(e, common.RemoteError)
|
||||||
and e.exc_type == 'NetworkNotFound'
|
and e.exc_type == 'NetworkNotFound'
|
||||||
or isinstance(e, exceptions.NetworkNotFound)):
|
or isinstance(e, exceptions.NetworkNotFound)):
|
||||||
@ -144,6 +144,10 @@ class DhcpAgent(manager.Manager):
|
|||||||
LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
|
LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
|
||||||
% {'net_id': network.id, 'action': action})
|
% {'net_id': network.id, 'action': action})
|
||||||
|
|
||||||
|
def schedule_resync(self, reason):
|
||||||
|
"""Schedule a resync for a given reason."""
|
||||||
|
self.needs_resync_reasons.append(reason)
|
||||||
|
|
||||||
@utils.synchronized('dhcp-agent')
|
@utils.synchronized('dhcp-agent')
|
||||||
def sync_state(self):
|
def sync_state(self):
|
||||||
"""Sync the local DHCP state with Neutron."""
|
"""Sync the local DHCP state with Neutron."""
|
||||||
@ -157,8 +161,8 @@ class DhcpAgent(manager.Manager):
|
|||||||
for deleted_id in known_network_ids - active_network_ids:
|
for deleted_id in known_network_ids - active_network_ids:
|
||||||
try:
|
try:
|
||||||
self.disable_dhcp_helper(deleted_id)
|
self.disable_dhcp_helper(deleted_id)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
self.needs_resync = True
|
self.schedule_resync(e)
|
||||||
LOG.exception(_('Unable to sync network state on deleted '
|
LOG.exception(_('Unable to sync network state on deleted '
|
||||||
'network %s'), deleted_id)
|
'network %s'), deleted_id)
|
||||||
|
|
||||||
@ -167,16 +171,22 @@ class DhcpAgent(manager.Manager):
|
|||||||
pool.waitall()
|
pool.waitall()
|
||||||
LOG.info(_('Synchronizing state complete'))
|
LOG.info(_('Synchronizing state complete'))
|
||||||
|
|
||||||
except Exception:
|
except Exception as e:
|
||||||
self.needs_resync = True
|
self.schedule_resync(e)
|
||||||
LOG.exception(_('Unable to sync network state.'))
|
LOG.exception(_('Unable to sync network state.'))
|
||||||
|
|
||||||
def _periodic_resync_helper(self):
|
def _periodic_resync_helper(self):
|
||||||
"""Resync the dhcp state at the configured interval."""
|
"""Resync the dhcp state at the configured interval."""
|
||||||
while True:
|
while True:
|
||||||
eventlet.sleep(self.conf.resync_interval)
|
eventlet.sleep(self.conf.resync_interval)
|
||||||
if self.needs_resync:
|
if self.needs_resync_reasons:
|
||||||
self.needs_resync = False
|
# be careful to avoid a race with additions to list
|
||||||
|
# from other threads
|
||||||
|
reasons = self.needs_resync_reasons
|
||||||
|
self.needs_resync_reasons = []
|
||||||
|
for r in reasons:
|
||||||
|
LOG.debug(_("resync: %(reason)s"),
|
||||||
|
{"reason": r})
|
||||||
self.sync_state()
|
self.sync_state()
|
||||||
|
|
||||||
def periodic_resync(self):
|
def periodic_resync(self):
|
||||||
@ -189,8 +199,8 @@ class DhcpAgent(manager.Manager):
|
|||||||
if not network:
|
if not network:
|
||||||
LOG.warn(_('Network %s has been deleted.'), network_id)
|
LOG.warn(_('Network %s has been deleted.'), network_id)
|
||||||
return network
|
return network
|
||||||
except Exception:
|
except Exception as e:
|
||||||
self.needs_resync = True
|
self.schedule_resync(e)
|
||||||
LOG.exception(_('Network %s info call failed.'), network_id)
|
LOG.exception(_('Network %s info call failed.'), network_id)
|
||||||
|
|
||||||
def enable_dhcp_helper(self, network_id):
|
def enable_dhcp_helper(self, network_id):
|
||||||
@ -579,7 +589,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
|
|||||||
|
|
||||||
def agent_updated(self, context, payload):
|
def agent_updated(self, context, payload):
|
||||||
"""Handle the agent_updated notification event."""
|
"""Handle the agent_updated notification event."""
|
||||||
self.needs_resync = True
|
self.schedule_resync(_("Agent updated: %(payload)s") %
|
||||||
|
{"payload": payload})
|
||||||
LOG.info(_("agent_updated by server side %s!"), payload)
|
LOG.info(_("agent_updated by server side %s!"), payload)
|
||||||
|
|
||||||
def after_start(self):
|
def after_start(self):
|
||||||
|
@ -211,14 +211,16 @@ class TestDhcpAgent(base.BaseTestCase):
|
|||||||
self.driver.return_value.foo.side_effect = exc or Exception
|
self.driver.return_value.foo.side_effect = exc or Exception
|
||||||
with mock.patch.object(dhcp_agent.LOG, trace_level) as log:
|
with mock.patch.object(dhcp_agent.LOG, trace_level) as log:
|
||||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||||
self.assertIsNone(dhcp.call_driver('foo', network))
|
with mock.patch.object(dhcp,
|
||||||
self.driver.assert_called_once_with(cfg.CONF,
|
'schedule_resync') as schedule_resync:
|
||||||
mock.ANY,
|
self.assertIsNone(dhcp.call_driver('foo', network))
|
||||||
'sudo',
|
self.driver.assert_called_once_with(cfg.CONF,
|
||||||
mock.ANY,
|
mock.ANY,
|
||||||
mock.ANY)
|
'sudo',
|
||||||
self.assertEqual(log.call_count, 1)
|
mock.ANY,
|
||||||
self.assertEqual(expected_sync, dhcp.needs_resync)
|
mock.ANY)
|
||||||
|
self.assertEqual(log.call_count, 1)
|
||||||
|
self.assertEqual(expected_sync, schedule_resync.called)
|
||||||
|
|
||||||
def test_call_driver_failure(self):
|
def test_call_driver_failure(self):
|
||||||
self._test_call_driver_failure()
|
self._test_call_driver_failure()
|
||||||
@ -298,10 +300,12 @@ class TestDhcpAgent(base.BaseTestCase):
|
|||||||
|
|
||||||
with mock.patch.object(dhcp_agent.LOG, 'exception') as log:
|
with mock.patch.object(dhcp_agent.LOG, 'exception') as log:
|
||||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||||
dhcp.sync_state()
|
with mock.patch.object(dhcp,
|
||||||
|
'schedule_resync') as schedule_resync:
|
||||||
|
dhcp.sync_state()
|
||||||
|
|
||||||
self.assertTrue(log.called)
|
self.assertTrue(log.called)
|
||||||
self.assertTrue(dhcp.needs_resync)
|
self.assertTrue(schedule_resync.called)
|
||||||
|
|
||||||
def test_periodic_resync(self):
|
def test_periodic_resync(self):
|
||||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||||
@ -312,14 +316,14 @@ class TestDhcpAgent(base.BaseTestCase):
|
|||||||
def test_periodoc_resync_helper(self):
|
def test_periodoc_resync_helper(self):
|
||||||
with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
|
with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
|
||||||
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||||
dhcp.needs_resync = True
|
dhcp.needs_resync_reasons = ['reason1', 'reason2']
|
||||||
with mock.patch.object(dhcp, 'sync_state') as sync_state:
|
with mock.patch.object(dhcp, 'sync_state') as sync_state:
|
||||||
sync_state.side_effect = RuntimeError
|
sync_state.side_effect = RuntimeError
|
||||||
with testtools.ExpectedException(RuntimeError):
|
with testtools.ExpectedException(RuntimeError):
|
||||||
dhcp._periodic_resync_helper()
|
dhcp._periodic_resync_helper()
|
||||||
sync_state.assert_called_once_with()
|
sync_state.assert_called_once_with()
|
||||||
sleep.assert_called_once_with(dhcp.conf.resync_interval)
|
sleep.assert_called_once_with(dhcp.conf.resync_interval)
|
||||||
self.assertFalse(dhcp.needs_resync)
|
self.assertEqual(len(dhcp.needs_resync_reasons), 0)
|
||||||
|
|
||||||
def test_populate_cache_on_start_without_active_networks_support(self):
|
def test_populate_cache_on_start_without_active_networks_support(self):
|
||||||
# emul dhcp driver that doesn't support retrieving of active networks
|
# emul dhcp driver that doesn't support retrieving of active networks
|
||||||
@ -468,14 +472,15 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
|||||||
self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.'
|
self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.'
|
||||||
'DhcpAgent._populate_networks_cache')
|
'DhcpAgent._populate_networks_cache')
|
||||||
self.mock_init = self.mock_init_p.start()
|
self.mock_init = self.mock_init_p.start()
|
||||||
|
|
||||||
with mock.patch.object(dhcp.Dnsmasq,
|
with mock.patch.object(dhcp.Dnsmasq,
|
||||||
'check_version') as check_v:
|
'check_version') as check_v:
|
||||||
check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION
|
check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION
|
||||||
self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
|
||||||
self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver')
|
self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver')
|
||||||
|
|
||||||
self.call_driver = self.call_driver_p.start()
|
self.call_driver = self.call_driver_p.start()
|
||||||
|
self.schedule_resync_p = mock.patch.object(self.dhcp,
|
||||||
|
'schedule_resync')
|
||||||
|
self.schedule_resync = self.schedule_resync_p.start()
|
||||||
self.external_process_p = mock.patch(
|
self.external_process_p = mock.patch(
|
||||||
'neutron.agent.linux.external_process.ProcessManager'
|
'neutron.agent.linux.external_process.ProcessManager'
|
||||||
)
|
)
|
||||||
@ -525,7 +530,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
|||||||
[mock.call.get_network_info('fake_id')])
|
[mock.call.get_network_info('fake_id')])
|
||||||
self.assertFalse(self.call_driver.called)
|
self.assertFalse(self.call_driver.called)
|
||||||
self.assertTrue(log.called)
|
self.assertTrue(log.called)
|
||||||
self.assertFalse(self.dhcp.needs_resync)
|
self.assertFalse(self.dhcp.schedule_resync.called)
|
||||||
|
|
||||||
def test_enable_dhcp_helper_exception_during_rpc(self):
|
def test_enable_dhcp_helper_exception_during_rpc(self):
|
||||||
self.plugin.get_network_info.side_effect = Exception
|
self.plugin.get_network_info.side_effect = Exception
|
||||||
@ -535,7 +540,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
|||||||
[mock.call.get_network_info(fake_network.id)])
|
[mock.call.get_network_info(fake_network.id)])
|
||||||
self.assertFalse(self.call_driver.called)
|
self.assertFalse(self.call_driver.called)
|
||||||
self.assertTrue(log.called)
|
self.assertTrue(log.called)
|
||||||
self.assertTrue(self.dhcp.needs_resync)
|
self.assertTrue(self.schedule_resync.called)
|
||||||
self.assertFalse(self.cache.called)
|
self.assertFalse(self.cache.called)
|
||||||
self.assertFalse(self.external_process.called)
|
self.assertFalse(self.external_process.called)
|
||||||
|
|
||||||
@ -723,7 +728,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
|
|||||||
self.cache.assert_has_calls(
|
self.cache.assert_has_calls(
|
||||||
[mock.call.get_network_by_id('net-id')])
|
[mock.call.get_network_by_id('net-id')])
|
||||||
self.assertTrue(log.called)
|
self.assertTrue(log.called)
|
||||||
self.assertTrue(self.dhcp.needs_resync)
|
self.assertTrue(self.dhcp.schedule_resync.called)
|
||||||
|
|
||||||
def test_subnet_update_end(self):
|
def test_subnet_update_end(self):
|
||||||
payload = dict(subnet=dict(network_id=fake_network.id))
|
payload = dict(subnet=dict(network_id=fake_network.id))
|
||||||
|
Loading…
Reference in New Issue
Block a user