Reset the client object after proxy timeouts

In situations where the connection between nodepool and the cloud
it manages may be through an evil proxy, the fact that the
requests library in python-novaclient keeps port connections open
means that the evil proxy might timeout the connection during the
period of time that the image is being built.

If the Task object encounters a ProxyError it needs to be propagated
up in the calling thread to the ProviderManager rather than caught
and handled in TaskManager in the main thread. Once caught in the
ProviderManager, create a new client object and retry once. If a second
ProxyError is raised, then it will be passed to the main thread the
same as any other Exception.

Change-Id: I3d69de48be1993c04356d209f622b0a17e6e8d03
This commit is contained in:
Monty Taylor 2014-02-19 22:41:33 -08:00 committed by K Jonathan Harker
parent ddd600368a
commit 8b87890b97
4 changed files with 75 additions and 2 deletions

View File

@ -16,6 +16,7 @@
import StringIO
import novaclient
import requests.exceptions
import threading
import time
import uuid
@ -118,6 +119,12 @@ class FakeHTTPClient(object):
return None, dict(extensions=dict())
class BadHTTPClient(object):
'''Always raises a ProxyError'''
def get(self, path):
raise requests.exceptions.ProxyError
class FakeClient(object):
def __init__(self, *args, **kwargs):
self.flavors = FakeList([
@ -130,6 +137,12 @@ class FakeClient(object):
self.servers.api = self
class BadClient(FakeClient):
def __init__(self):
super(BadClient, self).__init__()
self.client = BadHTTPClient()
class FakeGlanceClient(object):
def __init__(self, *args, **kwargs):
self.id = 'fake-glance-id'
@ -226,3 +239,4 @@ class FakeJenkins(object):
FAKE_CLIENT = FakeClient()
BAD_CLIENT = BadClient()

View File

@ -27,6 +27,8 @@ import glanceclient
import glanceclient.client
import keystoneclient.v2_0.client as ksclient
import time
import requests.exceptions
import sys
from nodeutils import iterate_timeout
from task_manager import Task, TaskManager, ManagerStoppedException
@ -251,7 +253,7 @@ class ProviderManager(TaskManager):
super(ProviderManager, self).__init__(None, provider.name,
provider.rate)
self.provider = provider
self._client = self._getClient()
self.resetClient()
self._images = {}
self._networks = {}
self._cloud_metadata_read = False
@ -297,6 +299,24 @@ class ProviderManager(TaskManager):
kwargs['timeout'] = self.provider.api_timeout
return novaclient.client.Client(*args, **kwargs)
def runTask(self, task):
try:
task.run(self._client)
except requests.exceptions.ProxyError:
# Try to get a new client object if we get a ProxyError
self.log.exception('Resetting client due to ProxyError')
self.resetClient()
try:
task.run(self._client)
except requests.exceptions.ProxyError as e:
# If we get a second ProxyError, then make sure it gets raised
# the same way all other Exceptions from the Task object do.
# This will move the Exception to the main thread.
task.exception(e, sys.exc_info()[2])
def resetClient(self):
self._client = self._getClient()
def _getFlavors(self):
flavors = self.listFlavors()
flavors.sort(lambda a, b: cmp(a['ram'], b['ram']))

View File

@ -22,6 +22,7 @@ from six.moves import queue as Queue
import logging
import time
from statsd import statsd
import requests.exceptions
class ManagerStoppedException(Exception):
@ -54,6 +55,8 @@ class Task(object):
def run(self, client):
try:
self.done(self.main(client))
except requests.exceptions.ProxyError as e:
raise e
except Exception as e:
self.exception(e, sys.exc_info()[2])
@ -90,7 +93,7 @@ class TaskManager(threading.Thread):
self.log.debug("Manager %s running task %s (queue: %s)" %
(self.name, task, self.queue.qsize()))
start = time.time()
task.run(self._client)
self.runTask(task)
last_ts = time.time()
dt = last_ts - start
self.log.debug("Manager %s ran task %s in %ss" %
@ -110,3 +113,6 @@ class TaskManager(threading.Thread):
"Manager %s is no longer running" % self.name)
self.queue.put(task)
return task.wait()
def runTask(self, task):
task.run(self._client)

View File

@ -17,8 +17,12 @@ import fixtures
from nodepool import tests
from nodepool import nodedb
import nodepool.fakeprovider
import nodepool.nodepool
import requests.exceptions
from testtools import ExpectedException
class TestNodepool(tests.DBTestCase):
def test_db(self):
@ -243,3 +247,32 @@ class TestNodepool(tests.DBTestCase):
# Make sure our old node is in delete state
self.assertEqual(len(deleted_nodes), 1)
self.assertEqual(node_id, deleted_nodes[0].id)
def test_proxy_timeout(self):
"""Test that we re-run a task after a ProxyError"""
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
self.waitForNodes(pool)
provider = pool.config.providers['fake-provider']
manager = pool.getProviderManager(provider)
# In order to test recovering from a ProxyError from the client
# we are going manually set the client object to be a bad client that
# always raises a ProxyError. If our client reset works correctly
# then we will create a new client object, which in this case would
# be a new fake client in place of the bad client.
manager._client = nodepool.fakeprovider.BAD_CLIENT
# The only implemented function for the fake and bad clients
# If we don't raise an uncaught exception, we pass
manager.listExtensions()
# Now let's do it again, but let's prevent the client object from being
# replaced and then assert that we raised the exception that we expect.
manager._client = nodepool.fakeprovider.BAD_CLIENT
manager._getClient = lambda: nodepool.fakeprovider.BAD_CLIENT
with ExpectedException(requests.exceptions.ProxyError):
manager.listExtensions()