Fix relaunch attempts when hitting quota errors
The quota calculations in nodepool can never be perfectly accurate because there can still be races with other launchers in the tenant or with other workloads sharing the tenant. So we must be able to gracefully handle the case where the cloud refuses a launch because of quota.

Currently we just invalidate the quota cache and immediately retry the launch. In most cases that will fail again, which makes the node request, and thus the Zuul job, fail with NODE_FAILURE.

Instead we need to pause the handler, just as we do when our own quota calculation says the pool is full. We now mark the node as aborted, which is not a failure indicator, and pause the handler so it will automatically reschedule a new node launch as soon as the quota calculation allows it.

Change-Id: I122268dc7723901c04a58fa6f5c1688a3fdab227
commit 31c276c234
parent eb52394c8c
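The intent of the change can be summarized with a small sketch. The names below (launch, poll, create_server) are illustrative stand-ins, not the actual nodepool API; the real logic lives in NodeLauncher, NodeRequestHandler and the OpenStack driver in the diff below.

# Illustrative sketch only -- hypothetical helpers, not nodepool's real classes.
READY, FAILED, ABORTED = 'ready', 'failed', 'aborted'


class QuotaException(Exception):
    """Raised when the cloud refuses a launch because of quota."""
    pass


def launch(node, create_server):
    # Translate a quota refusal into the transient ABORTED state instead of
    # FAILED, so it does not count as a failed launch attempt.
    try:
        create_server()
        node['state'] = READY
    except QuotaException:
        node['state'] = ABORTED
    except Exception:
        node['state'] = FAILED


def poll(nodeset):
    # Once all launches have finished, decide what the handler should do.
    if any(n['state'] == FAILED for n in nodeset):
        return 'decline'   # genuine failures still decline the request
    if any(n['state'] == ABORTED for n in nodeset):
        return 'pause'     # wait for quota to free up, then relaunch
    return 'fulfill'

The key point is that an aborted node is never counted as a failed launch, so a quota race pauses the handler instead of reporting NODE_FAILURE to Zuul.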
@@ -300,6 +300,15 @@ class LabelRecorder(object):
         self.data.remove({'label': label, 'node_id': node_id})
         return node_id
 
+    def removeNode(self, id):
+        '''
+        Remove the node with the specified ID.
+        '''
+        for d in self.data:
+            if d['node_id'] == id:
+                self.data.remove(d)
+                return
+
 
 class NodeRequestHandlerNotifications(object):
     """
@@ -645,9 +654,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             return False
 
         # Launches are complete, so populate ready_nodes and failed_nodes.
-        for node in self.nodeset:
+        aborted_nodes = []
+        for node in self.nodeset[:]:
             if node.state == zk.READY:
                 self.ready_nodes.append(node)
+            elif node.state == zk.ABORTED:
+                # ABORTED is a transient error triggered by overquota. In order
+                # to handle this gracefully don't count this as failed so the
+                # node is relaunched within this provider. Unlock the node so
+                # the DeletedNodeWorker cleans up the zNode.
+                aborted_nodes.append(node)
+                self.nodeset.remove(node)
+                self.zk.unlockNode(node)
             else:
                 self.failed_nodes.append(node)
 
@@ -674,6 +692,14 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             self.log.debug("Declining node request %s because nodes failed",
                            self.request.id)
             self.decline_request()
+        elif aborted_nodes:
+            # Because nodes are added to the satisfied types list before they
+            # are ready we need to remove the aborted nodes again so they can
+            # be created again.
+            for node in aborted_nodes:
+                self._satisfied_types.removeNode(node.id)
+            self.paused = True
+            return False
         else:
             # The assigned nodes must be added to the request in the order
             # in which they were requested.
@@ -246,10 +246,12 @@ class OpenStackNodeLauncher(NodeLauncher):
                     self.zk.storeNode(self.node)
                 if attempts == self._retries:
                     raise
-                # Invalidate the quota cache if we encountered a quota error.
                 if 'quota exceeded' in str(e).lower():
+                    # A quota exception is not directly recoverable so bail
+                    # out immediately with a specific exception.
                     self.log.info("Quota exceeded, invalidating quota cache")
                     self.handler.manager.invalidateQuotaCache()
+                    raise exceptions.QuotaException("Quota exceeded")
                 attempts += 1
 
         self.node.state = zk.READY
@@ -380,8 +382,8 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
         '''
         Check if all launch requests have completed.
 
-        When all of the Node objects have reached a final state (READY or
-        FAILED), we'll know all threads have finished the launch process.
+        When all of the Node objects have reached a final state (READY, FAILED
+        or ABORTED), we'll know all threads have finished the launch process.
         '''
         if not self._threads:
             return True
@@ -392,9 +394,10 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
 
         node_states = [node.state for node in self.nodeset]
 
-        # NOTE: It very important that NodeLauncher always sets one of
+        # NOTE: It's very important that NodeLauncher always sets one of
         # these states, no matter what.
-        if not all(s in (zk.READY, zk.FAILED) for s in node_states):
+        if not all(s in (zk.READY, zk.FAILED, zk.ABORTED)
+                   for s in node_states):
             return False
 
         return True
@@ -22,6 +22,7 @@ import time
 
 from kazoo import exceptions as kze
 
+from nodepool import exceptions
 from nodepool import stats
 from nodepool import zk
 
@@ -69,6 +70,16 @@ class NodeLauncher(threading.Thread,
                 self.node.id)
             self.node.state = zk.FAILED
             statsd_key = 'error.zksession'
+        except exceptions.QuotaException:
+            # We encountered a quota error when trying to launch a
+            # node. In this case we need to abort the launch. The upper
+            # layers will take care of this and reschedule a new node once
+            # the quota is ok again.
+            self.log.info("Aborting node %s due to quota failure" %
+                          self.node.id)
+            self.node.state = zk.ABORTED
+            self.zk.storeNode(self.node)
+            statsd_key = 'error.quota'
         except Exception as e:
             self.log.exception("Launch failed for node %s:", self.node.id)
             self.node.state = zk.FAILED
@@ -45,6 +45,10 @@ class DibFailedError(BuilderError):
     pass
 
 
+class QuotaException(Exception):
+    pass
+
+
 class TimeoutException(Exception):
     pass
 
@@ -224,6 +224,8 @@ class PoolWorker(threading.Thread):
             try:
                 if not r.poll():
                     active_handlers.append(r)
+                    if r.paused:
+                        self.paused_handler = r
                 else:
                     self.log.debug("Removing handler for request %s",
                                    r.request.id)
@@ -662,7 +664,7 @@ class DeletedNodeWorker(BaseCleanupWorker):
         Delete instances from providers and nodes entries from ZooKeeper.
         '''
         cleanup_states = (zk.USED, zk.IN_USE, zk.BUILDING, zk.FAILED,
-                          zk.DELETING)
+                          zk.DELETING, zk.ABORTED)
 
         zk_conn = self._nodepool.getZK()
         for node in zk_conn.nodeIterator():
@@ -297,32 +297,35 @@ class TestLauncher(tests.DBTestCase):
         # (according to nodepool's quota estimate) fails.
         client.max_instances = 1
 
-        # Request a second node; this request should fail.
+        # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
         req2.node_types.append('fake-label')
         self.log.debug("Adding second request")
         self.zk.storeNodeRequest(req2)
-        req2 = self.waitForNodeRequest(req2)
-        self.assertEqual(req2.state, zk.FAILED)
-
-        # After the second request failed, the internal quota estimate
-        # should be reset, so the next request should pause to wait
-        # for more quota to become available.
-        req3 = zk.NodeRequest()
-        req3.state = zk.REQUESTED
-        req3.node_types.append('fake-label')
-        self.log.debug("Adding third request")
-        self.zk.storeNodeRequest(req3)
-        req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
-        self.assertEqual(req3.state, zk.PENDING)
 
         # Wait until there is a paused request handler and verify that
         # there is still only one server built (from the first
         # request).
         pool_worker = pool.getPoolWorkers('fake-provider')
         while not pool_worker[0].paused_handler:
+            # self.log.debug("tick")
             time.sleep(0.1)
+        self.log.debug("finished waiting")
+
+        # The handler is paused now and the request should be in state PENDING
+        req2 = self.waitForNodeRequest(req2, zk.PENDING)
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # Now free up the first node
+        self.log.debug("Marking first node as used %s", req1.id)
+        req1_node.state = zk.USED
+        self.zk.storeNode(req1_node)
+        self.zk.unlockNode(req1_node)
+        self.waitForNodeDeletion(req1_node)
+
+        # After the first node is cleaned up the second request should be
+        # able to fulfill now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FULFILLED)
+
         self.assertEqual(len(client._server_list), 1)
 
     def test_fail_request_on_launch_failure(self):
@@ -52,6 +52,9 @@ USED = 'used'
 HOLD = 'hold'
 # Initial node state
 INIT = 'init'
+# Aborted due to a transient error like overquota that should not count as a
+# failed launch attempt
+ABORTED = 'aborted'
 
 
 # NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -484,7 +487,7 @@ class Node(BaseModel):
     Class representing a launched node.
     '''
     VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
-                        HOLD, DELETING, FAILED, INIT])
+                        HOLD, DELETING, FAILED, INIT, ABORTED])
 
     def __init__(self, id=None):
         super(Node, self).__init__(id)