From 43c1a28b8437d9469ed51d0603b15eeb301f0c4d Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Fri, 31 Jul 2020 11:00:15 +0200 Subject: [PATCH] Delete init nodes when resetting lost requests Creating a new node is done in the following order: 1. Store new node with state 'init' and linked to a node request 2. Lock node 3. Set node state 'building' If a launcher gets killed between steps 1 and 3, there is a chance that we leak znodes in state 'init'. Nodes that are tied to a node request get deallocated from it when the request is reset because the corresponding launcher went offline. If the node's state is 'init', it will never be deleted, so we leak that znode. While resetting a lost request we can be sure that a node in state 'init' is orphaned, since the lock on the corresponding request has already been lost. Thus we can mark those nodes to be deleted to prevent this leak. Change-Id: I83ec79ebf89e935339e9f3b39411f6ea23951a9b --- nodepool/launcher.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nodepool/launcher.py b/nodepool/launcher.py index 94e9c7292..afb57cb0b 100644 --- a/nodepool/launcher.py +++ b/nodepool/launcher.py @@ -390,6 +390,14 @@ class CleanupWorker(BaseCleanupWorker): "request", node.id) return + # If the node is in state init then the launcher that worked + # on the lost request has been interrupted between creating + # the znode and locking/setting to building. In this case the + # znode is leaked and we should delete the node instead of + # just deallocating it. + if node.state == zk.INIT: + node.state = zk.DELETING + node.allocated_to = None try: zk_conn.storeNode(node)