From 70be90e74272f5f0799a44b45bfec974d2649aec Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Wed, 24 Jan 2024 14:54:43 -0800
Subject: [PATCH] Fix duplicate fd registration in nodescan

In an attempt to make the nodescan process as quick as possible, we
start the connection in the provider statemachine thread before
handing the remaining work off to the nodescan statemachine thread.

However, if the nodescan worker is near the end of its request list
when the provider adds the request, then it may end up performing the
initial connection nearly simultaneously with the provider thread.
They may both create a socket and attempt to register the FD. If the
race results in them registering the same FD, the following exception
occurs:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 253, in runStateMachine
    keys = self.nodescan_request.result()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1295, in result
    raise self.exception
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1147, in addRequest
    self._advance(request, False)
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1187, in _advance
    request.advance(socket_ready)
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1379, in advance
    self._connect()
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1340, in _connect
    self.worker.registerDescriptor(self.sock)
  File "/usr/local/lib/python3.11/site-packages/nodepool/driver/statemachine.py", line 1173, in registerDescriptor
    self.poll.register(
FileExistsError: [Errno 17] File exists

To address this, rather than attempting to coordinate work between
these two threads, let's just let the nodescan worker handle it.
To try to keep the process responsive, we'll wake the nodescan worker if it's sleeping. Change-Id: I5ceda68b856c09bf7606e62ac72ca5c5c76d2661 --- nodepool/driver/statemachine.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py index 5ef2882ff..4a4c8bc93 100644 --- a/nodepool/driver/statemachine.py +++ b/nodepool/driver/statemachine.py @@ -1143,12 +1143,8 @@ class NodescanWorker: self._pending_requests.append(request) else: self._active_requests.append(request) - try: - self._advance(request, False) - except Exception as e: - request.fail(e) - if request.complete: - self.removeRequest(request) + # If the poll is sleeping, wake it up for immediate action + os.write(self.wake_write, b'\n') def removeRequest(self, request): """Remove the request and cleanup"""