diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index e38376a1d..6e2388b0c 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
         ]
         self._azs = ['az1', 'az2']
         self._server_list = []
-        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
-            _get_quota()
+        self._update_quota()
         self._down_ports = [
             Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
                   device_owner="compute:nova"),
@@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
                   device_owner=None),
         ]
 
+    def _update_quota(self):
+        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
+            _get_quota()
+
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
         for instance in instance_list:
@@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
         private_v4 = 'fake'
         host_id = 'fake'
         interface_ip = 'fake'
+        self._update_quota()
         over_quota = False
         if (instance_type == Dummy.INSTANCE and
                 self.max_instances > -1 and
@@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
         return self._azs.copy()
 
     def get_compute_limits(self):
+        self._update_quota()
         return Dummy(
             'limits',
             max_total_cores=self.max_cores,
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index e017e569d..a2fe32267 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -259,7 +259,8 @@ class QuotaSupport:
 
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
-        self._current_nodepool_quota = None
+        self._current_nodepool_quota_timestamp = 0
+        self._current_nodepool_quota = {}
 
     @abc.abstractmethod
     def quotaNeededByLabel(self, label, pool):
@@ -291,7 +292,7 @@
         pass
 
     def invalidateQuotaCache(self):
-        self._current_nodepool_quota['timestamp'] = 0
+        self._current_nodepool_quota_timestamp = 0
 
     def estimatedNodepoolQuota(self):
         '''
@@ -307,8 +308,8 @@
 
         if self._current_nodepool_quota:
             now = time.time()
-            if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+            if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
+                return copy.deepcopy(self._current_nodepool_quota)
 
         # This is initialized with the full tenant quota and later becomes
         # the quota available for nodepool.
@@ -318,7 +319,7 @@
             if self._current_nodepool_quota:
                 self.log.exception("Unable to get provider quota, "
                                    "using cached value")
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+                return copy.deepcopy(self._current_nodepool_quota)
             raise
 
         self.log.debug("Provider quota for %s: %s",
@@ -328,10 +329,8 @@
         # to get the quota available for us.
         nodepool_quota.subtract(
             self.unmanagedQuotaUsed())
-        self._current_nodepool_quota = {
-            'quota': nodepool_quota,
-            'timestamp': time.time()
-        }
+        self._current_nodepool_quota = nodepool_quota
+        self._current_nodepool_quota_timestamp = time.time()
 
         self.log.debug("Available quota for %s: %s",
                        self.provider.name, nodepool_quota)
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 56edb03db..1dc4eb663 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
         self.wait_for_threads()
         return ready_nodes[label]
 
+    def waitForAnyNodeInState(self, state):
+        # Wait for any node to reach the given state
+        for _ in iterate_timeout(ONE_MINUTE, Exception,
+                                 f"Timeout waiting for node in {state} state",
+                                 interval=1):
+            for node in self.zk.nodeIterator(False):
+                if node.state == state:
+                    return node
+
     def waitForNodeRequest(self, req, states=None):
         '''
         Wait for a node request to transition to a final state.
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 9f8b1aa38..453d4843d 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
 
         # patch the cloud with requested quota
         def fake_get_quota():
+            nonlocal max_cores, max_instances, max_ram
             return (max_cores, max_instances, max_ram)
         self.useFixture(fixtures.MockPatchObject(
             fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@@ -435,7 +436,7 @@
 
         # Now, reduce the quota so the next node unexpectedly
         # (according to nodepool's quota estimate) fails.
-        client.max_instances = 1
+        max_instances = 1
 
         # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
@@ -2280,6 +2281,53 @@
         req3 = self.waitForNodeRequest(req3)
         self.assertEqual(req3.state, zk.FAILED)
 
+    def test_ignore_provider_quota_true_with_provider_error(self):
+        '''
+        Tests that quota errors returned from the provider when allocating a
+        node are correctly handled and retried. The node request should be
+        retried until there is sufficient quota to satisfy it.
+        '''
+        max_instances = 0
+
+        def fake_get_quota():
+            nonlocal max_instances
+            return (100, max_instances, 1000000)
+
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_true.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # The pool has ignore-provider-quota set to true, so the request
+        # should be handled regardless of the lack of cloud/provider quota.
+        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
+        self.log.debug(
+            "Submitting an initial request with ignore-provider-quota True")
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        self.waitForAnyNodeInState(zk.ABORTED)
+
+        self.assertReportedStat(
+            'nodepool.launch.provider.fake-provider.error.quota')
+        self.assertReportedStat(
+            'nodepool.launch.error.quota')
+
+        # Bump up the quota to allow the provider to allocate a node
+        max_instances = 1
+
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
     def test_request_order(self):
         """Test that requests are handled in sorted order"""
         configfile = self.setup_config('node_no_min_ready.yaml')
diff --git a/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
new file mode 100644
index 000000000..235e0f1d9
--- /dev/null
+++ b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes an exception that was raised by the OpenStack driver when
+    attempting to reset the quota timestamp while ``ignore-provider-quota``
+    is ``true``. This exception prevented nodepool from seeing quota errors
+    from OpenStack, causing it to fail the node request instead of retrying
+    as it does for other providers.
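
For context, a minimal sketch of the failure mode described in the release note, reusing the attribute names from the QuotaSupport diff above. The classes here are illustrative stand-ins, not the real nodepool driver code:

# Illustrative stand-ins for QuotaSupport's quota cache, not the real classes.

class OldQuotaCache:
    # Pre-fix layout: quota value and timestamp share one dict that starts
    # out as None and is only populated on the first quota estimate.
    def __init__(self):
        self._current_nodepool_quota = None

    def invalidateQuotaCache(self):
        # Raises TypeError ("'NoneType' object does not support item
        # assignment") when invalidation happens before the cache is ever
        # populated, e.g. when ignore-provider-quota skips the estimate.
        self._current_nodepool_quota['timestamp'] = 0


class FixedQuotaCache:
    # Post-fix layout: the timestamp lives in its own attribute, so
    # resetting it never dereferences an unpopulated cache.
    def __init__(self):
        self._current_nodepool_quota_timestamp = 0
        self._current_nodepool_quota = {}

    def invalidateQuotaCache(self):
        self._current_nodepool_quota_timestamp = 0


if __name__ == '__main__':
    FixedQuotaCache().invalidateQuotaCache()  # succeeds
    try:
        OldQuotaCache().invalidateQuotaCache()
    except TypeError as err:
        print('pre-fix behavior:', err)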