From a57231cb60a385f277cca35343f9dcccbb3b00e7 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Wed, 24 Apr 2024 14:15:45 -0700
Subject: [PATCH] Add max-age to metastatic driver

This allows the metastatic driver to gracefully remove a backing node
from service after a certain amount of time.  This forced retirement
can be used to periodically ensure that fresh backing nodes are used
even in busy systems (which can help ensure that, over time, job
behavior does not change based on the contents of the backing node
server).

Change-Id: I62a95411a5d0b75185739a3c2553c75124c78c25
---
 doc/source/metastatic.rst                     | 12 +++++
 nodepool/driver/metastatic/adapter.py         |  9 ++++
 nodepool/driver/metastatic/config.py          |  8 +++-
 nodepool/tests/fixtures/metastatic.yaml       |  1 +
 nodepool/tests/unit/test_driver_metastatic.py | 44 +++++++++++++++++++
 5 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/doc/source/metastatic.rst b/doc/source/metastatic.rst
index dc0110f33..7412a9843 100644
--- a/doc/source/metastatic.rst
+++ b/doc/source/metastatic.rst
@@ -203,6 +203,18 @@ itself, which is "meta".
       used to ensure that the backing node is retained for at least
       the minimum billing interval.
 
+   .. attr:: max-age
+      :type: int
+
+      If this value is set, the backing node will be removed
+      from service after this amount of time (in seconds) has
+      passed since the backing node was launched.  After a
+      backing node reaches this point, any existing jobs will
+      be permitted to run to completion, but no new metastatic
+      nodes will be created with that backing node, and once all
+      metastatic nodes using it have been deleted, then the
+      backing node will be deleted.
+
    .. attr:: host-key-checking
      :type: bool
      :default: False
diff --git a/nodepool/driver/metastatic/adapter.py b/nodepool/driver/metastatic/adapter.py
index 7aefdac75..6c71cd024 100644
--- a/nodepool/driver/metastatic/adapter.py
+++ b/nodepool/driver/metastatic/adapter.py
@@ -298,6 +298,15 @@ class MetastaticAdapter(statemachine.Adapter):
                 if label_config:
                     grace_time = label_config.grace_time
                     min_time = label_config.min_retention_time
+                    if label_config.max_age:
+                        if now - bnr.launched > label_config.max_age:
+                            # Mark it as failed; even though it
+                            # hasn't really failed, the lifecycle
+                            # is the same: do not allocate any
+                            # more jobs to this node but let any
+                            # remaining ones finish, then delete
+                            # ASAP.
+                            bnr.failed = True
                 else:
                     # The label doesn't exist in our config any more,
                     # it must have been removed.
diff --git a/nodepool/driver/metastatic/config.py b/nodepool/driver/metastatic/config.py
index 4ab9ab88d..af1291576 100644
--- a/nodepool/driver/metastatic/config.py
+++ b/nodepool/driver/metastatic/config.py
@@ -46,8 +46,12 @@ class MetastaticLabel(ConfigValue):
         self.max_parallel_jobs = label.get('max-parallel-jobs', 1)
         self.grace_time = label.get('grace-time', 60)
         self.min_retention_time = label.get('min-retention-time', 0)
+        self.max_age = label.get('max-age', None)
         self.host_key_checking = label.get('host-key-checking',
                                            self.pool.host_key_checking)
+        if self.max_age and self.max_age < self.min_retention_time:
+            raise Exception("The max_age must be greater than or "
+                            "equal to the min_retention_time")
 
     @staticmethod
     def getSchema():
@@ -57,6 +61,7 @@ class MetastaticLabel(ConfigValue):
             'max-parallel-jobs': int,
             'grace-time': int,
             'min-retention-time': int,
+            'max-age': int,
             'host-key-checking': bool,
         }
 
@@ -66,7 +71,8 @@ class MetastaticLabel(ConfigValue):
             self.backing_label == other.backing_label and
             self.max_parallel_jobs == other.max_parallel_jobs and
             self.grace_time == other.grace_time and
-            self.min_retention_time == other.min_retention_time
+            self.min_retention_time == other.min_retention_time and
+            self.max_age == other.max_age
         )
 
diff --git a/nodepool/tests/fixtures/metastatic.yaml b/nodepool/tests/fixtures/metastatic.yaml
index 343562869..3a5b0e766 100644
--- a/nodepool/tests/fixtures/metastatic.yaml
+++ b/nodepool/tests/fixtures/metastatic.yaml
@@ -66,6 +66,7 @@ providers:
             backing-label: backing-label
             max-parallel-jobs: 2
             grace-time: 2
+            max-age: 300
             host-key-checking: true
           - name: user-label-min-retention
             backing-label: backing-label-min-retention
diff --git a/nodepool/tests/unit/test_driver_metastatic.py b/nodepool/tests/unit/test_driver_metastatic.py
index feaa7358e..d53f57d2f 100644
--- a/nodepool/tests/unit/test_driver_metastatic.py
+++ b/nodepool/tests/unit/test_driver_metastatic.py
@@ -372,3 +372,47 @@ class TestDriverMetastatic(tests.DBTestCase):
         meta_manager.adapter.listResources()
         nodes = self._getNodes()
         self.waitForNodeDeletion(bn1)
+
+    def test_metastatic_max_age(self):
+        # Test the max-age option
+        configfile = self.setup_config('metastatic.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+        manager = pool.getProviderManager('fake-provider')
+        manager.adapter._client.create_image(name="fake-image")
+
+        # Launch one metastatic node on a backing node
+        node1 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 2)
+        bn1 = nodes[1]
+        self.assertEqual(bn1.provider, 'fake-provider')
+        self.assertEqual(bn1.id, node1.driver_data['backing_node'])
+
+        # Create a second node and verify it uses the same backing node.
+        node2 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 3)
+        self.assertEqual(bn1.id, node2.driver_data['backing_node'])
+
+        # Delete the second node.
+        node2.state = zk.DELETING
+        self.zk.storeNode(node2)
+        self.waitForNodeDeletion(node2)
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 2)
+
+        # Falsify the launch time so that the node is older than
+        # max_age (300).
+        meta_manager = pool.getProviderManager('meta-provider')
+        bnr = meta_manager.adapter.backing_node_records['user-label'][0]
+        bnr.launched = 0
+
+        # This has the side effect of marking the backing node as failed.
+        meta_manager.adapter.listResources()
+
+        # Create another node and verify it gets a new backing node.
+        node3 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 4)
+        self.assertNotEqual(bn1.id, node3.driver_data['backing_node'])
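
For illustration, a pool label using the new option might look roughly like
the following sketch.  It is based on the test fixture above: the provider
and label names come from that fixture, while the surrounding pool layout
and the min-retention-time value are illustrative assumptions, not part of
this change:

    providers:
      - name: meta-provider
        driver: metastatic
        pools:
          - name: main
            labels:
              - name: user-label
                backing-label: backing-label
                max-parallel-jobs: 2
                grace-time: 2
                # Assumed value; max-age must be >= min-retention-time,
                # otherwise config validation raises an exception.
                min-retention-time: 60
                # After 300 seconds the backing node accepts no new
                # metastatic nodes and is deleted once its remaining
                # metastatic nodes are gone.
                max-age: 300

The ordering constraint in the comment mirrors the check added to
config.py: a backing node must be allowed to live for at least
min-retention-time before max-age can retire it.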