From 07c83f555d4ebe383bcc40191d486342c9ded7e1 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 19 Jun 2023 11:49:14 -0700 Subject: [PATCH] Add ZK cache stats To observe the performance of the ZK connection and the new tree caches, add some statsd metrics for each of these. This will let us monitor queue size over time. Also, update the assertReportedStat method to output all received stats if the expected stat was not found (like Zuul). Change-Id: Ia7e1e0980fdc34007f80371ee0a77d4478948518 Depends-On: https://review.opendev.org/886552 --- doc/source/operation.rst | 42 ++++++++++++++++++++++++++++ nodepool/launcher.py | 23 +++++++++++++++ nodepool/tests/__init__.py | 10 +++++-- nodepool/tests/unit/test_launcher.py | 24 ++++++++++++++++ nodepool/zk/zookeeper.py | 30 ++++++++++++++++++++ 5 files changed, 126 insertions(+), 3 deletions(-) diff --git a/doc/source/operation.rst b/doc/source/operation.rst index f2f963a0d..b8192f6b0 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -688,3 +688,45 @@ generic format ``..``. For example, the Since these calls reflect the internal operations of the ``openstacksdk``, the exact keys logged may vary across providers and releases. + +Internal metrics +^^^^^^^^^^^^^^^^ + +The following metrics are low-level performance metrics of the +launcher itself, primarily of interest to Nodepool developers, and are +subject to change in the future as development needs change: + +.. zuul:stat:: nodepool.launcher..zk.client.connection_queue + :type: gauge + + ZooKeeper client connection queue length. + +.. zuul:stat:: nodepool.launcher..zk.node_cache.event_queue + :type: gauge + + Node cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.node_cache.playback_queue + :type: gauge + + Node cache playback queue length. + +.. zuul:stat:: nodepool.launcher..zk.request_cache.event_queue + :type: gauge + + Request cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.request_cache.playback_queue + :type: gauge + + Request cache playback queue length. + +.. zuul:stat:: nodepool.launcher..zk.image_cache.event_queue + :type: gauge + + Image cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.image_cache.playback_queue + :type: gauge + + Image cache playback queue length. diff --git a/nodepool/launcher.py b/nodepool/launcher.py index c16d94da0..a02fb6798 100644 --- a/nodepool/launcher.py +++ b/nodepool/launcher.py @@ -1043,6 +1043,7 @@ class NodePool(threading.Thread): self._cleanup_thread = None self._delete_thread = None self._stats_thread = None + self._local_stats_thread = None self._submittedRequests = {} self.ready = False @@ -1066,6 +1067,9 @@ class NodePool(threading.Thread): self._stats_thread.stop() self._stats_thread.join() + if self._local_stats_thread: + self._local_stats_thread.join() + # Don't let stop() return until all pool threads have been # terminated. self.log.debug("Stopping pool threads") @@ -1272,6 +1276,20 @@ class NodePool(threading.Thread): for i in range(0, need): createRequest(label.name) + def _localStats(self): + if not self.statsd: + self.log.info("Statsd not configured") + return + hostname = socket.gethostname() + key = f'nodepool.launcher.{hostname}' + while not self._stopped: + try: + if self.zk: + self.zk.reportStats(self.statsd, key) + except Exception: + self.log.exception("Unable to run local stats reporting:") + self._stop_event.wait(10) + def run(self): ''' Start point for the NodePool thread. @@ -1309,6 +1327,11 @@ class NodePool(threading.Thread): self._stats_thread = StatsWorker(self, self.stats_interval) self._stats_thread.start() + if not self._local_stats_thread: + self._local_stats_thread = threading.Thread( + target=self._localStats) + self._local_stats_thread.start() + # Stop any PoolWorker threads if the pool was removed # from the config. pool_keys = set() diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 97540ff24..175d1b07f 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -319,9 +319,9 @@ class BaseTestCase(testtools.TestCase): if value: self.assertNotEqual(kind, None) - for _ in iterate_timeout(5 * SECOND, Exception, - "Find statsd event", - interval=0.1): + start = time.time() + timeout = 5 + while time.time() <= (start + timeout): # Note our fake statsd just queues up results in a queue. # We just keep going through them until we find one that # matches, or fail out. If statsd pipelines are used, @@ -358,6 +358,10 @@ class BaseTestCase(testtools.TestCase): # this key matches return True + stats = itertools.chain.from_iterable( + [s.decode('utf-8').split('\n') for s in self.statsd.stats]) + for stat in stats: + self.log.debug("Stat: %s", stat) raise Exception("Key %s not found in reported stats" % key) diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py index dac293815..c79eabb22 100644 --- a/nodepool/tests/unit/test_launcher.py +++ b/nodepool/tests/unit/test_launcher.py @@ -144,6 +144,30 @@ class TestLauncher(tests.DBTestCase): self.assertReportedStat('nodepool.label.fake-label2.nodes.aborted', value='0', kind='g') + hostname = socket.gethostname() + # Only check for presence since other threads could have + # outstanding requests and we don't know the value. + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.client.connection_queue') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.node_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.node_cache.playback_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.request_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.request_cache.playback_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.image_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.image_cache.playback_queue', + value='0', kind='g') + def test_node_assignment_order(self): """Test that nodes are assigned in the order requested""" configfile = self.setup_config('node_many_labels.yaml') diff --git a/nodepool/zk/zookeeper.py b/nodepool/zk/zookeeper.py index 49372c5e3..7df0e4d75 100644 --- a/nodepool/zk/zookeeper.py +++ b/nodepool/zk/zookeeper.py @@ -1514,6 +1514,36 @@ class ZooKeeper(ZooKeeperBase): ''' self.client.resetHosts(hosts) + def reportStats(self, statsd, root_key): + ''' + Report stats using the supplied statsd object. + + :param statsd statsd: A statsd instance + :param str root_key: The root key (eg 'nodepool.launcher.hostname') + ''' + + pipeline = statsd.pipeline() + + key = f'{root_key}.zk.client.connection_queue' + pipeline.gauge(key, len(self.client.client._queue)) + + key = f'{root_key}.zk.node_cache.event_queue' + pipeline.gauge(key, self._node_cache._event_queue.qsize()) + key = f'{root_key}.zk.node_cache.playback_queue' + pipeline.gauge(key, self._node_cache._playback_queue.qsize()) + + key = f'{root_key}.zk.request_cache.event_queue' + pipeline.gauge(key, self._request_cache._event_queue.qsize()) + key = f'{root_key}.zk.request_cache.playback_queue' + pipeline.gauge(key, self._request_cache._playback_queue.qsize()) + + key = f'{root_key}.zk.image_cache.event_queue' + pipeline.gauge(key, self._image_cache._event_queue.qsize()) + key = f'{root_key}.zk.image_cache.playback_queue' + pipeline.gauge(key, self._image_cache._playback_queue.qsize()) + + pipeline.send() + @contextmanager def imageBuildLock(self, image, blocking=True, timeout=None): '''