diff --git a/doc/source/operation.rst b/doc/source/operation.rst index f2f963a0d..b8192f6b0 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -688,3 +688,45 @@ generic format ``..``. For example, the Since these calls reflect the internal operations of the ``openstacksdk``, the exact keys logged may vary across providers and releases. + +Internal metrics +^^^^^^^^^^^^^^^^ + +The following metrics are low-level performance metrics of the +launcher itself, primarily of interest to Nodepool developers, and are +subject to change in the future as development needs change: + +.. zuul:stat:: nodepool.launcher..zk.client.connection_queue + :type: gauge + + ZooKeeper client connection queue length. + +.. zuul:stat:: nodepool.launcher..zk.node_cache.event_queue + :type: gauge + + Node cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.node_cache.playback_queue + :type: gauge + + Node cache playback queue length. + +.. zuul:stat:: nodepool.launcher..zk.request_cache.event_queue + :type: gauge + + Request cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.request_cache.playback_queue + :type: gauge + + Request cache playback queue length. + +.. zuul:stat:: nodepool.launcher..zk.image_cache.event_queue + :type: gauge + + Image cache event queue length. + +.. zuul:stat:: nodepool.launcher..zk.image_cache.playback_queue + :type: gauge + + Image cache playback queue length. diff --git a/nodepool/launcher.py b/nodepool/launcher.py index c16d94da0..a02fb6798 100644 --- a/nodepool/launcher.py +++ b/nodepool/launcher.py @@ -1043,6 +1043,7 @@ class NodePool(threading.Thread): self._cleanup_thread = None self._delete_thread = None self._stats_thread = None + self._local_stats_thread = None self._submittedRequests = {} self.ready = False @@ -1066,6 +1067,9 @@ class NodePool(threading.Thread): self._stats_thread.stop() self._stats_thread.join() + if self._local_stats_thread: + self._local_stats_thread.join() + # Don't let stop() return until all pool threads have been # terminated. self.log.debug("Stopping pool threads") @@ -1272,6 +1276,20 @@ class NodePool(threading.Thread): for i in range(0, need): createRequest(label.name) + def _localStats(self): + if not self.statsd: + self.log.info("Statsd not configured") + return + hostname = socket.gethostname() + key = f'nodepool.launcher.{hostname}' + while not self._stopped: + try: + if self.zk: + self.zk.reportStats(self.statsd, key) + except Exception: + self.log.exception("Unable to run local stats reporting:") + self._stop_event.wait(10) + def run(self): ''' Start point for the NodePool thread. @@ -1309,6 +1327,11 @@ class NodePool(threading.Thread): self._stats_thread = StatsWorker(self, self.stats_interval) self._stats_thread.start() + if not self._local_stats_thread: + self._local_stats_thread = threading.Thread( + target=self._localStats) + self._local_stats_thread.start() + # Stop any PoolWorker threads if the pool was removed # from the config. pool_keys = set() diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 97540ff24..175d1b07f 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -319,9 +319,9 @@ class BaseTestCase(testtools.TestCase): if value: self.assertNotEqual(kind, None) - for _ in iterate_timeout(5 * SECOND, Exception, - "Find statsd event", - interval=0.1): + start = time.time() + timeout = 5 + while time.time() <= (start + timeout): # Note our fake statsd just queues up results in a queue. # We just keep going through them until we find one that # matches, or fail out. If statsd pipelines are used, @@ -358,6 +358,10 @@ class BaseTestCase(testtools.TestCase): # this key matches return True + stats = itertools.chain.from_iterable( + [s.decode('utf-8').split('\n') for s in self.statsd.stats]) + for stat in stats: + self.log.debug("Stat: %s", stat) raise Exception("Key %s not found in reported stats" % key) diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py index dac293815..c79eabb22 100644 --- a/nodepool/tests/unit/test_launcher.py +++ b/nodepool/tests/unit/test_launcher.py @@ -144,6 +144,30 @@ class TestLauncher(tests.DBTestCase): self.assertReportedStat('nodepool.label.fake-label2.nodes.aborted', value='0', kind='g') + hostname = socket.gethostname() + # Only check for presence since other threads could have + # outstanding requests and we don't know the value. + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.client.connection_queue') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.node_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.node_cache.playback_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.request_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.request_cache.playback_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.image_cache.event_queue', + value='0', kind='g') + self.assertReportedStat(f'nodepool.launcher.{hostname}.zk' + '.image_cache.playback_queue', + value='0', kind='g') + def test_node_assignment_order(self): """Test that nodes are assigned in the order requested""" configfile = self.setup_config('node_many_labels.yaml') diff --git a/nodepool/zk/zookeeper.py b/nodepool/zk/zookeeper.py index 49372c5e3..7df0e4d75 100644 --- a/nodepool/zk/zookeeper.py +++ b/nodepool/zk/zookeeper.py @@ -1514,6 +1514,36 @@ class ZooKeeper(ZooKeeperBase): ''' self.client.resetHosts(hosts) + def reportStats(self, statsd, root_key): + ''' + Report stats using the supplied statsd object. + + :param statsd statsd: A statsd instance + :param str root_key: The root key (eg 'nodepool.launcher.hostname') + ''' + + pipeline = statsd.pipeline() + + key = f'{root_key}.zk.client.connection_queue' + pipeline.gauge(key, len(self.client.client._queue)) + + key = f'{root_key}.zk.node_cache.event_queue' + pipeline.gauge(key, self._node_cache._event_queue.qsize()) + key = f'{root_key}.zk.node_cache.playback_queue' + pipeline.gauge(key, self._node_cache._playback_queue.qsize()) + + key = f'{root_key}.zk.request_cache.event_queue' + pipeline.gauge(key, self._request_cache._event_queue.qsize()) + key = f'{root_key}.zk.request_cache.playback_queue' + pipeline.gauge(key, self._request_cache._playback_queue.qsize()) + + key = f'{root_key}.zk.image_cache.event_queue' + pipeline.gauge(key, self._image_cache._event_queue.qsize()) + key = f'{root_key}.zk.image_cache.playback_queue' + pipeline.gauge(key, self._image_cache._playback_queue.qsize()) + + pipeline.send() + @contextmanager def imageBuildLock(self, image, blocking=True, timeout=None): '''