Add ZK cache stats
To observe the performance of the ZK connection and the new tree caches, add some statsd metrics for each of these. This will let us monitor queue size over time. Also, update the assertReportedStat method to output all received stats if the expected stat was not found (like Zuul). Change-Id: Ia7e1e0980fdc34007f80371ee0a77d4478948518 Depends-On: https://review.opendev.org/886552
This commit is contained in:
parent
426951af18
commit
07c83f555d
@ -688,3 +688,45 @@ generic format ``<service-type>.<method>.<operation>``. For example, the
|
||||
Since these calls reflect the internal operations of the
|
||||
``openstacksdk``, the exact keys logged may vary across providers and
|
||||
releases.
|
||||
|
||||
Internal metrics
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
The following metrics are low-level performance metrics of the
|
||||
launcher itself, primarily of interest to Nodepool developers, and are
|
||||
subject to change in the future as development needs change:
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.client.connection_queue
|
||||
:type: gauge
|
||||
|
||||
ZooKeeper client connection queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.node_cache.event_queue
|
||||
:type: gauge
|
||||
|
||||
Node cache event queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.node_cache.playback_queue
|
||||
:type: gauge
|
||||
|
||||
Node cache playback queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.request_cache.event_queue
|
||||
:type: gauge
|
||||
|
||||
Request cache event queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.request_cache.playback_queue
|
||||
:type: gauge
|
||||
|
||||
Request cache playback queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.image_cache.event_queue
|
||||
:type: gauge
|
||||
|
||||
Image cache event queue length.
|
||||
|
||||
.. zuul:stat:: nodepool.launcher.<hostname>.zk.image_cache.playback_queue
|
||||
:type: gauge
|
||||
|
||||
Image cache playback queue length.
|
||||
|
@ -1043,6 +1043,7 @@ class NodePool(threading.Thread):
|
||||
self._cleanup_thread = None
|
||||
self._delete_thread = None
|
||||
self._stats_thread = None
|
||||
self._local_stats_thread = None
|
||||
self._submittedRequests = {}
|
||||
self.ready = False
|
||||
|
||||
@ -1066,6 +1067,9 @@ class NodePool(threading.Thread):
|
||||
self._stats_thread.stop()
|
||||
self._stats_thread.join()
|
||||
|
||||
if self._local_stats_thread:
|
||||
self._local_stats_thread.join()
|
||||
|
||||
# Don't let stop() return until all pool threads have been
|
||||
# terminated.
|
||||
self.log.debug("Stopping pool threads")
|
||||
@ -1272,6 +1276,20 @@ class NodePool(threading.Thread):
|
||||
for i in range(0, need):
|
||||
createRequest(label.name)
|
||||
|
||||
def _localStats(self):
|
||||
if not self.statsd:
|
||||
self.log.info("Statsd not configured")
|
||||
return
|
||||
hostname = socket.gethostname()
|
||||
key = f'nodepool.launcher.{hostname}'
|
||||
while not self._stopped:
|
||||
try:
|
||||
if self.zk:
|
||||
self.zk.reportStats(self.statsd, key)
|
||||
except Exception:
|
||||
self.log.exception("Unable to run local stats reporting:")
|
||||
self._stop_event.wait(10)
|
||||
|
||||
def run(self):
|
||||
'''
|
||||
Start point for the NodePool thread.
|
||||
@ -1309,6 +1327,11 @@ class NodePool(threading.Thread):
|
||||
self._stats_thread = StatsWorker(self, self.stats_interval)
|
||||
self._stats_thread.start()
|
||||
|
||||
if not self._local_stats_thread:
|
||||
self._local_stats_thread = threading.Thread(
|
||||
target=self._localStats)
|
||||
self._local_stats_thread.start()
|
||||
|
||||
# Stop any PoolWorker threads if the pool was removed
|
||||
# from the config.
|
||||
pool_keys = set()
|
||||
|
@ -319,9 +319,9 @@ class BaseTestCase(testtools.TestCase):
|
||||
if value:
|
||||
self.assertNotEqual(kind, None)
|
||||
|
||||
for _ in iterate_timeout(5 * SECOND, Exception,
|
||||
"Find statsd event",
|
||||
interval=0.1):
|
||||
start = time.time()
|
||||
timeout = 5
|
||||
while time.time() <= (start + timeout):
|
||||
# Note our fake statsd just queues up results in a queue.
|
||||
# We just keep going through them until we find one that
|
||||
# matches, or fail out. If statsd pipelines are used,
|
||||
@ -358,6 +358,10 @@ class BaseTestCase(testtools.TestCase):
|
||||
# this key matches
|
||||
return True
|
||||
|
||||
stats = itertools.chain.from_iterable(
|
||||
[s.decode('utf-8').split('\n') for s in self.statsd.stats])
|
||||
for stat in stats:
|
||||
self.log.debug("Stat: %s", stat)
|
||||
raise Exception("Key %s not found in reported stats" % key)
|
||||
|
||||
|
||||
|
@ -144,6 +144,30 @@ class TestLauncher(tests.DBTestCase):
|
||||
self.assertReportedStat('nodepool.label.fake-label2.nodes.aborted',
|
||||
value='0', kind='g')
|
||||
|
||||
hostname = socket.gethostname()
|
||||
# Only check for presence since other threads could have
|
||||
# outstanding requests and we don't know the value.
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.client.connection_queue')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.node_cache.event_queue',
|
||||
value='0', kind='g')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.node_cache.playback_queue',
|
||||
value='0', kind='g')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.request_cache.event_queue',
|
||||
value='0', kind='g')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.request_cache.playback_queue',
|
||||
value='0', kind='g')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.image_cache.event_queue',
|
||||
value='0', kind='g')
|
||||
self.assertReportedStat(f'nodepool.launcher.{hostname}.zk'
|
||||
'.image_cache.playback_queue',
|
||||
value='0', kind='g')
|
||||
|
||||
def test_node_assignment_order(self):
|
||||
"""Test that nodes are assigned in the order requested"""
|
||||
configfile = self.setup_config('node_many_labels.yaml')
|
||||
|
@ -1514,6 +1514,36 @@ class ZooKeeper(ZooKeeperBase):
|
||||
'''
|
||||
self.client.resetHosts(hosts)
|
||||
|
||||
def reportStats(self, statsd, root_key):
|
||||
'''
|
||||
Report stats using the supplied statsd object.
|
||||
|
||||
:param statsd statsd: A statsd instance
|
||||
:param str root_key: The root key (eg 'nodepool.launcher.hostname')
|
||||
'''
|
||||
|
||||
pipeline = statsd.pipeline()
|
||||
|
||||
key = f'{root_key}.zk.client.connection_queue'
|
||||
pipeline.gauge(key, len(self.client.client._queue))
|
||||
|
||||
key = f'{root_key}.zk.node_cache.event_queue'
|
||||
pipeline.gauge(key, self._node_cache._event_queue.qsize())
|
||||
key = f'{root_key}.zk.node_cache.playback_queue'
|
||||
pipeline.gauge(key, self._node_cache._playback_queue.qsize())
|
||||
|
||||
key = f'{root_key}.zk.request_cache.event_queue'
|
||||
pipeline.gauge(key, self._request_cache._event_queue.qsize())
|
||||
key = f'{root_key}.zk.request_cache.playback_queue'
|
||||
pipeline.gauge(key, self._request_cache._playback_queue.qsize())
|
||||
|
||||
key = f'{root_key}.zk.image_cache.event_queue'
|
||||
pipeline.gauge(key, self._image_cache._event_queue.qsize())
|
||||
key = f'{root_key}.zk.image_cache.playback_queue'
|
||||
pipeline.gauge(key, self._image_cache._playback_queue.qsize())
|
||||
|
||||
pipeline.send()
|
||||
|
||||
@contextmanager
|
||||
def imageBuildLock(self, image, blocking=True, timeout=None):
|
||||
'''
|
||||
|
Loading…
Reference in New Issue
Block a user