nodepool/nodepool/stats.py
James E. Blair 99d2a361a1 Use cached ids in node iterator more often
There are several places where it is now probably safe to use
cached ids when iterating over ZK nodes.  The main reasons not to
use cached ids are in the case of race conditions or in case the
tree cache may have missed an event and is unaware of a node.  We
have increased confidence in the accuracy of our cache now, so at
least in the cases where we know that races are not an issue, we
can switch those to use cached ids and save a ZK round trip (a
possibly significant one if there is a long list of children).

This change adds the flag in the following places (with
explanations of why it's safe):

* State machine cleanup routines

    Leaked instances have to show up on two subsequent calls to
    be acted upon, so this is not sensitive to timing

* Quota calculation

    If we do get the quota wrong, drivers are expected to handle
    that gracefully anyway.

* Held node cleanup

    Worst case is we wait until next iteration to clean up.

* Stats

    They're a snapshot anyway, so a cache mismatch is really just
    a shift in the snapshot time.

Change-Id: Ie7af2f62188951bf302ffdb64827d868609a1e3c
2023-05-30 13:27:45 -07:00

183 lines
5.8 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Helper to create a statsd client from environment variables
"""
import os
import logging
import statsd
from nodepool.zk import zookeeper as zk
log = logging.getLogger("nodepool.stats")
def get_client():
    """Return a statsd client object setup from environment variables; or
    None if they are not set
    """
    # Only pass along values that are actually present in the
    # environment so the remaining defaults fall through to
    # StatsClient() itself.
    statsd_args = {}
    host = os.environ.get('STATSD_HOST')
    if host:
        statsd_args['host'] = host
    port = os.environ.get('STATSD_PORT')
    if port:
        statsd_args['port'] = port
    if not statsd_args:
        return None
    return statsd.StatsClient(**statsd_args)
def normalize_statsd_name(name):
    # '.' is the graphite hierarchy separator and ':' is a statsd
    # delimiter; map both to '_' in one translate pass.
    return name.translate(str.maketrans('.:', '__'))
class StatsReporter(object):
    '''
    Class adding statsd reporting functionality.

    This is a mixin: the host object is expected to supply the
    attributes referenced by the individual methods (e.g.
    ``provider_config``, ``node`` and ``handler`` for
    :meth:`recordLaunchStats`) -- confirm against the classes that mix
    this in.
    '''

    def __init__(self):
        super(StatsReporter, self).__init__()
        # None when the STATSD_* environment variables are unset; every
        # reporting method below becomes a no-op in that case.
        self._statsd = get_client()

    def recordLaunchStats(self, subkey, dt):
        '''
        Record node launch statistics.

        :param str subkey: statsd key
        :param int dt: Time delta in milliseconds
        '''
        if not self._statsd:
            return

        keys = [
            'nodepool.launch.provider.%s.%s' % (
                self.provider_config.name, subkey),
            'nodepool.launch.%s' % (subkey,),
        ]

        if self.node.az:
            keys.append('nodepool.launch.provider.%s.%s.%s' %
                        (self.provider_config.name, self.node.az, subkey))

        if self.handler.request.requestor:
            # Replace '.' which is a graphite hierarchy, and ':' which is
            # a statsd delimiter.
            requestor = normalize_statsd_name(self.handler.request.requestor)
            keys.append('nodepool.launch.requestor.%s.%s' %
                        (requestor, subkey))

        # Emit both a timer and a counter for every key in one batch.
        pipeline = self._statsd.pipeline()
        for key in keys:
            pipeline.timing(key, dt)
            pipeline.incr(key)
        pipeline.send()

    def updateNodeStats(self, zk_conn):
        '''
        Refresh statistics for all known nodes.

        :param ZooKeeper zk_conn: A ZooKeeper connection object.
        '''
        if not self._statsd:
            return

        states = {}

        # Gather the labels and providers advertised by the registered
        # launcher pools in a single pass.
        launcher_pools = zk_conn.getRegisteredPools()
        labels = set()
        providers = set()
        for launcher_pool in launcher_pools:
            labels.update(launcher_pool.supported_labels)
            providers.add(launcher_pool.provider_name)

        # Initialize things we know about to zero so gauges for
        # currently-empty states are still reported.
        for state in zk.Node.VALID_STATES:
            key = 'nodepool.nodes.%s' % state
            states[key] = 0
            for provider in providers:
                key = 'nodepool.provider.%s.nodes.%s' % (provider, state)
                states[key] = 0

        # Initialize label stats to 0
        for label in labels:
            for state in zk.Node.VALID_STATES:
                key = 'nodepool.label.%s.nodes.%s' % (label, state)
                states[key] = 0

        # Stats are a snapshot, so using cached ids is safe; a cache
        # mismatch merely shifts the snapshot time.
        for node in zk_conn.nodeIterator(cached_ids=True):
            # nodepool.nodes.STATE
            key = 'nodepool.nodes.%s' % node.state
            states[key] += 1

            # nodepool.label.LABEL.nodes.STATE
            # nodes can have several labels
            for label in node.type:
                key = 'nodepool.label.%s.nodes.%s' % (label, node.state)
                # It's possible we could see node types that aren't in
                # our config, so the key may not have been zeroed above.
                states[key] = states.get(key, 0) + 1

            # nodepool.provider.PROVIDER.nodes.STATE
            key = 'nodepool.provider.%s.nodes.%s' % (node.provider, node.state)
            # It's possible we could see providers that aren't in our config
            states[key] = states.get(key, 0) + 1

        pipeline = self._statsd.pipeline()
        for key, count in states.items():
            pipeline.gauge(key, count)
        pipeline.send()

    def updateProviderLimits(self, provider):
        '''
        Report a provider's aggregate max_servers limit as a gauge.

        :param provider: provider config object with a ``name`` and a
            ``pools`` mapping whose values carry ``max_servers``.
        '''
        if not self._statsd:
            return
        pipeline = self._statsd.pipeline()

        # nodepool.provider.PROVIDER.max_servers
        key = 'nodepool.provider.%s.max_servers' % provider.name
        # Pools with a falsy max_servers (0/None) do not contribute.
        max_servers = sum(p.max_servers for p in provider.pools.values()
                          if p.max_servers)
        pipeline.gauge(key, max_servers)
        pipeline.send()

    def updateTenantLimits(self, tenant_limits):
        '''
        Report per-tenant resource limits as gauges.

        :param dict tenant_limits: mapping of tenant name to a dict of
            limit name (e.g. cores, ram, instances) to limit value.
        '''
        if not self._statsd:
            return
        pipeline = self._statsd.pipeline()

        # nodepool.tenant_limits.TENANT.[cores,ram,instances]
        key_template = 'nodepool.tenant_limits.%s.%s'
        for tenant, limits in tenant_limits.items():
            # Normalize for statsd name, as parts come from arbitrary
            # user config.  The tenant name only needs normalizing once,
            # not once per limit.
            tenant = normalize_statsd_name(tenant)
            for k, lim in limits.items():
                k = normalize_statsd_name(k)
                pipeline.gauge(key_template % (tenant, k), lim)
        pipeline.send()