
There are several places where it is now probably safe to use cached ids
when iterating over ZK nodes. The main reasons not to use cached ids are
race conditions, or the possibility that the tree cache missed an event
and is unaware of a node. We now have increased confidence in the accuracy
of our cache, so at least in the cases where we know races are not an
issue, we can switch to cached ids and save a ZK round trip (a possibly
significant one if there is a long list of children).

This change adds the flag in the following places (with explanations of
why it's safe):

* State machine cleanup routines: leaked instances have to show up on two
  subsequent calls to be acted upon, so this is not sensitive to timing.
* Quota calculation: if we do get the quota wrong, drivers are expected
  to handle that gracefully anyway.
* Held node cleanup: worst case is we wait until the next iteration to
  clean up.
* Stats: they're a snapshot anyway, so a cache mismatch is really just a
  shift in the snapshot time.

Change-Id: Ie7af2f62188951bf302ffdb64827d868609a1e3c
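
For the stats case, the flag is visible in updateNodeStats() below. A
minimal sketch of the call-site difference (only the cached_ids flag is
taken from this change; the surrounding loop is illustrative):

    # Default: discover node IDs by listing children in ZooKeeper,
    # a round trip that grows with the number of children.
    for node in zk_conn.nodeIterator():
        ...

    # With cached ids: reuse the IDs already held by the local tree
    # cache; appropriate where a slightly stale view is acceptable.
    for node in zk_conn.nodeIterator(cached_ids=True):
        ...
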
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""
Helper to create a statsd client from environment variables
"""

import os
import logging
import statsd

from nodepool.zk import zookeeper as zk


log = logging.getLogger("nodepool.stats")


def get_client():
    """Return a statsd client object set up from environment variables,
    or None if they are not set.
    """

    # Note: we're just being careful to let the default values fall
    # through to StatsClient().
    statsd_args = {}
    if os.getenv('STATSD_HOST', None):
        statsd_args['host'] = os.environ['STATSD_HOST']
    if os.getenv('STATSD_PORT', None):
        statsd_args['port'] = os.environ['STATSD_PORT']
    if statsd_args:
        return statsd.StatsClient(**statsd_args)
    else:
        return None
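
# Example usage (illustrative; the host value is hypothetical):
#
#     os.environ['STATSD_HOST'] = 'graphite.example.com'
#     os.environ['STATSD_PORT'] = '8125'
#     client = get_client()       # StatsClient, or None if unconfigured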


def normalize_statsd_name(name):
    return name.replace('.', '_').replace(':', '_')


class StatsReporter(object):
    '''
    Class adding statsd reporting functionality.
    '''
    def __init__(self):
        super(StatsReporter, self).__init__()
        self._statsd = get_client()

    def recordLaunchStats(self, subkey, dt):
        '''
        Record node launch statistics.

        :param str subkey: statsd key
        :param int dt: Time delta in milliseconds
        '''
        if not self._statsd:
            return

        keys = [
            'nodepool.launch.provider.%s.%s' % (
                self.provider_config.name, subkey),
            'nodepool.launch.%s' % (subkey,),
        ]

        if self.node.az:
            keys.append('nodepool.launch.provider.%s.%s.%s' %
                        (self.provider_config.name, self.node.az, subkey))

        if self.handler.request.requestor:
            # Replace '.' which is a graphite hierarchy, and ':' which is
            # a statsd delimiter.
            requestor = normalize_statsd_name(self.handler.request.requestor)
            keys.append('nodepool.launch.requestor.%s.%s' %
                        (requestor, subkey))
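
        # A statsd pipeline buffers these stats and sends them together
        # in as few packets as possible, instead of one packet per call.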
        pipeline = self._statsd.pipeline()
        for key in keys:
            pipeline.timing(key, dt)
            pipeline.incr(key)
        pipeline.send()

    def updateNodeStats(self, zk_conn):
        '''
        Refresh statistics for all known nodes.

        :param ZooKeeper zk_conn: A ZooKeeper connection object.
        '''
        if not self._statsd:
            return

        states = {}

        launcher_pools = zk_conn.getRegisteredPools()
        labels = set()
        for launcher_pool in launcher_pools:
            labels.update(launcher_pool.supported_labels)
        providers = set()
        for launcher_pool in launcher_pools:
            providers.add(launcher_pool.provider_name)

        # Initialize things we know about to zero
        for state in zk.Node.VALID_STATES:
            key = 'nodepool.nodes.%s' % state
            states[key] = 0
            for provider in providers:
                key = 'nodepool.provider.%s.nodes.%s' % (provider, state)
                states[key] = 0

        # Initialize label stats to 0
        for label in labels:
            for state in zk.Node.VALID_STATES:
                key = 'nodepool.label.%s.nodes.%s' % (label, state)
                states[key] = 0
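
        # cached_ids=True has the iterator use node IDs already held in
        # the local tree cache rather than listing children in ZooKeeper.
        # Stats are a point-in-time snapshot anyway, so a cache mismatch
        # only shifts the snapshot time.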
        for node in zk_conn.nodeIterator(cached_ids=True):
            # nodepool.nodes.STATE
            key = 'nodepool.nodes.%s' % node.state
            states[key] += 1

            # nodepool.label.LABEL.nodes.STATE
            # nodes can have several labels
            for label in node.type:
                key = 'nodepool.label.%s.nodes.%s' % (label, node.state)
                # It's possible we could see node types that aren't in our
                # config
                if key in states:
                    states[key] += 1
                else:
                    states[key] = 1

            # nodepool.provider.PROVIDER.nodes.STATE
            key = 'nodepool.provider.%s.nodes.%s' % (node.provider,
                                                     node.state)
            # It's possible we could see providers that aren't in our config
            if key in states:
                states[key] += 1
            else:
                states[key] = 1

        pipeline = self._statsd.pipeline()
        for key, count in states.items():
            pipeline.gauge(key, count)

        pipeline.send()

    def updateProviderLimits(self, provider):
        if not self._statsd:
            return

        pipeline = self._statsd.pipeline()

        # nodepool.provider.PROVIDER.max_servers
        key = 'nodepool.provider.%s.max_servers' % provider.name
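        # Sum the per-pool quotas, skipping pools where max_servers is
        # unset (or zero).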
        max_servers = sum([p.max_servers for p in provider.pools.values()
                           if p.max_servers])
        pipeline.gauge(key, max_servers)

        pipeline.send()

    def updateTenantLimits(self, tenant_limits):
        if not self._statsd:
            return

        pipeline = self._statsd.pipeline()
        # nodepool.tenant_limits.TENANT.[cores,ram,instances]
        key_template = 'nodepool.tenant_limits.%s.%s'
        for tenant, limits in tenant_limits.items():
            for k, lim in limits.items():
                # normalize for statsd name, as parts come from arbitrary
                # user config
                tenant = normalize_statsd_name(tenant)
                k = normalize_statsd_name(k)
                key = key_template % (tenant, k)
                pipeline.gauge(key, lim)
        pipeline.send()
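
Note that StatsReporter is written as a mixin: recordLaunchStats() reads
self.provider_config, self.node, and self.handler, which the inheriting
class is expected to provide. A minimal sketch of a consumer (the class
name and attribute wiring here are illustrative, not the actual launcher
code):

    class ExampleLauncher(StatsReporter):
        def __init__(self, handler, node, provider_config):
            super().__init__()
            self.handler = handler                   # provides request.requestor
            self.node = node                         # provides az
            self.provider_config = provider_config   # provides name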