Rename swift-stats-* to swift-dispersion-* to avoid confusion with log stats stuff

This commit is contained in:
gholt 2011-03-31 22:32:41 +00:00
parent fdc13c42e4
commit 6c13001244
7 changed files with 437 additions and 24 deletions

152
bin/swift-dispersion-populate Executable file
View File

@ -0,0 +1,152 @@
#!/usr/bin/python -u
# Copyright (c) 2010-2011 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import traceback
from ConfigParser import ConfigParser
from cStringIO import StringIO
from sys import exit, argv
from time import time
from uuid import uuid4
from eventlet import GreenPool, patcher, sleep
from eventlet.pools import Pool
from swift.common.client import Connection, get_auth
from swift.common.ring import Ring
from swift.common.utils import compute_eta, get_time_units
def put_container(connpool, container, report):
    """Create *container* using a connection from the pool.

    Any extra attempts the client connection made are added to the
    module-global ``retries_done`` tally.  When *report* is given it is
    called with True on success; on failure it is called with False and
    the exception is re-raised.
    """
    global retries_done
    try:
        with connpool.item() as connection:
            connection.put_container(container)
            # attempts counts the initial try as well; the rest were retries
            retries_done += connection.attempts - 1
        if report is not None:
            report(True)
    except Exception:
        if report is not None:
            report(False)
        raise
def put_object(connpool, container, obj, report):
    """Upload one dispersion marker object named *obj* into *container*.

    The object's body and its ``x-object-meta-dispersion`` header are
    both just the object name.  Extra connection attempts are added to
    the module-global ``retries_done`` tally.  When *report* is given it
    is called with True on success; on failure it is called with False
    and the exception is re-raised.
    """
    global retries_done
    try:
        with connpool.item() as connection:
            connection.put_object(container, obj, StringIO(obj),
                                  headers={'x-object-meta-dispersion': obj})
            # attempts counts the initial try as well; the rest were retries
            retries_done += connection.attempts - 1
        if report is not None:
            report(True)
    except Exception:
        if report is not None:
            report(False)
        raise
def report(success):
global begun, created, item_type, next_report, need_to_create, retries_done
if not success:
traceback.print_exc()
exit('Gave up due to error(s).')
created += 1
if time() < next_report:
return
next_report = time() + 5
eta, eta_unit = compute_eta(begun, created, need_to_create)
print '\r\x1B[KCreating %s: %d of %d, %d%s left, %d retries' % (item_type,
created, need_to_create, round(eta), eta_unit, retries_done),
if __name__ == '__main__':
    # NOTE(review): ``global`` at module scope is a no-op; kept from the
    # original to advertise the names shared with report() above.
    global begun, created, item_type, next_report, need_to_create, retries_done
    # Green-thread everything before any sockets are created.
    patcher.monkey_patch()
    # Optional single argument overrides the default conf file path.
    conffile = '/etc/swift/dispersion.conf'
    if len(argv) == 2:
        conffile = argv[1]
    elif len(argv) > 2:
        exit('Syntax: %s [conffile]' % argv[0])
    c = ConfigParser()
    if not c.read(conffile):
        exit('Unable to read config file: %s' % conffile)
    conf = dict(c.items('dispersion'))
    swift_dir = conf.get('swift_dir', '/etc/swift')
    # Percentage of each ring's partitions to seed with markers.
    dispersion_coverage = int(conf.get('dispersion_coverage', 1))
    retries = int(conf.get('retries', 5))
    concurrency = int(conf.get('concurrency', 25))
    coropool = GreenPool(size=concurrency)
    retries_done = 0
    url, token = get_auth(conf['auth_url'], conf['auth_user'],
                          conf['auth_key'])
    # Account name is the last path component of the storage URL.
    account = url.rsplit('/', 1)[1]
    # Pool of preauthenticated client connections shared by the coroutines.
    connpool = Pool(max_size=concurrency)
    connpool.create = lambda: Connection(conf['auth_url'],
                                conf['auth_user'], conf['auth_key'],
                                retries=retries,
                                preauthurl=url, preauthtoken=token)
    # --- Pass 1: create one container on each target container partition ---
    container_ring = Ring(os.path.join(swift_dir, 'container.ring.gz'))
    # Partitions still needing a marker; dict used as a set for O(1) removal.
    parts_left = dict((x, x) for x in xrange(container_ring.partition_count))
    item_type = 'containers'
    created = 0
    retries_done = 0
    # Floats: coverage% of the partition count need not be integral.
    need_to_create = need_to_queue = \
        dispersion_coverage / 100.0 * container_ring.partition_count
    begun = next_report = time()
    next_report += 2
    while need_to_queue >= 1:
        # Random names until one lands on a partition not yet covered.
        container = 'dispersion_%s' % uuid4().hex
        part, _junk = container_ring.get_nodes(account, container)
        if part in parts_left:
            coropool.spawn(put_container, connpool, container, report)
            sleep()
            del parts_left[part]
            need_to_queue -= 1
    coropool.waitall()
    elapsed, elapsed_unit = get_time_units(time() - begun)
    print '\r\x1B[KCreated %d containers for dispersion reporting, %d%s, %d ' \
          'retries' % \
          (need_to_create, round(elapsed), elapsed_unit, retries_done)
    # --- Pass 2: same scheme for objects, all in one known container ---
    container = 'dispersion_objects'
    put_container(connpool, container, None)
    object_ring = Ring(os.path.join(swift_dir, 'object.ring.gz'))
    parts_left = dict((x, x) for x in xrange(object_ring.partition_count))
    item_type = 'objects'
    created = 0
    retries_done = 0
    need_to_create = need_to_queue = \
        dispersion_coverage / 100.0 * object_ring.partition_count
    begun = next_report = time()
    next_report += 2
    while need_to_queue >= 1:
        obj = 'dispersion_%s' % uuid4().hex
        part, _junk = object_ring.get_nodes(account, container, obj)
        if part in parts_left:
            coropool.spawn(put_object, connpool, container, obj, report)
            sleep()
            del parts_left[part]
            need_to_queue -= 1
    coropool.waitall()
    elapsed, elapsed_unit = get_time_units(time() - begun)
    print '\r\x1B[KCreated %d objects for dispersion reporting, %d%s, %d ' \
          'retries' % \
          (need_to_create, round(elapsed), elapsed_unit, retries_done)

249
bin/swift-dispersion-report Executable file
View File

@ -0,0 +1,249 @@
#!/usr/bin/python -u
# Copyright (c) 2010-2011 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import os
import socket
from ConfigParser import ConfigParser
from httplib import HTTPException
from optparse import OptionParser
from sys import argv, exit, stderr
from time import time
from uuid import uuid4
from eventlet import GreenPool, hubs, patcher, sleep, Timeout
from eventlet.pools import Pool
from swift.common import direct_client
from swift.common.client import ClientException, Connection, get_auth
from swift.common.ring import Ring
from swift.common.utils import compute_eta, get_time_units
unmounted = []
def get_error_log(prefix):
    """Return an error-logging closure tagged with *prefix*.

    The returned ``error_log(msg_or_exc)`` callable:
      * on an HTTP 507 (insufficient storage / unmounted device), prints
        a one-time warning per device, remembering devices already seen
        in the global ``unmounted`` list;
      * for anything that is not a 404 or 507 exception, prints a
        generic error line including *prefix*.

    :param prefix: identifying string (ip:port/device) for generic errors
    """
    def error_log(msg_or_exc):
        global unmounted
        if hasattr(msg_or_exc, 'http_status') and \
                msg_or_exc.http_status == 507:
            # BUG FIX: the format string was never applied, so every 507
            # deduplicated to the same literal and only the first unmounted
            # device was ever reported.  Build the real host:port/device key.
            identifier = '%s:%s/%s' % (msg_or_exc.http_host,
                msg_or_exc.http_port, msg_or_exc.http_device)
            if identifier not in unmounted:
                unmounted.append(identifier)
                print >>stderr, 'ERROR: %s:%s/%s is unmounted -- This will ' \
                    'cause replicas designated for that device to be ' \
                    'considered missing until resolved or the ring is ' \
                    'updated.' % (msg_or_exc.http_host, msg_or_exc.http_port,
                    msg_or_exc.http_device)
        if not hasattr(msg_or_exc, 'http_status') or \
                msg_or_exc.http_status not in (404, 507):
            print >>stderr, 'ERROR: %s: %s' % (prefix, msg_or_exc)
    return error_log
def container_dispersion_report(coropool, connpool, account, container_ring,
                                retries):
    """Query every dispersion_* container's replicas and print a report.

    :param coropool: eventlet GreenPool for concurrent HEAD requests
    :param connpool: pool of preauthenticated client Connections
    :param account: swift account holding the dispersion containers
    :param container_ring: Ring for locating container partitions
    :param retries: per-request retry count for direct_client.retry
    """
    with connpool.item() as conn:
        containers = [c['name'] for c in conn.get_account(prefix='dispersion_',
                      full_listing=True)[1]]
    containers_listed = len(containers)
    if not containers_listed:
        print >>stderr, 'No containers to query. Has ' \
            'swift-dispersion-populate been run?'
        return
    # Single-element lists act as mutable cells shared with the nested
    # direct() coroutine (pre-``nonlocal`` Python 2 idiom).
    retries_done = [0]
    containers_queried = [0]
    # Index = number of replicas found (0..3); value = partition count.
    container_copies_found = [0, 0, 0, 0]
    begun = time()
    next_report = [time() + 2]

    def direct(container, part, nodes):
        # HEAD one container on each of its primary nodes, counting how
        # many replicas actually respond.
        found_count = 0
        for node in nodes:
            error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
            try:
                attempts, _junk = direct_client.retry(
                    direct_client.direct_head_container, node,
                    part, account, container, error_log=error_log,
                    retries=retries)
                retries_done[0] += attempts - 1
                found_count += 1
            except ClientException, err:
                # 404/507 are expected "replica missing" outcomes; anything
                # else is worth a log line.
                if err.http_status not in (404, 507):
                    error_log('Giving up on /%s/%s/%s: %s' % (part, account,
                        container, err))
            except (Exception, Timeout), err:
                error_log('Giving up on /%s/%s/%s: %s' % (part, account,
                    container, err))
        container_copies_found[found_count] += 1
        containers_queried[0] += 1
        # Throttled progress/ETA line, at most every five seconds.
        if time() >= next_report[0]:
            next_report[0] = time() + 5
            eta, eta_unit = compute_eta(begun, containers_queried[0],
                containers_listed)
            print '\r\x1B[KQuerying containers: %d of %d, %d%s left, %d ' \
                'retries' % (containers_queried[0], containers_listed,
                round(eta), eta_unit, retries_done[0]),
    # Spawn one query per distinct partition (skip overlapping containers).
    container_parts = {}
    for container in containers:
        part, nodes = container_ring.get_nodes(account, container)
        if part not in container_parts:
            container_parts[part] = part
            coropool.spawn(direct, container, part, nodes)
    coropool.waitall()
    distinct_partitions = len(container_parts)
    copies_expected = distinct_partitions * container_ring.replica_count
    # Weighted sum: partitions with k copies contribute k copies each.
    copies_found = sum(a * b for a, b in enumerate(container_copies_found))
    value = 100.0 * copies_found / copies_expected
    elapsed, elapsed_unit = get_time_units(time() - begun)
    print '\r\x1B[KQueried %d containers for dispersion reporting, ' \
        '%d%s, %d retries' % (containers_listed, round(elapsed),
        elapsed_unit, retries_done[0])
    if containers_listed - distinct_partitions:
        print 'There were %d overlapping partitions' % (
            containers_listed - distinct_partitions)
    if container_copies_found[2]:
        print 'There were %d partitions missing one copy.' % \
            container_copies_found[2]
    if container_copies_found[1]:
        print '! There were %d partitions missing two copies.' % \
            container_copies_found[1]
    if container_copies_found[0]:
        print '!!! There were %d partitions missing all copies.' % \
            container_copies_found[0]
    print '%.02f%% of container copies found (%d of %d)' % (
        value, copies_found, copies_expected)
    print 'Sample represents %.02f%% of the container partition space' % (
        100.0 * distinct_partitions / container_ring.partition_count)
def object_dispersion_report(coropool, connpool, account, object_ring,
                             retries):
    """Query every dispersion_* object's replicas and print a report.

    Mirrors container_dispersion_report() but HEADs objects inside the
    fixed 'dispersion_objects' container.

    :param coropool: eventlet GreenPool for concurrent HEAD requests
    :param connpool: pool of preauthenticated client Connections
    :param account: swift account holding the dispersion objects
    :param object_ring: Ring for locating object partitions
    :param retries: per-request retry count for direct_client.retry
    """
    container = 'dispersion_objects'
    with connpool.item() as conn:
        try:
            objects = [o['name'] for o in conn.get_container(container,
                       prefix='dispersion_', full_listing=True)[1]]
        except ClientException, err:
            # 404 means the populate tool never created the container.
            if err.http_status != 404:
                raise
            print >>stderr, 'No objects to query. Has ' \
                'swift-dispersion-populate been run?'
            return
    objects_listed = len(objects)
    if not objects_listed:
        print >>stderr, 'No objects to query. Has swift-dispersion-populate ' \
            'been run?'
        return
    # Single-element lists act as mutable cells shared with the nested
    # direct() coroutine (pre-``nonlocal`` Python 2 idiom).
    retries_done = [0]
    objects_queried = [0]
    # Index = number of replicas found (0..3); value = partition count.
    object_copies_found = [0, 0, 0, 0]
    begun = time()
    next_report = [time() + 2]

    def direct(obj, part, nodes):
        # HEAD one object on each of its primary nodes, counting how many
        # replicas actually respond.
        found_count = 0
        for node in nodes:
            error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
            try:
                attempts, _junk = direct_client.retry(
                    direct_client.direct_head_object, node, part,
                    account, container, obj, error_log=error_log,
                    retries=retries)
                retries_done[0] += attempts - 1
                found_count += 1
            except ClientException, err:
                # 404/507 are expected "replica missing" outcomes.
                if err.http_status not in (404, 507):
                    error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
                        container, obj, err))
            except (Exception, Timeout), err:
                error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
                    container, obj, err))
        object_copies_found[found_count] += 1
        objects_queried[0] += 1
        # Throttled progress/ETA line, at most every five seconds.
        if time() >= next_report[0]:
            next_report[0] = time() + 5
            eta, eta_unit = compute_eta(begun, objects_queried[0],
                objects_listed)
            print '\r\x1B[KQuerying objects: %d of %d, %d%s left, %d ' \
                'retries' % (objects_queried[0], objects_listed, round(eta),
                eta_unit, retries_done[0]),
    # Spawn one query per distinct partition (skip overlapping objects).
    object_parts = {}
    for obj in objects:
        part, nodes = object_ring.get_nodes(account, container, obj)
        if part not in object_parts:
            object_parts[part] = part
            coropool.spawn(direct, obj, part, nodes)
    coropool.waitall()
    distinct_partitions = len(object_parts)
    copies_expected = distinct_partitions * object_ring.replica_count
    # Weighted sum: partitions with k copies contribute k copies each.
    copies_found = sum(a * b for a, b in enumerate(object_copies_found))
    value = 100.0 * copies_found / copies_expected
    elapsed, elapsed_unit = get_time_units(time() - begun)
    print '\r\x1B[KQueried %d objects for dispersion reporting, ' \
        '%d%s, %d retries' % (objects_listed, round(elapsed),
        elapsed_unit, retries_done[0])
    if objects_listed - distinct_partitions:
        print 'There were %d overlapping partitions' % (
            objects_listed - distinct_partitions)
    if object_copies_found[2]:
        print 'There were %d partitions missing one copy.' % \
            object_copies_found[2]
    if object_copies_found[1]:
        print '! There were %d partitions missing two copies.' % \
            object_copies_found[1]
    if object_copies_found[0]:
        print '!!! There were %d partitions missing all copies.' % \
            object_copies_found[0]
    print '%.02f%% of object copies found (%d of %d)' % \
        (value, copies_found, copies_expected)
    print 'Sample represents %.02f%% of the object partition space' % (
        100.0 * distinct_partitions / object_ring.partition_count)
if __name__ == '__main__':
    # Green-thread everything before any sockets are created; silence
    # eventlet hub exception spam (direct queries fail routinely).
    patcher.monkey_patch()
    hubs.get_hub().debug_exceptions = False
    # Optional single argument overrides the default conf file path.
    conffile = '/etc/swift/dispersion.conf'
    if len(argv) == 2:
        conffile = argv[1]
    elif len(argv) > 2:
        exit('Syntax: %s [conffile]' % argv[0])
    c = ConfigParser()
    if not c.read(conffile):
        exit('Unable to read config file: %s' % conffile)
    conf = dict(c.items('dispersion'))
    swift_dir = conf.get('swift_dir', '/etc/swift')
    # dispersion_coverage is read for conf-file parity with the populate
    # tool; the report derives its sample from what populate created.
    dispersion_coverage = int(conf.get('dispersion_coverage', 1))
    retries = int(conf.get('retries', 5))
    concurrency = int(conf.get('concurrency', 25))
    coropool = GreenPool(size=concurrency)
    url, token = get_auth(conf['auth_url'], conf['auth_user'],
                          conf['auth_key'])
    # Account name is the last path component of the storage URL.
    account = url.rsplit('/', 1)[1]
    # Pool of preauthenticated client connections shared by the coroutines.
    connpool = Pool(max_size=concurrency)
    connpool.create = lambda: Connection(conf['auth_url'],
                                conf['auth_user'], conf['auth_key'],
                                retries=retries,
                                preauthurl=url, preauthtoken=token)
    container_ring = Ring(os.path.join(swift_dir, 'container.ring.gz'))
    object_ring = Ring(os.path.join(swift_dir, 'object.ring.gz'))
    container_dispersion_report(coropool, connpool, account, container_ring,
                                retries)
    object_dispersion_report(coropool, connpool, account, object_ring, retries)

View File

@ -18,7 +18,7 @@ import os
import traceback
from ConfigParser import ConfigParser
from optparse import OptionParser
from sys import exit, argv
from sys import exit, argv, stderr
from time import time
from uuid import uuid4
@ -77,6 +77,11 @@ if __name__ == '__main__':
global begun, created, item_type, next_report, need_to_create, retries_done
patcher.monkey_patch()
print >>stderr, '''
WARNING: This command is being replaced with swift-dispersion-populate; you
should switch to that before the next Swift release.
'''
parser = OptionParser()
parser.add_option('-d', '--dispersion', action='store_true',
dest='dispersion', default=False,

View File

@ -749,6 +749,11 @@ if __name__ == '__main__':
patcher.monkey_patch()
hubs.get_hub().debug_exceptions = False
print >>stderr, '''
WARNING: This command is being replaced with swift-dispersion-report; you
should switch to that before the next Swift release.
'''
parser = OptionParser(usage='''
Usage: %prog [options] [conf_file]

View File

@ -134,9 +134,9 @@ different distro or OS, some care should be taken before using in production.
Cluster Health
--------------
There is a swift-stats-report tool for measuring overall cluster health. This
is accomplished by checking if a set of deliberately distributed containers and
objects are currently in their proper places within the cluster.
There is a swift-dispersion-report tool for measuring overall cluster health.
This is accomplished by checking if a set of deliberately distributed
containers and objects are currently in their proper places within the cluster.
For instance, a common deployment has three replicas of each object. The health
of that object can be measured by checking if each replica is in its proper
@ -153,15 +153,15 @@ to gather results.
The first thing that needs to be done to provide this health value is create a
new account solely for this usage. Next, we need to place the containers and
objects throughout the system so that they are on distinct partitions. The
swift-stats-populate tool does this by making up random container and object
names until they fall on distinct partitions. Last, and repeatedly for the life
of the cluster, we need to run the swift-stats-report tool to check the health
of each of these containers and objects.
swift-dispersion-populate tool does this by making up random container and
object names until they fall on distinct partitions. Last, and repeatedly for
the life of the cluster, we need to run the swift-dispersion-report tool to
check the health of each of these containers and objects.
These tools need direct access to the entire cluster and to the ring files
(installing them on a proxy server will probably do). Both
swift-stats-populate and swift-stats-report use the same configuration file,
/etc/swift/stats.conf. Example conf file::
swift-dispersion-populate and swift-dispersion-report use the same
configuration file, /etc/swift/dispersion.conf. Example conf file::
[dispersion]
auth_url = http://saio:11000/auth/v1.0
@ -169,17 +169,17 @@ swift-stats-populate and swift-stats-report use the same configuration file,
auth_key = testing
There are also options for the conf file for specifying the dispersion coverage
(defaults to 1%), retries, concurrency, CSV output file, etc. though usually
the defaults are fine.
(defaults to 1%), retries, concurrency, etc. though usually the defaults are
fine.
Once the configuration is in place, run `swift-stats-populate -d` to populate
Once the configuration is in place, run `swift-dispersion-populate` to populate
the containers and objects throughout the cluster.
Now that those containers and objects are in place, you can run
`swift-stats-report -d` to get a dispersion report, or the overall health of
`swift-dispersion-report` to get a dispersion report, or the overall health of
the cluster. Here is an example of a cluster in perfect health::
$ swift-stats-report -d
$ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 19s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@ -195,7 +195,7 @@ that has::
$ swift-ring-builder object.builder set_weight d0 200
$ swift-ring-builder object.builder rebalance
...
$ swift-stats-report -d
$ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 8s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@ -212,7 +212,7 @@ is much less. Next, I'll run the replicators to get everything put back into
place and then rerun the dispersion report::
... start object replicators and monitor logs until they're caught up ...
$ swift-stats-report -d
$ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 17s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@ -221,13 +221,6 @@ place and then rerun the dispersion report::
100.00% of object copies found (7857 of 7857)
Sample represents 1.00% of the object partition space
So that's a summation of how to use swift-stats-report to monitor the health of
a cluster. There are a few other things it can do, such as performance
monitoring, but those are currently in their infancy and little used. For
instance, you can run `swift-stats-populate -p` and `swift-stats-report -p` to
get performance timings (warning: the initial populate takes a while). These
timings are dumped into a CSV file (/etc/swift/stats.csv by default) and can
then be graphed to see how cluster performance is trending.
------------------------------------
Additional Cleanup Script for Swauth

View File

@ -0,0 +1,8 @@
[dispersion]
auth_url = http://saio:8080/auth/v1.0
auth_user = test:tester
auth_key = testing
# swift_dir = /etc/swift
# dispersion_coverage = 1
# retries = 5
# concurrency = 25

View File

@ -90,6 +90,7 @@ setup(
'bin/swift-object-updater', 'bin/swift-proxy-server',
'bin/swift-ring-builder', 'bin/swift-stats-populate',
'bin/swift-stats-report',
'bin/swift-dispersion-populate', 'bin/swift-dispersion-report',
'bin/swift-bench',
'bin/swift-log-uploader',
'bin/swift-log-stats-collector',