swift/bin/swift-recon
Florian Hines 413ca11a5f Add sockstat info to recon.
Add's support for pulling info from /proc/net/sockstat and /proc/net/sockstat6 via recon.

Change-Id: Idb403c6eda199c5d36d96cc9027ee249c12c7d8b
2011-11-15 17:55:14 +00:00

409 lines
14 KiB
Python
Executable File

#! /usr/bin/env python
"""
cmdline utility to perform cluster reconnaissance
"""
from eventlet.green import urllib2
from swift.common.ring import Ring
import simplejson as json
from hashlib import md5
import datetime
import eventlet
import optparse
import sys
import os
VERBOSE = False
SUPPRESS_ERRORS = False
def get_devices(zone_filter, ring_file):
ring_data = Ring(ring_file)
if zone_filter:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n \
if n['zone'] == zone_filter)
else:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n)
return ips
def scout(base_url, recon_type):
global VERBOSE, SUPPRESS_ERRORS
url = base_url + recon_type
try:
body = urllib2.urlopen(url).read()
content = json.loads(body)
if VERBOSE:
print "-> %s: %s" % (url, content)
status = 200
except urllib2.HTTPError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = e.code
except urllib2.URLError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = -1
return url, content, status
def scout_md5(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "ringmd5")
return url, content, status
def scout_async(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "async")
return url, content, status
def scout_replication(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "replication")
return url, content, status
def scout_load(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "load")
return url, content, status
def scout_du(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "diskusage")
return url, content, status
def scout_umount(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "unmounted")
return url, content, status
def scout_quarantine(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "quarantined")
return url, content, status
def scout_sockstat(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "sockstat")
return url, content, status
def get_ringmd5(hosts, ringfile):
stats = {}
matches = 0
errors = 0
md5sum = md5()
with open(ringfile, 'rb') as f:
block = f.read(4096)
while block:
md5sum.update(block)
block = f.read(4096)
ring_sum = md5sum.hexdigest()
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts))
if VERBOSE:
print "-> On disk md5sum: %s" % ring_sum
for url, response, status in pool.imap(scout_md5, hosts):
if status == 200:
#fixme - need to grab from config
stats[url] = response[ringfile]
if response[ringfile] != ring_sum:
ringsmatch = False
print "!! %s (%s) doesn't match on disk md5sum" % \
(url, response[ringfile])
else:
matches = matches + 1
if VERBOSE:
print "-> %s matches." % url
else:
errors = errors + 1
print "%s/%s hosts matched, %s error[s] while checking hosts." % \
(matches, len(hosts), errors)
print "=" * 79
def async_check(hosts):
stats = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_async, hosts):
if status == 200:
stats[url] = response['async_pending']
if len(stats) > 0:
low = min(stats.values())
high = max(stats.values())
total = sum(stats.values())
average = total / len(stats)
print "Async stats: low: %d, high: %d, avg: %d, total: %d" % (low,
high, average, total)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def umount_check(hosts):
stats = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_umount, hosts):
if status == 200:
for i in response:
stats[url] = i['device']
for host in stats:
print "Not mounted: %s on %s" % (stats[host], host)
print "=" * 79
def replication_check(hosts):
stats = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking replication times on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_replication, hosts):
if status == 200:
stats[url] = response['object_replication_time']
if len(stats) > 0:
low = min(stats.values())
high = max(stats.values())
total = sum(stats.values())
average = total / len(stats)
print "[Replication Times] shortest: %s, longest: %s, avg: %s" % \
(low, high, average)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def load_check(hosts):
load1 = {}
load5 = {}
load15 = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_load, hosts):
if status == 200:
load1[url] = response['1m']
load5[url] = response['5m']
load15[url] = response['15m']
stats = {"1m": load1, "5m": load5, "15m": load15}
for item in stats:
if len(stats[item]) > 0:
low = min(stats[item].values())
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
print "[%s load average] lowest: %s, highest: %s, avg: %s" % \
(item, low, high, average)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def quarantine_check(hosts):
objq = {}
conq = {}
acctq = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_quarantine, hosts):
if status == 200:
objq[url] = response['objects']
conq[url] = response['containers']
acctq[url] = response['accounts']
stats = {"objects": objq, "containers": conq, "accounts": acctq}
for item in stats:
if len(stats[item]) > 0:
low = min(stats[item].values())
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \
(item, low, high, average, total)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def socket_usage(hosts):
inuse4 = {}
mem = {}
inuse6 = {}
timewait = {}
orphan = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_sockstat, hosts):
if status == 200:
inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes']
inuse6[url] = response['tcp6_in_use']
timewait[url] = response['time_wait']
orphan[url] = response['orphan']
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, \
"tcp6_in_use": inuse6, "time_wait": timewait, "orphan": orphan}
for item in stats:
if len(stats[item]) > 0:
low = min(stats[item].values())
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
print "[%s] low: %d, high: %d, avg: %d, total: %d" % \
(item, low, high, average, total)
else:
print "Error: No hosts or info available."
print "=" * 79
def disk_usage(hosts):
stats = {}
highs = []
lows = []
averages = []
percents = {}
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_du, hosts):
if status == 200:
hostusage = []
for entry in response:
if entry['mounted']:
used = float(entry['used']) / float(entry['size']) * 100.0
hostusage.append(round(used, 2))
stats[url] = hostusage
for url in stats:
if len(stats[url]) > 0:
#get per host hi/los for another day
low = min(stats[url])
high = max(stats[url])
total = sum(stats[url])
average = total / len(stats[url])
highs.append(high)
lows.append(low)
averages.append(average)
for percent in stats[url]:
percents[int(percent)] = percents.get(int(percent), 0) + 1
else:
print "-> %s: Error. No drive info available." % url
if len(lows) > 0:
low = min(lows)
high = max(highs)
average = sum(averages) / len(averages)
#distrib graph shamelessly stolen from https://github.com/gholt/tcod
print "Distribution Graph:"
mul = 69.0 / max(percents.values())
for percent in sorted(percents):
print '% 3d%% % 4d %s' % (percent, percents[percent], \
'*' * int(percents[percent] * mul))
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
(low, high, average)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def main():
global VERBOSE, SUPPRESS_ERRORS, swift_dir, pool
print "=" * 79
usage = '''
usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5]
'''
args = optparse.OptionParser(usage)
args.add_option('--verbose', '-v', action="store_true",
help="Print verbose info")
args.add_option('--suppress', action="store_true",
help="Suppress most connection related errors")
args.add_option('--async', '-a', action="store_true",
help="Get async stats")
args.add_option('--replication', '-r', action="store_true",
help="Get replication stats")
args.add_option('--unmounted', '-u', action="store_true",
help="Check cluster for unmounted devices")
args.add_option('--diskusage', '-d', action="store_true",
help="Get disk usage stats")
args.add_option('--loadstats', '-l', action="store_true",
help="Get cluster load average stats")
args.add_option('--quarantined', '-q', action="store_true",
help="Get cluster quarantine stats")
args.add_option('--objmd5', action="store_true",
help="Get md5sums of object.ring.gz and compare to local copy")
args.add_option('--sockstat', action="store_true",
help="Get cluster socket usage stats")
args.add_option('--all', action="store_true",
help="Perform all checks. Equivalent to -arudlq --objmd5 --sockstat")
args.add_option('--zone', '-z', type="int",
help="Only query servers in specified zone")
args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift")
options, arguments = args.parse_args()
if len(sys.argv) <= 1:
args.print_help()
swift_dir = options.swiftdir
obj_ring = os.path.join(swift_dir, 'object.ring.gz')
con_ring = os.path.join(swift_dir, 'container.ring.gz')
acct_ring = os.path.join(swift_dir, 'account.ring.gz')
VERBOSE = options.verbose
SUPPRESS_ERRORS = options.suppress
if options.zone:
hosts = get_devices(options.zone, obj_ring)
else:
hosts = get_devices(None, obj_ring)
if options.all:
async_check(hosts)
umount_check(hosts)
replication_check(hosts)
load_check(hosts)
disk_usage(hosts)
get_ringmd5(hosts, obj_ring)
quarantine_check(hosts)
socket_usage(hosts)
else:
if options.async:
async_check(hosts)
if options.unmounted:
umount_check(hosts)
if options.replication:
replication_check(hosts)
if options.loadstats:
load_check(hosts)
if options.diskusage:
disk_usage(hosts)
if options.objmd5:
get_ringmd5(hosts, obj_ring)
if options.quarantined:
quarantine_check(hosts)
if options.sockstat:
socket_usage(hosts)
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print '\n'