From a88b412e17a473b867faf3f76d9d662979b152c5 Mon Sep 17 00:00:00 2001 From: gholt Date: Sat, 12 Jan 2013 04:30:16 +0000 Subject: [PATCH] swift-recon: Added oldest and most recent repl I've been doing this with cluster-wide log searches for far too long. This adds support for reporting the oldest replication pass completion as well as the most recent. This is quite useful for finding those odd replicators that have hung up for some reason and need intervention. Change-Id: I7fd7260eca162d6b085f3e82aaa3cf90670f2d53 --- bin/swift-recon | 75 +++++++++++++++++++++++ swift/common/db_replicator.py | 3 +- swift/common/middleware/recon.py | 9 ++- swift/obj/replicator.py | 6 +- test/unit/common/middleware/test_recon.py | 27 +++++--- 5 files changed, 105 insertions(+), 15 deletions(-) diff --git a/bin/swift-recon b/bin/swift-recon index b72cc2841d..6cc27fb040 100755 --- a/bin/swift-recon +++ b/bin/swift-recon @@ -19,6 +19,21 @@ import sys import os +def seconds2timeunit(seconds): + elapsed = seconds + unit = 'seconds' + if elapsed >= 60: + elapsed = elapsed / 60.0 + unit = 'minutes' + if elapsed >= 60: + elapsed = elapsed / 60.0 + unit = 'hours' + if elapsed >= 24: + elapsed = elapsed / 24.0 + unit = 'days' + return elapsed, unit + + class Scout(object): """ Obtain swift recon information @@ -261,6 +276,10 @@ class SwiftRecon(object): recon = Scout("replication/%s" % self.server_type, self.verbose, self.suppress_errors, self.timeout) print "[%s] Checking on replication" % self._ptime() + least_recent_time = 9999999999 + least_recent_url = None + most_recent_time = 0 + most_recent_url = None for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: stats['replication_time'].append( @@ -269,6 +288,13 @@ class SwiftRecon(object): if repl_stats: for stat_key in ['attempted', 'failure', 'success']: stats[stat_key].append(repl_stats.get(stat_key)) + last = response.get('replication_last', 0) + if last < least_recent_time: + least_recent_time = last + 
least_recent_url = url + if last > most_recent_time: + most_recent_time = last + most_recent_url = url for k in stats: if stats[k]: if k != 'replication_time': @@ -282,6 +308,25 @@ print "[%s] - No hosts returned valid data." % k else: print "[%s] - No hosts returned valid data." % k + if least_recent_url is not None: + host = urlparse(least_recent_url).netloc + if not least_recent_time: + print 'Oldest completion was NEVER by %s.' % host + else: + elapsed = time.time() - least_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print 'Oldest completion was %s (%d %s ago) by %s.' % ( + time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(least_recent_time)), + elapsed, elapsed_unit, host) + if most_recent_url is not None: + host = urlparse(most_recent_url).netloc + elapsed = time.time() - most_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print 'Most recent completion was %s (%d %s ago) by %s.' % ( + time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(most_recent_time)), + elapsed, elapsed_unit, host) print "=" * 79 def object_replication_check(self, hosts): @@ -295,9 +340,20 @@ recon = Scout("replication", self.verbose, self.suppress_errors, self.timeout) print "[%s] Checking on replication" % self._ptime() + least_recent_time = 9999999999 + least_recent_url = None + most_recent_time = 0 + most_recent_url = None for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: stats[url] = response['object_replication_time'] + last = response.get('object_replication_last', 0) + if last < least_recent_time: + least_recent_time = last + least_recent_url = url + if last > most_recent_time: + most_recent_time = last + most_recent_url = url times = [x for x in stats.values() if x is not None] if len(stats) > 0 and len(times) > 0: computed = self._gen_stats(times, 'replication_time') @@ -307,6 +363,25 @@ print "[replication_time] - No hosts returned valid data." 
else: print "[replication_time] - No hosts returned valid data." + if least_recent_url is not None: + host = urlparse(least_recent_url).netloc + if not least_recent_time: + print 'Oldest completion was NEVER by %s.' % host + else: + elapsed = time.time() - least_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print 'Oldest completion was %s (%d %s ago) by %s.' % ( + time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(least_recent_time)), + elapsed, elapsed_unit, host) + if most_recent_url is not None: + host = urlparse(most_recent_url).netloc + elapsed = time.time() - most_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print 'Most recent completion was %s (%d %s ago) by %s.' % ( + time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(most_recent_time)), + elapsed, elapsed_unit, host) print "=" * 79 def updater_check(self, hosts): diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index ed16177ed5..72eaa920b8 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -194,7 +194,8 @@ class Replicator(Daemon): % self.stats) dump_recon_cache( {'replication_stats': self.stats, - 'replication_time': time.time() - self.stats['start']}, + 'replication_time': time.time() - self.stats['start'], + 'replication_last': time.time()}, self.rcache, self.logger) self.logger.info(' '.join(['%s:%s' % item for item in self.stats.items() if item[0] in diff --git a/swift/common/middleware/recon.py b/swift/common/middleware/recon.py index 4bcb8f6db6..aa3bd963bb 100644 --- a/swift/common/middleware/recon.py +++ b/swift/common/middleware/recon.py @@ -122,14 +122,17 @@ class ReconMiddleware(object): """get replication info""" if recon_type == 'account': return self._from_recon_cache(['replication_time', - 'replication_stats'], + 'replication_stats', + 'replication_last'], self.account_recon_cache) elif recon_type == 'container': return self._from_recon_cache(['replication_time', - 'replication_stats'], + 'replication_stats', + 
'replication_last'], self.container_recon_cache) elif recon_type == 'object': - return self._from_recon_cache(['object_replication_time'], + return self._from_recon_cache(['object_replication_time', + 'object_replication_last'], self.object_recon_cache) else: return None diff --git a/swift/obj/replicator.py b/swift/obj/replicator.py index 56cceb5b95..fcf6b66459 100644 --- a/swift/obj/replicator.py +++ b/swift/obj/replicator.py @@ -647,7 +647,8 @@ class ObjectReplicator(Daemon): self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): - dump_recon_cache({'object_replication_time': total}, + dump_recon_cache({'object_replication_time': total, + 'object_replication_last': time.time()}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): @@ -661,7 +662,8 @@ class ObjectReplicator(Daemon): total = (time.time() - start) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) - dump_recon_cache({'object_replication_time': total}, + dump_recon_cache({'object_replication_time': total, + 'object_replication_last': time.time()}, self.rcache, self.logger) self.logger.debug(_('Replication sleeping for %s seconds.'), self.run_pause) diff --git a/test/unit/common/middleware/test_recon.py b/test/unit/common/middleware/test_recon.py index 5bcc936d9b..4d12f8d318 100644 --- a/test/unit/common/middleware/test_recon.py +++ b/test/unit/common/middleware/test_recon.py @@ -388,11 +388,13 @@ class TestReconSuccess(TestCase): "remove": 0, "rsync": 0, "start": 1333044050.855202, "success": 2, "ts_repl": 0 }, - "replication_time": 0.2615511417388916} + "replication_time": 0.2615511417388916, + "replication_last": 1357969645.25} self.fakecache.fakeout = from_cache_response rv = self.app.get_replication_info('account') self.assertEquals(self.fakecache.fakeout_calls, - [((['replication_time', 'replication_stats'], + [((['replication_time', 'replication_stats', + 'replication_last'], 
'/var/cache/swift/account.recon'), {})]) self.assertEquals(rv, {"replication_stats": { "attempted": 1, "diff": 0, @@ -402,7 +404,8 @@ class TestReconSuccess(TestCase): "remove": 0, "rsync": 0, "start": 1333044050.855202, "success": 2, "ts_repl": 0 }, - "replication_time": 0.2615511417388916}) + "replication_time": 0.2615511417388916, + "replication_last": 1357969645.25}) def test_get_replication_info_container(self): from_cache_response = {"replication_time": 200.0, @@ -413,12 +416,14 @@ class TestReconSuccess(TestCase): "no_change": 358, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 5.5, "success": 358, - "ts_repl": 0}} + "ts_repl": 0}, + "replication_last": 1357969645.25} self.fakecache.fakeout_calls = [] self.fakecache.fakeout = from_cache_response rv = self.app.get_replication_info('container') self.assertEquals(self.fakecache.fakeout_calls, - [((['replication_time', 'replication_stats'], + [((['replication_time', 'replication_stats', + 'replication_last'], '/var/cache/swift/container.recon'), {})]) self.assertEquals(rv, {"replication_time": 200.0, "replication_stats": { @@ -428,17 +433,21 @@ class TestReconSuccess(TestCase): "no_change": 358, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 5.5, "success": 358, - "ts_repl": 0}}) + "ts_repl": 0}, + "replication_last": 1357969645.25}) def test_get_replication_object(self): - from_cache_response = {"object_replication_time": 200.0} + from_cache_response = {"object_replication_time": 200.0, + "object_replication_last": 1357962809.15} self.fakecache.fakeout_calls = [] self.fakecache.fakeout = from_cache_response rv = self.app.get_replication_info('object') self.assertEquals(self.fakecache.fakeout_calls, - [((['object_replication_time'], + [((['object_replication_time', + 'object_replication_last'], '/var/cache/swift/object.recon'), {})]) - self.assertEquals(rv, {'object_replication_time': 200.0}) + self.assertEquals(rv, {'object_replication_time': 200.0, + 'object_replication_last': 1357962809.15}) def 
test_get_updater_info_container(self): from_cache_response = {"container_updater_sweep": 18.476239919662476}