From 79ba4a85983641e539b620bd143e62673c98416e Mon Sep 17 00:00:00 2001
From: Hisashi Osanai
Date: Wed, 3 Dec 2014 06:15:16 +0900
Subject: [PATCH] Enable Object Replicator's failure count in recon

This patch adds the count of object replication failures to recon.
It also adds "failure_nodes" to the Account Replicator and the
Container Replicator.

Recon reports the count of object replication failures as follows:

$ curl http://<server>:<port>/recon/replication/object
{
    "replication_last": 1416334368.60865,
    "replication_stats": {
        "attempted": 13346,
        "failure": 870,
        "failure_nodes": {
            "192.168.0.1": {"sdb1": 3},
            "192.168.0.2": {"sdb1": 851, "sdc1": 1, "sdd1": 8},
            "192.168.0.3": {"sdb1": 3, "sdc1": 4}
        },
        "hashmatch": 0,
        "remove": 0,
        "rsync": 0,
        "start": 1416354240.9761429,
        "success": 1908
    },
    "replication_time": 2316.5563162644703,
    "object_replication_last": 1416334368.60865,
    "object_replication_time": 2316.5563162644703
}

Note that 'object_replication_last' and 'object_replication_time' are
considered to be transitional and will be removed in subsequent
releases. Use 'replication_last' and 'replication_time' instead.

Additionally, this patch adds the counts to swift-recon, where they are
shown as follows:

$ swift-recon object -r
===============================================================================
--> Starting reconnaissance on 4 hosts
===============================================================================
[2014-11-27 16:14:09] Checking on replication
[replication_failure] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%, no_result: 0, reported: 4
[replication_success] low: 3, high: 3, avg: 3.0, total: 12, Failed: 0.0%, no_result: 0, reported: 4
[replication_time] low: 0, high: 0, avg: 0.0, total: 0, Failed: 0.0%, no_result: 0, reported: 4
[replication_attempted] low: 1, high: 1, avg: 1.0, total: 4, Failed: 0.0%, no_result: 0, reported: 4
Oldest completion was 2014-11-27 16:09:45 (4 minutes ago) by 192.168.0.4:6002.
Most recent completion was 2014-11-27 16:14:19 (-10 seconds ago) by 192.168.0.1:6002.
===============================================================================

In a mixed cluster where one server runs with this patch and the other
servers run without it, running swift-recon against the patched server
reports incomplete values for [failure], [success] and [attempted],
because the unpatched servers cannot include the information this patch
needs in their responses. Therefore, once you apply this patch to one
server, apply it to the other servers as well before running
swift-recon.
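As an aside for reviewers (illustration only, not part of this patch),
the new fields can also be consumed programmatically. A minimal Python 3
sketch, assuming an object server reachable at the hypothetical address
192.168.0.1:6000; the endpoint path matches the curl example above:

    import json
    import urllib.request

    # Hypothetical host/port; substitute a real object server address.
    URL = 'http://192.168.0.1:6000/recon/replication/object'

    with urllib.request.urlopen(URL) as resp:
        data = json.load(resp)

    stats = data.get('replication_stats') or {}
    # 'failure_nodes' maps replication IP -> {device: failure count}.
    for ip, devices in sorted(stats.get('failure_nodes', {}).items()):
        for dev, count in sorted(devices.items()):
            print('%s/%s: %d failures' % (ip, dev, count))
    print('total failures: %d' % stats.get('failure', 0))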
DocImpact Change-Id: Iecd33655ae2568482833131f422679996c374d78 Co-Authored-By: Kenichiro Matsuda Co-Authored-By: Brian Cline Implements: blueprint enable-object-replication-failure-in-recon --- doc/source/admin_guide.rst | 6 +- swift/cli/recon.py | 71 ++--------- swift/common/db_replicator.py | 37 +++++- swift/common/middleware/recon.py | 16 +-- swift/obj/replicator.py | 136 ++++++++++++++++++++-- test/unit/cli/test_recon.py | 38 ------ test/unit/common/middleware/test_recon.py | 45 ++++++- test/unit/obj/test_replicator.py | 3 + 8 files changed, 222 insertions(+), 130 deletions(-) mode change 100755 => 100644 swift/cli/recon.py diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index d50efc4ef4..7d396664df 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -549,12 +549,16 @@ Request URI Description /recon/sockstat returns consumable info from /proc/net/sockstat|6 /recon/devices returns list of devices and devices dir i.e. /srv/node /recon/async returns count of async pending -/recon/replication returns object replication times (for backward compatibility) +/recon/replication returns object replication info (for backward compatibility) /recon/replication/ returns replication info for given type (account, container, object) /recon/auditor/ returns auditor stats on last reported scan for given type (account, container, object) /recon/updater/ returns last updater sweep times for given type (container, object) ========================= ======================================================================================== +Note that 'object_replication_last' and 'object_replication_time' in object +replication info are considered to be transitional and will be removed in +the subsequent releases. Use 'replication_last' and 'replication_time' instead. + This information can also be queried via the swift-recon command line utility:: fhines@ubuntu:~$ swift-recon -h diff --git a/swift/cli/recon.py b/swift/cli/recon.py old mode 100755 new mode 100644 index 79e0721c04..c405b9fb43 --- a/swift/cli/recon.py +++ b/swift/cli/recon.py @@ -460,12 +460,14 @@ class SwiftRecon(object): recon.scout, hosts): if status == 200: stats['replication_time'].append( - response.get('replication_time')) - repl_stats = response['replication_stats'] + response.get('replication_time', + response.get('object_replication_time', 0))) + repl_stats = response.get('replication_stats') if repl_stats: for stat_key in ['attempted', 'failure', 'success']: stats[stat_key].append(repl_stats.get(stat_key)) - last = response.get('replication_last', 0) + last = response.get('replication_last', + response.get('object_replication_last', 0)) if last < least_recent_time: least_recent_time = last least_recent_url = url @@ -506,62 +508,6 @@ class SwiftRecon(object): elapsed, elapsed_unit, host)) print("=" * 79) - def object_replication_check(self, hosts): - """ - Obtain and print replication statistics from object servers - - :param hosts: set of hosts to check. 
in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {} - recon = Scout("replication", self.verbose, self.suppress_errors, - self.timeout) - print("[%s] Checking on replication" % self._ptime()) - least_recent_time = 9999999999 - least_recent_url = None - most_recent_time = 0 - most_recent_url = None - for url, response, status, ts_start, ts_end in self.pool.imap( - recon.scout, hosts): - if status == 200: - stats[url] = response['object_replication_time'] - last = response.get('object_replication_last', 0) - if last < least_recent_time: - least_recent_time = last - least_recent_url = url - if last > most_recent_time: - most_recent_time = last - most_recent_url = url - times = [x for x in stats.values() if x is not None] - if len(stats) > 0 and len(times) > 0: - computed = self._gen_stats(times, 'replication_time') - if computed['reported'] > 0: - self._print_stats(computed) - else: - print("[replication_time] - No hosts returned valid data.") - else: - print("[replication_time] - No hosts returned valid data.") - if least_recent_url is not None: - host = urlparse(least_recent_url).netloc - if not least_recent_time: - print('Oldest completion was NEVER by %s.' % host) - else: - elapsed = time.time() - least_recent_time - elapsed, elapsed_unit = seconds2timeunit(elapsed) - print('Oldest completion was %s (%d %s ago) by %s.' % ( - time.strftime('%Y-%m-%d %H:%M:%S', - time.gmtime(least_recent_time)), - elapsed, elapsed_unit, host)) - if most_recent_url is not None: - host = urlparse(most_recent_url).netloc - elapsed = time.time() - most_recent_time - elapsed, elapsed_unit = seconds2timeunit(elapsed) - print('Most recent completion was %s (%d %s ago) by %s.' % ( - time.strftime('%Y-%m-%d %H:%M:%S', - time.gmtime(most_recent_time)), - elapsed, elapsed_unit, host)) - print("=" * 79) - def updater_check(self, hosts): """ Obtain and print updater statistics @@ -1072,7 +1018,7 @@ class SwiftRecon(object): if options.all: if self.server_type == 'object': self.async_check(hosts) - self.object_replication_check(hosts) + self.replication_check(hosts) self.object_auditor_check(hosts) self.updater_check(hosts) self.expirer_check(hosts) @@ -1102,10 +1048,7 @@ class SwiftRecon(object): if options.unmounted: self.umount_check(hosts) if options.replication: - if self.server_type == 'object': - self.object_replication_check(hosts) - else: - self.replication_check(hosts) + self.replication_check(hosts) if options.auditor: if self.server_type == 'object': self.object_auditor_check(hosts) diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index 151a070c07..7a6e8d549f 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -187,7 +187,8 @@ class Replicator(Daemon): self.stats = {'attempted': 0, 'success': 0, 'failure': 0, 'ts_repl': 0, 'no_change': 0, 'hashmatch': 0, 'rsync': 0, 'diff': 0, 'remove': 0, 'empty': 0, 'remote_merge': 0, - 'start': time.time(), 'diff_capped': 0} + 'start': time.time(), 'diff_capped': 0, + 'failure_nodes': {}} def _report_stats(self): """Report the current stats to the logs.""" @@ -212,6 +213,13 @@ class Replicator(Daemon): ('no_change', 'hashmatch', 'rsync', 'diff', 'ts_repl', 'empty', 'diff_capped')])) + def _add_failure_stats(self, failure_devs_info): + for node, dev in failure_devs_info: + self.stats['failure'] += 1 + failure_devs = self.stats['failure_nodes'].setdefault(node, {}) + failure_devs.setdefault(dev, 0) + failure_devs[dev] += 1 + def _rsync_file(self, db_file, remote_file, whole_file=True, 
different_region=False): """ @@ -479,7 +487,10 @@ class Replicator(Daemon): quarantine_db(broker.db_file, broker.db_type) else: self.logger.exception(_('ERROR reading db %s'), object_file) - self.stats['failure'] += 1 + nodes = self.ring.get_part_nodes(int(partition)) + self._add_failure_stats([(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in nodes]) self.logger.increment('failures') return # The db is considered deleted if the delete_timestamp value is greater @@ -494,6 +505,7 @@ class Replicator(Daemon): self.logger.timing_since('timing', start_time) return responses = [] + failure_devs_info = set() nodes = self.ring.get_part_nodes(int(partition)) local_dev = None for node in nodes: @@ -532,7 +544,8 @@ class Replicator(Daemon): self.logger.exception(_('ERROR syncing %(file)s with node' ' %(node)s'), {'file': object_file, 'node': node}) - self.stats['success' if success else 'failure'] += 1 + if not success: + failure_devs_info.add((node['replication_ip'], node['device'])) self.logger.increment('successes' if success else 'failures') responses.append(success) try: @@ -543,7 +556,17 @@ class Replicator(Daemon): if not shouldbehere and all(responses): # If the db shouldn't be on this node and has been successfully # synced to all of its peers, it can be removed. - self.delete_db(broker) + if not self.delete_db(broker): + failure_devs_info.update( + [(failure_dev['replication_ip'], failure_dev['device']) + for failure_dev in repl_nodes]) + + target_devs_info = set([(target_dev['replication_ip'], + target_dev['device']) + for target_dev in repl_nodes]) + self.stats['success'] += len(target_devs_info - failure_devs_info) + self._add_failure_stats(failure_devs_info) + self.logger.timing_since('timing', start_time) def delete_db(self, broker): @@ -558,9 +581,11 @@ class Replicator(Daemon): if err.errno not in (errno.ENOENT, errno.ENOTEMPTY): self.logger.exception( _('ERROR while trying to clean up %s') % suf_dir) + return False self.stats['remove'] += 1 device_name = self.extract_device(object_file) self.logger.increment('removes.' 
+ device_name) + return True def extract_device(self, object_file): """ @@ -592,6 +617,10 @@ class Replicator(Daemon): node['replication_port']): if self.mount_check and not ismount( os.path.join(self.root, node['device'])): + self._add_failure_stats( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in self.ring.devs if failure_dev]) self.logger.warn( _('Skipping %(device)s as it is not mounted') % node) continue diff --git a/swift/common/middleware/recon.py b/swift/common/middleware/recon.py index b7a508f921..b0d1a1a526 100644 --- a/swift/common/middleware/recon.py +++ b/swift/common/middleware/recon.py @@ -134,19 +134,19 @@ class ReconMiddleware(object): def get_replication_info(self, recon_type): """get replication info""" + replication_list = ['replication_time', + 'replication_stats', + 'replication_last'] if recon_type == 'account': - return self._from_recon_cache(['replication_time', - 'replication_stats', - 'replication_last'], + return self._from_recon_cache(replication_list, self.account_recon_cache) elif recon_type == 'container': - return self._from_recon_cache(['replication_time', - 'replication_stats', - 'replication_last'], + return self._from_recon_cache(replication_list, self.container_recon_cache) elif recon_type == 'object': - return self._from_recon_cache(['object_replication_time', - 'object_replication_last'], + replication_list += ['object_replication_time', + 'object_replication_last'] + return self._from_recon_cache(replication_list, self.object_recon_cache) else: return None diff --git a/swift/obj/replicator.py b/swift/obj/replicator.py index 70b55046cb..639c67b032 100644 --- a/swift/obj/replicator.py +++ b/swift/obj/replicator.py @@ -101,6 +101,30 @@ class ObjectReplicator(Daemon): conf.get('handoff_delete', 'auto'), 0) self._diskfile_mgr = DiskFileManager(conf, self.logger) + def _zero_stats(self): + """Zero out the stats.""" + self.stats = {'attempted': 0, 'success': 0, 'failure': 0, + 'hashmatch': 0, 'rsync': 0, 'remove': 0, + 'start': time.time(), 'failure_nodes': {}} + + def _add_failure_stats(self, failure_devs_info): + for node, dev in failure_devs_info: + self.stats['failure'] += 1 + failure_devs = self.stats['failure_nodes'].setdefault(node, {}) + failure_devs.setdefault(dev, 0) + failure_devs[dev] += 1 + + def _get_my_replication_ips(self): + my_replication_ips = set() + ips = whataremyips() + for policy in POLICIES: + self.load_object_ring(policy) + for local_dev in [dev for dev in policy.object_ring.devs + if dev and dev['replication_ip'] in ips and + dev['replication_port'] == self.port]: + my_replication_ips.add(local_dev['replication_ip']) + return list(my_replication_ips) + # Just exists for doc anchor point def sync(self, node, job, suffixes, *args, **kwargs): """ @@ -243,6 +267,7 @@ class ObjectReplicator(Daemon): self.replication_count += 1 self.logger.increment('partition.delete.count.%s' % (job['device'],)) self.headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) + failure_devs_info = set() begin = time.time() try: responses = [] @@ -251,6 +276,7 @@ class ObjectReplicator(Daemon): delete_objs = None if suffixes: for node in job['nodes']: + self.stats['rsync'] += 1 kwargs = {} if node['region'] in synced_remote_regions and \ self.conf.get('sync_method', 'rsync') == 'ssync': @@ -271,6 +297,9 @@ class ObjectReplicator(Daemon): if node['region'] != job['region']: synced_remote_regions[node['region']] = \ candidates.keys() + else: + failure_devs_info.add((node['replication_ip'], + node['device'])) 
responses.append(success) for region, cand_objs in synced_remote_regions.items(): if delete_objs is None: @@ -286,11 +315,23 @@ delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if delete_handoff: + self.stats['remove'] += 1 if (self.conf.get('sync_method', 'rsync') == 'ssync' and delete_objs is not None): self.logger.info(_("Removing %s objects"), len(delete_objs)) - self.delete_handoff_objs(job, delete_objs) + _junk, error_paths = self.delete_handoff_objs( + job, delete_objs) + # if replication for a hand-off device succeeded but the + # cleanup failed, mark the remote devices that were the + # targets of the replication from the hand-off device, + # because a failed cleanup means the replicator needs to + # replicate to them again with the same info. + if error_paths: + failure_devs_info.update( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in job['nodes']]) else: self.delete_partition(job['path']) elif not suffixes: @@ -298,14 +339,21 @@ except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: + target_devs_info = set([(target_dev['replication_ip'], + target_dev['device']) + for target_dev in job['nodes']]) + self.stats['success'] += len(target_devs_info - failure_devs_info) + self._add_failure_stats(failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) - tpool.execute(shutil.rmtree, path, ignore_errors=True) + tpool.execute(shutil.rmtree, path) def delete_handoff_objs(self, job, delete_objs): + success_paths = [] + error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job['obj_path'], job['partition'], object_hash) @@ -313,11 +361,14 @@ suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) + success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): + error_paths.append(object_path) self.logger.exception( "Unexpected error trying to cleanup suffix dir:%r", suffix_dir) + return success_paths, error_paths def update(self, job): """ @@ -328,6 +379,8 @@ self.replication_count += 1 self.logger.increment('partition.update.count.%s' % (job['device'],)) self.headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) + target_devs_info = set() + failure_devs_info = set() begin = time.time() try: hashed, local_hash = tpool_reraise( @@ -346,6 +399,7 @@ while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) + target_devs_info.add((node['replication_ip'], node['device'])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass @@ -361,12 +415,16 @@ self.logger.error(_('%(ip)s/%(device)s responded' ' as unmounted'), node) attempts_left += 1 + failure_devs_info.add((node['replication_ip'], + node['device'])) continue if resp.status != HTTP_OK: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['replication_ip']}) + failure_devs_info.add((node['replication_ip'], + node['device'])) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if @@ -374,6 +432,7 @@ local_hash[suffix] != remote_hash.get(suffix,
-1)] if not suffixes: + self.stats['hashmatch'] += 1 continue hashed, recalc_hash = tpool_reraise( self._diskfile_mgr._get_hashes, @@ -384,6 +443,7 @@ class ObjectReplicator(Daemon): suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] + self.stats['rsync'] += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( @@ -392,18 +452,26 @@ class ObjectReplicator(Daemon): '/' + '-'.join(suffixes), headers=self.headers) conn.getresponse().read() + if not success: + failure_devs_info.add((node['replication_ip'], + node['device'])) # add only remote region when replicate succeeded if success and node['region'] != job['region']: synced_remote_regions.add(node['region']) self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except (Exception, Timeout): + failure_devs_info.add((node['replication_ip'], + node['device'])) self.logger.exception(_("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): + failure_devs_info.update(target_devs_info) self.logger.exception(_("Error syncing partition")) finally: + self.stats['success'] += len(target_devs_info - failure_devs_info) + self._add_failure_stats(failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) @@ -481,6 +549,9 @@ class ObjectReplicator(Daemon): using replication style storage policy """ jobs = [] + self.all_devs_info.update( + [(dev['replication_ip'], dev['device']) + for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) for local_dev in [dev for dev in policy.object_ring.devs if (dev @@ -494,6 +565,11 @@ class ObjectReplicator(Daemon): obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): + self._add_failure_stats( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in policy.object_ring.devs + if failure_dev]) self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) @@ -508,6 +584,7 @@ class ObjectReplicator(Daemon): and partition not in override_partitions): continue + part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes( @@ -524,6 +601,17 @@ class ObjectReplicator(Daemon): partition=partition, region=local_dev['region'])) except ValueError: + if part_nodes: + self._add_failure_stats( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in nodes]) + else: + self._add_failure_stats( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in policy.object_ring.devs + if failure_dev]) continue return jobs @@ -569,19 +657,31 @@ class ObjectReplicator(Daemon): self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] + self.my_replication_ips = self._get_my_replication_ips() + self.all_devs_info = set() stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle + current_nodes = None try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) for job in jobs: + current_nodes = job['nodes'] + if override_devices and job['device'] not in override_devices: + continue + if 
override_partitions and \ + job['partition'] not in override_partitions: + continue dev_path = join(self.devices_dir, job['device']) if self.mount_check and not ismount(dev_path): + self._add_failure_stats([(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in job['nodes']]) self.logger.warn(_('%s is not mounted'), job['device']) continue if not self.check_ring(job['policy'].object_ring): @@ -603,18 +703,26 @@ class ObjectReplicator(Daemon): self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) + current_nodes = None with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): + if current_nodes: + self._add_failure_stats([(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in current_nodes]) + else: + self._add_failure_stats(self.all_devs_info) self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() + self.stats['attempted'] = self.replication_count def run_once(self, *args, **kwargs): - start = time.time() + self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) @@ -631,27 +739,35 @@ class ObjectReplicator(Daemon): override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) - total = (time.time() - start) / 60 + total = (time.time() - self.stats['start']) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): - dump_recon_cache({'object_replication_time': total, - 'object_replication_last': time.time()}, + replication_last = time.time() + dump_recon_cache({'replication_stats': self.stats, + 'replication_time': total, + 'replication_last': replication_last, + 'object_replication_time': total, + 'object_replication_last': replication_last}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: - start = time.time() + self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() - total = (time.time() - start) / 60 + total = (time.time() - self.stats['start']) / 60 self.logger.info( _("Object replication complete. 
(%.02f minutes)"), total) - dump_recon_cache({'object_replication_time': total, - 'object_replication_last': time.time()}, + replication_last = time.time() + dump_recon_cache({'replication_stats': self.stats, + 'replication_time': total, + 'replication_last': replication_last, + 'object_replication_time': total, + 'object_replication_last': replication_last}, self.rcache, self.logger) self.logger.debug('Replication sleeping for %s seconds.', self.interval) diff --git a/test/unit/cli/test_recon.py b/test/unit/cli/test_recon.py index 6559b615f0..345097c63f 100644 --- a/test/unit/cli/test_recon.py +++ b/test/unit/cli/test_recon.py @@ -578,44 +578,6 @@ class TestReconCommands(unittest.TestCase): cli.disk_usage([('127.0.0.1', 6010)], 5, 0) mock_print.assert_has_calls(expected_calls) - @mock.patch('__builtin__.print') - @mock.patch('time.time') - def test_object_replication_check(self, mock_now, mock_print): - now = 1430000000.0 - - def dummy_request(*args, **kwargs): - return [ - ('http://127.0.0.1:6010/recon/replication/object', - {"object_replication_time": 61, - "object_replication_last": now}, - 200, - 0, - 0), - ('http://127.0.0.1:6020/recon/replication/object', - {"object_replication_time": 23, - "object_replication_last": now}, - 200, - 0, - 0), - ] - - cli = recon.SwiftRecon() - cli.pool.imap = dummy_request - - default_calls = [ - mock.call('[replication_time] low: 23, high: 61, avg: 42.0, ' + - 'total: 84, Failed: 0.0%, no_result: 0, reported: 2'), - mock.call('Oldest completion was 2015-04-25 22:13:20 ' + - '(42 seconds ago) by 127.0.0.1:6010.'), - mock.call('Most recent completion was 2015-04-25 22:13:20 ' + - '(42 seconds ago) by 127.0.0.1:6010.'), - ] - - mock_now.return_value = now + 42 - cli.object_replication_check([('127.0.0.1', 6010), - ('127.0.0.1', 6020)]) - mock_print.assert_has_calls(default_calls) - @mock.patch('__builtin__.print') @mock.patch('time.time') def test_replication_check(self, mock_now, mock_print): diff --git a/test/unit/common/middleware/test_recon.py b/test/unit/common/middleware/test_recon.py index 520e2edaf6..8ea659dcaf 100644 --- a/test/unit/common/middleware/test_recon.py +++ b/test/unit/common/middleware/test_recon.py @@ -506,6 +506,9 @@ class TestReconSuccess(TestCase): "attempted": 1, "diff": 0, "diff_capped": 0, "empty": 0, "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, "no_change": 2, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 1333044050.855202, @@ -523,6 +526,9 @@ class TestReconSuccess(TestCase): "attempted": 1, "diff": 0, "diff_capped": 0, "empty": 0, "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, "no_change": 2, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 1333044050.855202, @@ -537,6 +543,9 @@ class TestReconSuccess(TestCase): "attempted": 179, "diff": 0, "diff_capped": 0, "empty": 0, "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, "no_change": 358, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 5.5, "success": 358, @@ -555,6 +564,9 @@ class TestReconSuccess(TestCase): "attempted": 179, "diff": 0, "diff_capped": 0, "empty": 0, "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, "no_change": 358, "remote_merge": 0, "remove": 0, "rsync": 0, "start": 5.5, "success": 358, @@ -562,17 +574,40 @@ class TestReconSuccess(TestCase): "replication_last": 1357969645.25}) def test_get_replication_object(self): - from_cache_response = {"object_replication_time": 
200.0, - "object_replication_last": 1357962809.15} + from_cache_response = { + "replication_time": 0.2615511417388916, + "replication_stats": { + "attempted": 179, + "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, + "remove": 0, "rsync": 0, + "start": 1333044050.855202, "success": 358}, + "replication_last": 1357969645.25, + "object_replication_time": 0.2615511417388916, + "object_replication_last": 1357969645.25} self.fakecache.fakeout_calls = [] self.fakecache.fakeout = from_cache_response rv = self.app.get_replication_info('object') self.assertEquals(self.fakecache.fakeout_calls, - [((['object_replication_time', + [((['replication_time', 'replication_stats', + 'replication_last', 'object_replication_time', 'object_replication_last'], '/var/cache/swift/object.recon'), {})]) - self.assertEquals(rv, {'object_replication_time': 200.0, - 'object_replication_last': 1357962809.15}) + self.assertEquals(rv, { + "replication_time": 0.2615511417388916, + "replication_stats": { + "attempted": 179, + "failure": 0, "hashmatch": 0, + "failure_nodes": { + "192.168.0.1": 0, + "192.168.0.2": 0}, + "remove": 0, "rsync": 0, + "start": 1333044050.855202, "success": 358}, + "replication_last": 1357969645.25, + "object_replication_time": 0.2615511417388916, + "object_replication_last": 1357969645.25}) def test_get_updater_info_container(self): from_cache_response = {"container_updater_sweep": 18.476239919662476} diff --git a/test/unit/obj/test_replicator.py b/test/unit/obj/test_replicator.py index 526ff0b7f2..d7316ab4bf 100644 --- a/test/unit/obj/test_replicator.py +++ b/test/unit/obj/test_replicator.py @@ -205,6 +205,8 @@ class TestObjectReplicator(unittest.TestCase): def _create_replicator(self): self.replicator = object_replicator.ObjectReplicator(self.conf) self.replicator.logger = self.logger + self.replicator._zero_stats() + self.replicator.all_devs_info = set() self.df_mgr = diskfile.DiskFileManager(self.conf, self.logger) def test_run_once(self): @@ -771,6 +773,7 @@ class TestObjectReplicator(unittest.TestCase): self.conf['sync_method'] = 'ssync' self.replicator = object_replicator.ObjectReplicator(self.conf) self.replicator.logger = debug_logger() + self.replicator._zero_stats() with mock.patch('swift.obj.replicator.http_connect', mock_http_connect(200)):
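A closing note on the mechanism: the account, container and object
replicators now share the same failure-accumulation scheme via their
_add_failure_stats() helpers. A standalone Python sketch of that scheme,
with invented sample (replication_ip, device) tuples:

    # Mirrors the accumulation done by _add_failure_stats() in this
    # patch; the device tuples below are made-up examples.
    stats = {'failure': 0, 'failure_nodes': {}}

    def add_failure_stats(failure_devs_info):
        # Each entry is a (replication_ip, device) pair from a failed
        # sync; counts are bucketed per node, then per device.
        for node, dev in failure_devs_info:
            stats['failure'] += 1
            failure_devs = stats['failure_nodes'].setdefault(node, {})
            failure_devs.setdefault(dev, 0)
            failure_devs[dev] += 1

    add_failure_stats([('192.168.0.2', 'sdb1'),
                       ('192.168.0.2', 'sdb1'),
                       ('192.168.0.3', 'sdc1')])
    print(stats)
    # {'failure': 3, 'failure_nodes': {'192.168.0.2': {'sdb1': 2},
    #                                  '192.168.0.3': {'sdc1': 1}}}

This nested-dict shape is exactly what surfaces as "failure_nodes" in
the recon output shown in the commit message.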