From c94acbcd23fc440a2a0bb7cad045557f89fd6bea Mon Sep 17 00:00:00 2001 From: John Dickinson Date: Mon, 30 Apr 2018 10:47:22 -0700 Subject: [PATCH 1/9] updated .gitreview Change-Id: I7a4b06a55c50fdb944ee08eb0e6f7f5902642b18 --- .gitreview | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitreview b/.gitreview index d7c52c0593..32ee97aa51 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=review.openstack.org port=29418 project=openstack/swift.git +defaultbranch=feature/deep-review From 9d742b85ad2e53c1782116e56284e95a07577f35 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Tue, 1 May 2018 15:12:05 +0100 Subject: [PATCH 2/9] Refactoring, test infrastructure changes and cleanup ...in preparation for the container sharding feature. Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Change-Id: I4455677abb114a645cff93cd41b394d227e805de --- swift/account/backend.py | 11 +- swift/common/db.py | 184 ++++++++++++---- swift/common/db_replicator.py | 184 +++++++++++----- swift/common/wsgi.py | 50 +++-- swift/container/backend.py | 51 ++++- swift/container/replicator.py | 40 ++-- swift/container/server.py | 74 +++---- test/__init__.py | 27 +++ test/probe/brain.py | 6 +- test/probe/common.py | 99 ++++++--- test/probe/test_object_expirer.py | 36 +--- test/unit/__init__.py | 20 ++ test/unit/account/test_server.py | 4 +- test/unit/common/test_db.py | 198 ++++++++++++++++- test/unit/common/test_db_replicator.py | 284 +++++++++++++++++++++++-- test/unit/common/test_utils.py | 9 + test/unit/common/test_wsgi.py | 13 +- test/unit/container/test_backend.py | 130 ++++++----- test/unit/container/test_server.py | 4 +- 19 files changed, 1085 insertions(+), 339 deletions(-) diff --git a/swift/account/backend.py b/swift/account/backend.py index 2734548cf0..1ff940d4f6 100644 --- a/swift/account/backend.py +++ b/swift/account/backend.py @@ -22,7 +22,7 @@ import six.moves.cPickle as pickle import sqlite3 from swift.common.utils import Timestamp 
-from swift.common.db import DatabaseBroker, utf8encode +from swift.common.db import DatabaseBroker, utf8encode, zero_like DATADIR = 'accounts' @@ -233,7 +233,7 @@ class AccountBroker(DatabaseBroker): with self.get() as conn: row = conn.execute( 'SELECT container_count from account_stat').fetchone() - return (row[0] == 0) + return zero_like(row[0]) def make_tuple_for_pickle(self, record): return (record['name'], record['put_timestamp'], @@ -254,7 +254,7 @@ class AccountBroker(DatabaseBroker): :param storage_policy_index: the storage policy for this container """ if Timestamp(delete_timestamp) > Timestamp(put_timestamp) and \ - object_count in (None, '', 0, '0'): + zero_like(object_count): deleted = 1 else: deleted = 0 @@ -273,8 +273,7 @@ class AccountBroker(DatabaseBroker): :returns: True if the DB is considered to be deleted, False otherwise """ - return status == 'DELETED' or ( - container_count in (None, '', 0, '0') and + return status == 'DELETED' or zero_like(container_count) and ( Timestamp(delete_timestamp) > Timestamp(put_timestamp)) def _is_deleted(self, conn): @@ -509,7 +508,7 @@ class AccountBroker(DatabaseBroker): record[2] = row[2] # If deleted, mark as such if Timestamp(record[2]) > Timestamp(record[1]) and \ - record[3] in (None, '', 0, '0'): + zero_like(record[3]): record[5] = 1 else: record[5] = 0 diff --git a/swift/common/db.py b/swift/common/db.py index b05eeb8d11..6425e85034 100644 --- a/swift/common/db.py +++ b/swift/common/db.py @@ -71,6 +71,18 @@ def native_str_keys(metadata): metadata[k.decode('utf-8')] = sv +ZERO_LIKE_VALUES = {None, '', 0, '0'} + + +def zero_like(count): + """ + We've cargo culted our consumers to be tolerant of various expressions of + zero in our databases for backwards compatibility with less disciplined + producers. 
+ """ + return count in ZERO_LIKE_VALUES + + def _db_timeout(timeout, db_file, call): with LockTimeout(timeout, db_file): retry_wait = 0.001 @@ -208,11 +220,27 @@ class DatabaseBroker(object): def __init__(self, db_file, timeout=BROKER_TIMEOUT, logger=None, account=None, container=None, pending_timeout=None, - stale_reads_ok=False): - """Encapsulates working with a database.""" + stale_reads_ok=False, skip_commits=False): + """Encapsulates working with a database. + + :param db_file: path to a database file. + :param timeout: timeout used for database operations. + :param logger: a logger instance. + :param account: name of account. + :param container: name of container. + :param pending_timeout: timeout used when attempting to take a lock to + write to pending file. + :param stale_reads_ok: if True then no error is raised if pending + commits cannot be committed before the database is read, otherwise + an error is raised. + :param skip_commits: if True then this broker instance will never + commit records from the pending file to the database; + :meth:`~swift.common.db.DatabaseBroker.put_record` should not + be called on brokers with skip_commits True. 
+ """ self.conn = None - self.db_file = db_file - self.pending_file = self.db_file + '.pending' + self._db_file = db_file + self.pending_file = self._db_file + '.pending' self.pending_timeout = pending_timeout or 10 self.stale_reads_ok = stale_reads_ok self.db_dir = os.path.dirname(db_file) @@ -221,6 +249,7 @@ class DatabaseBroker(object): self.account = account self.container = container self._db_version = -1 + self.skip_commits = skip_commits def __str__(self): """ @@ -240,9 +269,9 @@ class DatabaseBroker(object): :param put_timestamp: internalized timestamp of initial PUT request :param storage_policy_index: only required for containers """ - if self.db_file == ':memory:': + if self._db_file == ':memory:': tmp_db_file = None - conn = get_db_connection(self.db_file, self.timeout) + conn = get_db_connection(self._db_file, self.timeout) else: mkdirs(self.db_dir) fd, tmp_db_file = mkstemp(suffix='.tmp', dir=self.db_dir) @@ -329,15 +358,22 @@ class DatabaseBroker(object): self._delete_db(conn, timestamp) conn.commit() + @property + def db_file(self): + return self._db_file + + def get_device_path(self): + suffix_path = os.path.dirname(self.db_dir) + partition_path = os.path.dirname(suffix_path) + dbs_path = os.path.dirname(partition_path) + return os.path.dirname(dbs_path) + def quarantine(self, reason): """ The database will be quarantined and a sqlite3.DatabaseError will be raised indicating the action taken. 
""" - prefix_path = os.path.dirname(self.db_dir) - partition_path = os.path.dirname(prefix_path) - dbs_path = os.path.dirname(partition_path) - device_path = os.path.dirname(dbs_path) + device_path = self.get_device_path() quar_path = os.path.join(device_path, 'quarantined', self.db_type + 's', os.path.basename(self.db_dir)) @@ -377,6 +413,20 @@ class DatabaseBroker(object): self.quarantine(exc_hint) + @contextmanager + def updated_timeout(self, new_timeout): + """Use with "with" statement; updates ``timeout`` within the block.""" + old_timeout = self.timeout + try: + self.timeout = new_timeout + if self.conn: + self.conn.timeout = new_timeout + yield old_timeout + finally: + self.timeout = old_timeout + if self.conn: + self.conn.timeout = old_timeout + @contextmanager def get(self): """Use with the "with" statement; returns a database connection.""" @@ -477,6 +527,23 @@ class DatabaseBroker(object): with self.get() as conn: return self._is_deleted(conn) + def empty(self): + """ + Check if the broker abstraction contains any undeleted records. + """ + raise NotImplementedError() + + def is_reclaimable(self, now, reclaim_age): + """ + Check if the broker abstraction is empty, and has been marked deleted + for at least a reclaim age. + """ + info = self.get_replication_info() + return (zero_like(info['count']) and + (Timestamp(now - reclaim_age) > + Timestamp(info['delete_timestamp']) > + Timestamp(info['put_timestamp']))) + def merge_timestamps(self, created_at, put_timestamp, delete_timestamp): """ Used in replication to handle updating timestamps. 
@@ -548,13 +615,15 @@ class DatabaseBroker(object): result.append({'remote_id': row[0], 'sync_point': row[1]}) return result - def get_max_row(self): + def get_max_row(self, table=None): + if not table: + table = self.db_contains_type query = ''' SELECT SQLITE_SEQUENCE.seq FROM SQLITE_SEQUENCE WHERE SQLITE_SEQUENCE.name == '%s' LIMIT 1 - ''' % (self.db_contains_type) + ''' % (table, ) with self.get() as conn: row = conn.execute(query).fetchone() return row[0] if row else -1 @@ -582,11 +651,26 @@ class DatabaseBroker(object): return curs.fetchone() def put_record(self, record): - if self.db_file == ':memory:': + """ + Put a record into the DB. If the DB has an associated pending file with + space then the record is appended to that file and a commit to the DB + is deferred. If the DB is in-memory or its pending file is full then + the record will be committed immediately. + + :param record: a record to be added to the DB. + :raises DatabaseConnectionError: if the DB file does not exist or if + ``skip_commits`` is True. + :raises LockTimeout: if a timeout occurs while waiting to take a lock + to write to the pending file. 
+ """ + if self._db_file == ':memory:': self.merge_items([record]) return if not os.path.exists(self.db_file): raise DatabaseConnectionError(self.db_file, "DB doesn't exist") + if self.skip_commits: + raise DatabaseConnectionError(self.db_file, + 'commits not accepted') with lock_parent_directory(self.pending_file, self.pending_timeout): pending_size = 0 try: @@ -606,6 +690,10 @@ class DatabaseBroker(object): protocol=PICKLE_PROTOCOL).encode('base64')) fp.flush() + def _skip_commit_puts(self): + return (self._db_file == ':memory:' or self.skip_commits or not + os.path.exists(self.pending_file)) + def _commit_puts(self, item_list=None): """ Scan for .pending files and commit the found records by feeding them @@ -614,7 +702,13 @@ class DatabaseBroker(object): :param item_list: A list of items to commit in addition to .pending """ - if self.db_file == ':memory:' or not os.path.exists(self.pending_file): + if self._skip_commit_puts(): + if item_list: + # this broker instance should not be used to commit records, + # but if it is then raise an error rather than quietly + # discarding the records in item_list. + raise DatabaseConnectionError(self.db_file, + 'commits not accepted') return if item_list is None: item_list = [] @@ -645,7 +739,7 @@ class DatabaseBroker(object): Catch failures of _commit_puts() if broker is intended for reading of stats, and thus does not care for pending updates. """ - if self.db_file == ':memory:' or not os.path.exists(self.pending_file): + if self._skip_commit_puts(): return try: with lock_parent_directory(self.pending_file, @@ -663,6 +757,12 @@ class DatabaseBroker(object): """ raise NotImplementedError + def merge_items(self, item_list, source=None): + """ + Save :param:item_list to the database. 
+ """ + raise NotImplementedError + def make_tuple_for_pickle(self, record): """ Turn this db record dict into the format this service uses for @@ -701,7 +801,7 @@ class DatabaseBroker(object): within 512k of a boundary, it allocates to the next boundary. Boundaries are 2m, 5m, 10m, 25m, 50m, then every 50m after. """ - if not DB_PREALLOCATION or self.db_file == ':memory:': + if not DB_PREALLOCATION or self._db_file == ':memory:': return MB = (1024 * 1024) @@ -830,40 +930,46 @@ class DatabaseBroker(object): def reclaim(self, age_timestamp, sync_timestamp): """ - Delete rows from the db_contains_type table that are marked deleted - and whose created_at timestamp is < age_timestamp. Also deletes rows - from incoming_sync and outgoing_sync where the updated_at timestamp is - < sync_timestamp. + Delete reclaimable rows and metadata from the db. - In addition, this calls the DatabaseBroker's :func:`_reclaim` method. + By default this method will delete rows from the db_contains_type table + that are marked deleted and whose created_at timestamp is < + age_timestamp, and deletes rows from incoming_sync and outgoing_sync + where the updated_at timestamp is < sync_timestamp. In addition, this + calls the :meth:`_reclaim_metadata` method. + + Subclasses may reclaim other items by overriding :meth:`_reclaim`. :param age_timestamp: max created_at timestamp of object rows to delete :param sync_timestamp: max update_at timestamp of sync rows to delete """ - if self.db_file != ':memory:' and os.path.exists(self.pending_file): + if not self._skip_commit_puts(): with lock_parent_directory(self.pending_file, self.pending_timeout): self._commit_puts() with self.get() as conn: - conn.execute(''' - DELETE FROM %s WHERE deleted = 1 AND %s < ? - ''' % (self.db_contains_type, self.db_reclaim_timestamp), - (age_timestamp,)) - try: - conn.execute(''' - DELETE FROM outgoing_sync WHERE updated_at < ? 
- ''', (sync_timestamp,)) - conn.execute(''' - DELETE FROM incoming_sync WHERE updated_at < ? - ''', (sync_timestamp,)) - except sqlite3.OperationalError as err: - # Old dbs didn't have updated_at in the _sync tables. - if 'no such column: updated_at' not in str(err): - raise - DatabaseBroker._reclaim(self, conn, age_timestamp) + self._reclaim(conn, age_timestamp, sync_timestamp) + self._reclaim_metadata(conn, age_timestamp) conn.commit() - def _reclaim(self, conn, timestamp): + def _reclaim(self, conn, age_timestamp, sync_timestamp): + conn.execute(''' + DELETE FROM %s WHERE deleted = 1 AND %s < ? + ''' % (self.db_contains_type, self.db_reclaim_timestamp), + (age_timestamp,)) + try: + conn.execute(''' + DELETE FROM outgoing_sync WHERE updated_at < ? + ''', (sync_timestamp,)) + conn.execute(''' + DELETE FROM incoming_sync WHERE updated_at < ? + ''', (sync_timestamp,)) + except sqlite3.OperationalError as err: + # Old dbs didn't have updated_at in the _sync tables. + if 'no such column: updated_at' not in str(err): + raise + + def _reclaim_metadata(self, conn, timestamp): """ Removes any empty metadata values older than the timestamp using the given database connection. 
This function will not call commit on the diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index c464341b21..c27914bffb 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -33,10 +33,11 @@ from swift.common.direct_client import quote from swift.common.utils import get_logger, whataremyips, storage_directory, \ renamer, mkdirs, lock_parent_directory, config_true_value, \ unlink_older_than, dump_recon_cache, rsync_module_interpolation, \ - json, Timestamp, parse_override_options, round_robin_iter, Everything + json, parse_override_options, round_robin_iter, Everything from swift.common import ring from swift.common.ring.utils import is_local_device -from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE +from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \ + is_success from swift.common.bufferedhttp import BufferedHTTPConnection from swift.common.exceptions import DriveNotMounted from swift.common.daemon import Daemon @@ -87,11 +88,14 @@ def roundrobin_datadirs(datadirs): found (in their proper places). The partitions within each data dir are walked randomly, however. - :param datadirs: a list of (path, node_id, partition_filter) to walk - :returns: A generator of (partition, path_to_db_file, node_id) + :param datadirs: a list of tuples of (path, context, partition_filter) to + walk. The context may be any object; the context is not + used by this function but is included with each yielded + tuple. 
+ :returns: A generator of (partition, path_to_db_file, context) """ - def walk_datadir(datadir, node_id, part_filter): + def walk_datadir(datadir, context, part_filter): partitions = [pd for pd in os.listdir(datadir) if looks_like_partition(pd) and part_filter(pd)] random.shuffle(partitions) @@ -117,7 +121,7 @@ def roundrobin_datadirs(datadirs): continue object_file = os.path.join(hash_dir, hsh + '.db') if os.path.exists(object_file): - yield (partition, object_file, node_id) + yield (partition, object_file, context) else: try: os.rmdir(hash_dir) @@ -125,8 +129,8 @@ def roundrobin_datadirs(datadirs): if e.errno != errno.ENOTEMPTY: raise - its = [walk_datadir(datadir, node_id, filt) - for datadir, node_id, filt in datadirs] + its = [walk_datadir(datadir, context, filt) + for datadir, context, filt in datadirs] rr_its = round_robin_iter(its) for datadir in rr_its: @@ -312,6 +316,16 @@ class Replicator(Daemon): response = http.replicate(replicate_method, local_id) return response and 200 <= response.status < 300 + def _send_merge_items(self, http, local_id, items): + with Timeout(self.node_timeout): + response = http.replicate('merge_items', items, local_id) + if not response or not is_success(response.status): + if response: + self.logger.error('ERROR Bad response %s from %s', + response.status, http.host) + return False + return True + def _usync_db(self, point, broker, http, remote_id, local_id): """ Sync a db by sending all records since the last sync. 
@@ -326,26 +340,28 @@ class Replicator(Daemon): """ self.stats['diff'] += 1 self.logger.increment('diffs') - self.logger.debug('Syncing chunks with %s, starting at %s', - http.host, point) + self.logger.debug('%s usyncing chunks to %s, starting at row %s', + broker.db_file, + '%(ip)s:%(port)s/%(device)s' % http.node, + point) + start = time.time() sync_table = broker.get_syncs() objects = broker.get_items_since(point, self.per_diff) diffs = 0 while len(objects) and diffs < self.max_diffs: diffs += 1 - with Timeout(self.node_timeout): - response = http.replicate('merge_items', objects, local_id) - if not response or response.status >= 300 or response.status < 200: - if response: - self.logger.error(_('ERROR Bad response %(status)s from ' - '%(host)s'), - {'status': response.status, - 'host': http.host}) + if not self._send_merge_items(http, local_id, objects): return False # replication relies on db order to send the next merge batch in # order with no gaps point = objects[-1]['ROWID'] objects = broker.get_items_since(point, self.per_diff) + + self.logger.debug('%s usyncing chunks to %s, finished at row %s (%gs)', + broker.db_file, + '%(ip)s:%(port)s/%(device)s' % http.node, + point, time.time() - start) + if objects: self.logger.debug( 'Synchronization for %s has fallen more than ' @@ -449,32 +465,79 @@ class Replicator(Daemon): if rinfo.get('metadata', ''): broker.update_metadata(json.loads(rinfo['metadata'])) if self._in_sync(rinfo, info, broker, local_sync): + self.logger.debug('%s in sync with %s, nothing to do', + broker.db_file, + '%(ip)s:%(port)s/%(device)s' % node) return True - # if the difference in rowids between the two differs by - # more than 50% and the difference is greater than per_diff, - # rsync then do a remote merge. - # NOTE: difference > per_diff stops us from dropping to rsync - # on smaller containers, who have only a few rows to sync. 
- if rinfo['max_row'] / float(info['max_row']) < 0.5 and \ - info['max_row'] - rinfo['max_row'] > self.per_diff: - self.stats['remote_merge'] += 1 - self.logger.increment('remote_merges') - return self._rsync_db(broker, node, http, info['id'], - replicate_method='rsync_then_merge', - replicate_timeout=(info['count'] / 2000), - different_region=different_region) - # else send diffs over to the remote server - return self._usync_db(max(rinfo['point'], local_sync), - broker, http, rinfo['id'], info['id']) + return self._choose_replication_mode( + node, rinfo, info, local_sync, broker, http, + different_region) + return False + + def _choose_replication_mode(self, node, rinfo, info, local_sync, broker, + http, different_region): + # if the difference in rowids between the two differs by + # more than 50% and the difference is greater than per_diff, + # rsync then do a remote merge. + # NOTE: difference > per_diff stops us from dropping to rsync + # on smaller containers, who have only a few rows to sync. + if (rinfo['max_row'] / float(info['max_row']) < 0.5 and + info['max_row'] - rinfo['max_row'] > self.per_diff): + self.stats['remote_merge'] += 1 + self.logger.increment('remote_merges') + return self._rsync_db(broker, node, http, info['id'], + replicate_method='rsync_then_merge', + replicate_timeout=(info['count'] / 2000), + different_region=different_region) + # else send diffs over to the remote server + return self._usync_db(max(rinfo['point'], local_sync), + broker, http, rinfo['id'], info['id']) def _post_replicate_hook(self, broker, info, responses): """ - :param broker: the container that just replicated + :param broker: broker instance for the database that just replicated :param info: pre-replication full info dict :param responses: a list of bools indicating success from nodes """ pass + def cleanup_post_replicate(self, broker, orig_info, responses): + """ + Cleanup non primary database from disk if needed. 
+ + :param broker: the broker for the database we're replicating + :param orig_info: snapshot of the broker replication info dict taken + before replication + :param responses: a list of boolean success values for each replication + request to other nodes + + :return success: returns False if deletion of the database was + attempted but unsuccessful, otherwise returns True. + """ + log_template = 'Not deleting db %s (%%s)' % broker.db_file + max_row_delta = broker.get_max_row() - orig_info['max_row'] + if max_row_delta < 0: + reason = 'negative max_row_delta: %s' % max_row_delta + self.logger.error(log_template, reason) + return True + if max_row_delta: + reason = '%s new rows' % max_row_delta + self.logger.debug(log_template, reason) + return True + if not (responses and all(responses)): + reason = '%s/%s success' % (responses.count(True), len(responses)) + self.logger.debug(log_template, reason) + return True + # If the db has been successfully synced to all of its peers, it can be + # removed. Callers should have already checked that the db is not on a + # primary node. + if not self.delete_db(broker): + self.logger.debug( + 'Failed to delete db %s', broker.db_file) + return False + self.logger.debug('Successfully deleted db %s', broker.db_file) + return True + def _replicate_object(self, partition, object_file, node_id): """ Replicate the db, choosing method based on whether or not it @@ -483,12 +546,20 @@ class Replicator(Daemon): :param partition: partition to be replicated to :param object_file: DB file name to be replicated :param node_id: node id of the node to be replicated to + :returns: a tuple (success, responses). ``success`` is a boolean that + is True if the method completed successfully, False otherwise. + ``responses`` is a list of booleans each of which indicates the + success or not of replicating to a peer node if replication has + been attempted. 
``success`` is False if any of ``responses`` is + False; when ``responses`` is empty, ``success`` may be either True + or False. """ start_time = now = time.time() self.logger.debug('Replicating db %s', object_file) self.stats['attempted'] += 1 self.logger.increment('attempts') shouldbehere = True + responses = [] try: broker = self.brokerclass(object_file, pending_timeout=30) broker.reclaim(now - self.reclaim_age, @@ -518,18 +589,12 @@ class Replicator(Daemon): failure_dev['device']) for failure_dev in nodes]) self.logger.increment('failures') - return - # The db is considered deleted if the delete_timestamp value is greater - # than the put_timestamp, and there are no objects. - delete_timestamp = Timestamp(info.get('delete_timestamp') or 0) - put_timestamp = Timestamp(info.get('put_timestamp') or 0) - if (now - self.reclaim_age) > delete_timestamp > put_timestamp and \ - info['count'] in (None, '', 0, '0'): + return False, responses + if broker.is_reclaimable(now, self.reclaim_age): if self.report_up_to_date(info): self.delete_db(broker) self.logger.timing_since('timing', start_time) - return - responses = [] + return True, responses failure_devs_info = set() nodes = self.ring.get_part_nodes(int(partition)) local_dev = None @@ -587,14 +652,11 @@ class Replicator(Daemon): except (Exception, Timeout): self.logger.exception('UNHANDLED EXCEPTION: in post replicate ' 'hook for %s', broker.db_file) - if not shouldbehere and responses and all(responses): - # If the db shouldn't be on this node and has been successfully - # synced to all of its peers, it can be removed. 
- if not self.delete_db(broker): + if not shouldbehere: + if not self.cleanup_post_replicate(broker, info, responses): failure_devs_info.update( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in repl_nodes]) - target_devs_info = set([(target_dev['replication_ip'], target_dev['device']) for target_dev in repl_nodes]) @@ -602,6 +664,9 @@ class Replicator(Daemon): self._add_failure_stats(failure_devs_info) self.logger.timing_since('timing', start_time) + if shouldbehere: + responses.append(True) + return all(responses), responses def delete_db(self, broker): object_file = broker.db_file @@ -746,6 +811,9 @@ class ReplicatorRpc(object): self.mount_check = mount_check self.logger = logger or get_logger({}, log_route='replicator-rpc') + def _db_file_exists(self, db_path): + return os.path.exists(db_path) + def dispatch(self, replicate_args, args): if not hasattr(args, 'pop'): return HTTPBadRequest(body='Invalid object type') @@ -764,7 +832,7 @@ class ReplicatorRpc(object): # someone might be about to rsync a db to us, # make sure there's a tmp dir to receive it. 
mkdirs(os.path.join(self.root, drive, 'tmp')) - if not os.path.exists(db_file): + if not self._db_file_exists(db_file): return HTTPNotFound() return getattr(self, op)(self.broker_class(db_file), args) @@ -872,12 +940,17 @@ class ReplicatorRpc(object): renamer(old_filename, db_file) return HTTPNoContent() + def _abort_rsync_then_merge(self, db_file, tmp_filename): + return not (self._db_file_exists(db_file) and + os.path.exists(tmp_filename)) + def rsync_then_merge(self, drive, db_file, args): - old_filename = os.path.join(self.root, drive, 'tmp', args[0]) - if not os.path.exists(db_file) or not os.path.exists(old_filename): + tmp_filename = os.path.join(self.root, drive, 'tmp', args[0]) + if self._abort_rsync_then_merge(db_file, tmp_filename): return HTTPNotFound() - new_broker = self.broker_class(old_filename) + new_broker = self.broker_class(tmp_filename) existing_broker = self.broker_class(db_file) + db_file = existing_broker.db_file point = -1 objects = existing_broker.get_items_since(point, 1000) while len(objects): @@ -885,9 +958,12 @@ class ReplicatorRpc(object): point = objects[-1]['ROWID'] objects = existing_broker.get_items_since(point, 1000) sleep() + new_broker.merge_syncs(existing_broker.get_syncs()) new_broker.newid(args[0]) new_broker.update_metadata(existing_broker.metadata) - renamer(old_filename, db_file) + if self._abort_rsync_then_merge(db_file, tmp_filename): + return HTTPNotFound() + renamer(tmp_filename, db_file) return HTTPNoContent() # Footnote [1]: diff --git a/swift/common/wsgi.py b/swift/common/wsgi.py index 752e8767aa..2a9409d92e 100644 --- a/swift/common/wsgi.py +++ b/swift/common/wsgi.py @@ -45,6 +45,9 @@ from swift.common.utils import capture_stdio, disable_fallocate, \ validate_configuration, get_hub, config_auto_int_value, \ reiterate +SIGNUM_TO_NAME = {getattr(signal, n): n for n in dir(signal) + if n.startswith('SIG') and '_' not in n} + # Set maximum line size of message headers to be accepted. 
wsgi.MAX_HEADER_LINE = constraints.MAX_HEADER_SIZE @@ -559,7 +562,8 @@ class WorkersStrategy(object): :param int pid: The new worker process' PID """ - self.logger.notice('Started child %s' % pid) + self.logger.notice('Started child %s from parent %s', + pid, os.getpid()) self.children.append(pid) def register_worker_exit(self, pid): @@ -569,7 +573,8 @@ class WorkersStrategy(object): :param int pid: The PID of the worker that exited. """ - self.logger.error('Removing dead child %s' % pid) + self.logger.error('Removing dead child %s from parent %s', + pid, os.getpid()) self.children.remove(pid) def shutdown_sockets(self): @@ -935,24 +940,17 @@ def run_wsgi(conf_path, app_section, *args, **kwargs): run_server(conf, logger, no_fork_sock, global_conf=global_conf) return 0 - def kill_children(*args): - """Kills the entire process group.""" - logger.error('SIGTERM received') - signal.signal(signal.SIGTERM, signal.SIG_IGN) - running[0] = False - os.killpg(0, signal.SIGTERM) + def stop_with_signal(signum, *args): + """Set running flag to False and capture the signum""" + running_context[0] = False + running_context[1] = signum - def hup(*args): - """Shuts down the server, but allows running requests to complete""" - logger.error('SIGHUP received') - signal.signal(signal.SIGHUP, signal.SIG_IGN) - running[0] = False + # context to hold boolean running state and stop signum + running_context = [True, None] + signal.signal(signal.SIGTERM, stop_with_signal) + signal.signal(signal.SIGHUP, stop_with_signal) - running = [True] - signal.signal(signal.SIGTERM, kill_children) - signal.signal(signal.SIGHUP, hup) - - while running[0]: + while running_context[0]: for sock, sock_info in strategy.new_worker_socks(): pid = os.fork() if pid == 0: @@ -992,11 +990,23 @@ def run_wsgi(conf_path, app_section, *args, **kwargs): sleep(0.01) except KeyboardInterrupt: logger.notice('User quit') - running[0] = False + running_context[0] = False break + if running_context[1] is not None: + try: + 
signame = SIGNUM_TO_NAME[running_context[1]] + except KeyError: + logger.error('Stopping with unexpected signal %r' % + running_context[1]) + else: + logger.error('%s received', signame) + if running_context[1] == signal.SIGTERM: + os.killpg(0, signal.SIGTERM) + strategy.shutdown_sockets() - logger.notice('Exited') + signal.signal(signal.SIGTERM, signal.SIG_IGN) + logger.notice('Exited (%s)', os.getpid()) return 0 diff --git a/swift/container/backend.py b/swift/container/backend.py index bab618286a..c61c633739 100644 --- a/swift/container/backend.py +++ b/swift/container/backend.py @@ -15,7 +15,6 @@ """ Pluggable Back-ends for Container Server """ - import os from uuid import uuid4 @@ -25,9 +24,9 @@ from six.moves import range import sqlite3 from swift.common.utils import Timestamp, encode_timestamps, \ - decode_timestamps, extract_swift_bytes -from swift.common.db import DatabaseBroker, utf8encode - + decode_timestamps, extract_swift_bytes, storage_directory, hash_path +from swift.common.db import DatabaseBroker, utf8encode, \ + zero_like, DatabaseAlreadyExists SQLITE_ARG_LIMIT = 999 @@ -227,6 +226,35 @@ class ContainerBroker(DatabaseBroker): db_contains_type = 'object' db_reclaim_timestamp = 'created_at' + @classmethod + def create_broker(self, device_path, part, account, container, logger=None, + put_timestamp=None, storage_policy_index=None): + """ + Create a ContainerBroker instance. If the db doesn't exist, initialize + the db file. 
+ + :param device_path: device path + :param part: partition number + :param account: account name string + :param container: container name string + :param logger: a logger instance + :param put_timestamp: initial timestamp if broker needs to be + initialized + :param storage_policy_index: the storage policy index + :return: a :class:`swift.container.backend.ContainerBroker` instance + """ + hsh = hash_path(account, container) + db_dir = storage_directory(DATADIR, part, hsh) + db_path = os.path.join(device_path, db_dir, hsh + '.db') + broker = ContainerBroker(db_path, account=account, container=container, + logger=logger) + if not os.path.exists(broker.db_file): + try: + broker.initialize(put_timestamp, storage_policy_index) + except DatabaseAlreadyExists: + pass + return broker + @property def storage_policy_index(self): if not hasattr(self, '_storage_policy_index'): @@ -401,7 +429,7 @@ class ContainerBroker(DatabaseBroker): raise row = conn.execute( 'SELECT object_count from container_stat').fetchone() - return (row[0] == 0) + return zero_like(row[0]) def delete_object(self, name, timestamp, storage_policy_index=0): """ @@ -457,7 +485,7 @@ class ContainerBroker(DatabaseBroker): # The container is considered deleted if the delete_timestamp # value is greater than the put_timestamp, and there are no # objects in the container. 
- return (object_count in (None, '', 0, '0')) and ( + return zero_like(object_count) and ( Timestamp(delete_timestamp) > Timestamp(put_timestamp)) def _is_deleted(self, conn): @@ -473,6 +501,17 @@ class ContainerBroker(DatabaseBroker): FROM container_stat''').fetchone() return self._is_deleted_info(**info) + def is_reclaimable(self, now, reclaim_age): + with self.get() as conn: + info = conn.execute(''' + SELECT put_timestamp, delete_timestamp + FROM container_stat''').fetchone() + if (Timestamp(now - reclaim_age) > + Timestamp(info['delete_timestamp']) > + Timestamp(info['put_timestamp'])): + return self.empty() + return False + def get_info_is_deleted(self): """ Get the is_deleted status and info for the container. diff --git a/swift/container/replicator.py b/swift/container/replicator.py index 41c048716d..9f3fdb53c7 100644 --- a/swift/container/replicator.py +++ b/swift/container/replicator.py @@ -28,9 +28,7 @@ from swift.common import db_replicator from swift.common.storage_policy import POLICIES from swift.common.exceptions import DeviceUnavailable from swift.common.http import is_success -from swift.common.db import DatabaseAlreadyExists -from swift.common.utils import (Timestamp, hash_path, - storage_directory, majority_size) +from swift.common.utils import Timestamp, majority_size class ContainerReplicator(db_replicator.Replicator): @@ -39,6 +37,10 @@ class ContainerReplicator(db_replicator.Replicator): datadir = DATADIR default_port = 6201 + def __init__(self, conf, logger=None): + super(ContainerReplicator, self).__init__(conf, logger=logger) + self.reconciler_cleanups = self.sync_store = None + def report_up_to_date(self, full_info): reported_key_map = { 'reported_put_timestamp': 'put_timestamp', @@ -61,8 +63,7 @@ class ContainerReplicator(db_replicator.Replicator): return sync_args def _handle_sync_response(self, node, response, info, broker, http, - different_region): - parent = super(ContainerReplicator, self) + different_region=False): if 
is_success(response.status): remote_info = json.loads(response.data) if incorrect_policy_index(info, remote_info): @@ -75,9 +76,8 @@ class ContainerReplicator(db_replicator.Replicator): if any(info[key] != remote_info[key] for key in sync_timestamps): broker.merge_timestamps(*(remote_info[key] for key in sync_timestamps)) - rv = parent._handle_sync_response( + return super(ContainerReplicator, self)._handle_sync_response( node, response, info, broker, http, different_region) - return rv def find_local_handoff_for_part(self, part): """ @@ -114,15 +114,10 @@ class ContainerReplicator(db_replicator.Replicator): raise DeviceUnavailable( 'No mounted devices found suitable to Handoff reconciler ' 'container %s in partition %s' % (container, part)) - hsh = hash_path(account, container) - db_dir = storage_directory(DATADIR, part, hsh) - db_path = os.path.join(self.root, node['device'], db_dir, hsh + '.db') - broker = ContainerBroker(db_path, account=account, container=container) - if not os.path.exists(broker.db_file): - try: - broker.initialize(timestamp, 0) - except DatabaseAlreadyExists: - pass + broker = ContainerBroker.create_broker( + os.path.join(self.root, node['device']), part, account, container, + logger=self.logger, put_timestamp=timestamp, + storage_policy_index=0) if self.reconciler_containers is not None: self.reconciler_containers[container] = part, broker, node['id'] return broker @@ -217,12 +212,13 @@ class ContainerReplicator(db_replicator.Replicator): # this container shouldn't be here, make sure it's cleaned up self.reconciler_cleanups[broker.container] = broker return - try: - # DB is going to get deleted. Be preemptive about it - self.sync_store.remove_synced_container(broker) - except Exception: - self.logger.exception('Failed to remove sync_store entry %s' % - broker.db_file) + if self.sync_store: + try: + # DB is going to get deleted. 
Be preemptive about it + self.sync_store.remove_synced_container(broker) + except Exception: + self.logger.exception('Failed to remove sync_store entry %s' % + broker.db_file) return super(ContainerReplicator, self).delete_db(broker) diff --git a/swift/container/server.py b/swift/container/server.py index c7df07ac8e..a3c233b664 100644 --- a/swift/container/server.py +++ b/swift/container/server.py @@ -343,6 +343,40 @@ class ContainerController(BaseStorageServer): broker.update_status_changed_at(timestamp) return recreated + def _maybe_autocreate(self, broker, req_timestamp, account, + policy_index): + created = False + if account.startswith(self.auto_create_account_prefix) and \ + not os.path.exists(broker.db_file): + if policy_index is None: + raise HTTPBadRequest( + 'X-Backend-Storage-Policy-Index header is required') + try: + broker.initialize(req_timestamp.internal, policy_index) + except DatabaseAlreadyExists: + pass + else: + created = True + if not os.path.exists(broker.db_file): + raise HTTPNotFound() + return created + + def _update_metadata(self, req, broker, req_timestamp, method): + metadata = {} + metadata.update( + (key, (value, req_timestamp.internal)) + for key, value in req.headers.items() + if key.lower() in self.save_headers or + is_sys_or_user_meta('container', key)) + if metadata: + if 'X-Container-Sync-To' in metadata: + if 'X-Container-Sync-To' not in broker.metadata or \ + metadata['X-Container-Sync-To'][0] != \ + broker.metadata['X-Container-Sync-To'][0]: + broker.set_x_container_sync_points(-1, -1) + broker.update_metadata(metadata, validate_metadata=True) + self._update_sync_store(broker, method) + @public @timing_stats() def PUT(self, req): @@ -364,14 +398,8 @@ class ContainerController(BaseStorageServer): # obj put expects the policy_index header, default is for # legacy support during upgrade. 
obj_policy_index = requested_policy_index or 0 - if account.startswith(self.auto_create_account_prefix) and \ - not os.path.exists(broker.db_file): - try: - broker.initialize(req_timestamp.internal, obj_policy_index) - except DatabaseAlreadyExists: - pass - if not os.path.exists(broker.db_file): - return HTTPNotFound() + self._maybe_autocreate(broker, req_timestamp, account, + obj_policy_index) broker.put_object(obj, req_timestamp.internal, int(req.headers['x-size']), req.headers['x-content-type'], @@ -391,20 +419,7 @@ class ContainerController(BaseStorageServer): req_timestamp.internal, new_container_policy, requested_policy_index) - metadata = {} - metadata.update( - (key, (value, req_timestamp.internal)) - for key, value in req.headers.items() - if key.lower() in self.save_headers or - is_sys_or_user_meta('container', key)) - if 'X-Container-Sync-To' in metadata: - if 'X-Container-Sync-To' not in broker.metadata or \ - metadata['X-Container-Sync-To'][0] != \ - broker.metadata['X-Container-Sync-To'][0]: - broker.set_x_container_sync_points(-1, -1) - broker.update_metadata(metadata, validate_metadata=True) - if metadata: - self._update_sync_store(broker, 'PUT') + self._update_metadata(req, broker, req_timestamp, 'PUT') resp = self.account_update(req, account, container, broker) if resp: return resp @@ -562,20 +577,7 @@ class ContainerController(BaseStorageServer): if broker.is_deleted(): return HTTPNotFound(request=req) broker.update_put_timestamp(req_timestamp.internal) - metadata = {} - metadata.update( - (key, (value, req_timestamp.internal)) - for key, value in req.headers.items() - if key.lower() in self.save_headers or - is_sys_or_user_meta('container', key)) - if metadata: - if 'X-Container-Sync-To' in metadata: - if 'X-Container-Sync-To' not in broker.metadata or \ - metadata['X-Container-Sync-To'][0] != \ - broker.metadata['X-Container-Sync-To'][0]: - broker.set_x_container_sync_points(-1, -1) - broker.update_metadata(metadata, validate_metadata=True) - 
self._update_sync_store(broker, 'POST') + self._update_metadata(req, broker, req_timestamp, 'POST') return HTTPNoContent(request=req) def __call__(self, env, start_response): diff --git a/test/__init__.py b/test/__init__.py index 1a56597158..51e3aa9d82 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -17,7 +17,11 @@ # The code below enables nosetests to work with i18n _() blocks from __future__ import print_function import sys +from contextlib import contextmanager + import os +from six import reraise + try: from unittest.util import safe_repr except ImportError: @@ -86,3 +90,26 @@ def listen_zero(): sock.bind(("127.0.0.1", 0)) sock.listen(50) return sock + + +@contextmanager +def annotate_failure(msg): + """ + Catch AssertionError and annotate it with a message. Useful when making + assertions in a loop where the message can indicate the loop index or + richer context about the failure. + + :param msg: A message to be prefixed to the AssertionError message. + """ + try: + yield + except AssertionError as err: + err_typ, err_val, err_tb = sys.exc_info() + if err_val.args: + msg = '%s Failed with %s' % (msg, err_val.args[0]) + err_val.args = (msg, ) + err_val.args[1:] + else: + # workaround for some IDE's raising custom AssertionErrors + err_val = '%s Failed with %s' % (msg, err) + err_typ = AssertionError + reraise(err_typ, err_val, err_tb) diff --git a/test/probe/brain.py b/test/probe/brain.py index 843754210e..fd597cf6b3 100644 --- a/test/probe/brain.py +++ b/test/probe/brain.py @@ -99,9 +99,11 @@ class BrainSplitter(object): raise ValueError('Unknown server_type: %r' % server_type) self.server_type = server_type - part, nodes = self.ring.get_nodes(self.account, c, o) + self.part, self.nodes = self.ring.get_nodes(self.account, c, o) + + node_ids = [n['id'] for n in self.nodes] + self.node_numbers = [n + 1 for n in node_ids] - node_ids = [n['id'] for n in nodes] if all(n_id in node_ids for n_id in (0, 1)): self.primary_numbers = (1, 2) self.handoff_numbers 
= (3, 4) diff --git a/test/probe/common.py b/test/probe/common.py index ccb5751f26..5622d71b64 100644 --- a/test/probe/common.py +++ b/test/probe/common.py @@ -14,6 +14,8 @@ # limitations under the License. from __future__ import print_function + +import errno import os from subprocess import Popen, PIPE import sys @@ -125,13 +127,17 @@ def kill_server(ipport, ipport2server): if err: raise Exception('unable to kill %s' % (server if not number else '%s%s' % (server, number))) + return wait_for_server_to_hangup(ipport) + + +def wait_for_server_to_hangup(ipport): try_until = time() + 30 while True: try: conn = HTTPConnection(*ipport) conn.request('GET', '/') conn.getresponse() - except Exception as err: + except Exception: break if time() > try_until: raise Exception( @@ -334,33 +340,35 @@ class ProbeTest(unittest.TestCase): Don't instantiate this directly, use a child class instead. """ + def _load_rings_and_configs(self): + self.ipport2server = {} + self.configs = defaultdict(dict) + self.account_ring = get_ring( + 'account', + self.acct_cont_required_replicas, + self.acct_cont_required_devices, + ipport2server=self.ipport2server, + config_paths=self.configs) + self.container_ring = get_ring( + 'container', + self.acct_cont_required_replicas, + self.acct_cont_required_devices, + ipport2server=self.ipport2server, + config_paths=self.configs) + self.policy = get_policy(**self.policy_requirements) + self.object_ring = get_ring( + self.policy.ring_name, + self.obj_required_replicas, + self.obj_required_devices, + server='object', + ipport2server=self.ipport2server, + config_paths=self.configs) + def setUp(self): resetswift() kill_orphans() + self._load_rings_and_configs() try: - self.ipport2server = {} - self.configs = defaultdict(dict) - self.account_ring = get_ring( - 'account', - self.acct_cont_required_replicas, - self.acct_cont_required_devices, - ipport2server=self.ipport2server, - config_paths=self.configs) - self.container_ring = get_ring( - 'container', - 
self.acct_cont_required_replicas, - self.acct_cont_required_devices, - ipport2server=self.ipport2server, - config_paths=self.configs) - self.policy = get_policy(**self.policy_requirements) - self.object_ring = get_ring( - self.policy.ring_name, - self.obj_required_replicas, - self.obj_required_devices, - server='object', - ipport2server=self.ipport2server, - config_paths=self.configs) - self.servers_per_port = any( int(readconf(c, section_name='object-replicator').get( 'servers_per_port', '0')) @@ -489,6 +497,49 @@ class ProbeTest(unittest.TestCase): finally: shutil.rmtree(tempdir) + def get_all_object_nodes(self): + """ + Returns a list of all nodes in all object storage policies. + + :return: a list of node dicts. + """ + all_obj_nodes = {} + for policy in ENABLED_POLICIES: + for dev in policy.object_ring.devs: + all_obj_nodes[dev['device']] = dev + return all_obj_nodes.values() + + def gather_async_pendings(self, onodes): + """ + Returns a list of paths to async pending files found on given nodes. + + :param onodes: a list of nodes. + :return: a list of file paths. + """ + async_pendings = [] + for onode in onodes: + device_dir = self.device_dir('', onode) + for ap_pol_dir in os.listdir(device_dir): + if not ap_pol_dir.startswith('async_pending'): + # skip 'objects', 'containers', etc. 
+ continue + async_pending_dir = os.path.join(device_dir, ap_pol_dir) + try: + ap_dirs = os.listdir(async_pending_dir) + except OSError as err: + if err.errno == errno.ENOENT: + pass + else: + raise + else: + for ap_dir in ap_dirs: + ap_dir_fullpath = os.path.join( + async_pending_dir, ap_dir) + async_pendings.extend([ + os.path.join(ap_dir_fullpath, ent) + for ent in os.listdir(ap_dir_fullpath)]) + return async_pendings + class ReplProbeTest(ProbeTest): diff --git a/test/probe/test_object_expirer.py b/test/probe/test_object_expirer.py index 92642f19d6..ad31662730 100644 --- a/test/probe/test_object_expirer.py +++ b/test/probe/test_object_expirer.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import errno -import os import random import time import uuid @@ -143,31 +141,6 @@ class TestObjectExpirer(ReplProbeTest): # tha the object server does not write out any async pendings; this # test asserts that this is the case. - def gather_async_pendings(onodes): - async_pendings = [] - for onode in onodes: - device_dir = self.device_dir('', onode) - for ap_pol_dir in os.listdir(device_dir): - if not ap_pol_dir.startswith('async_pending'): - # skip 'objects', 'containers', etc. - continue - async_pending_dir = os.path.join(device_dir, ap_pol_dir) - try: - ap_dirs = os.listdir(async_pending_dir) - except OSError as err: - if err.errno == errno.ENOENT: - pass - else: - raise - else: - for ap_dir in ap_dirs: - ap_dir_fullpath = os.path.join( - async_pending_dir, ap_dir) - async_pendings.extend([ - os.path.join(ap_dir_fullpath, ent) - for ent in os.listdir(ap_dir_fullpath)]) - return async_pendings - # Make an expiring object in each policy for policy in ENABLED_POLICIES: container_name = "expirer-test-%d" % policy.idx @@ -191,15 +164,12 @@ class TestObjectExpirer(ReplProbeTest): # Make sure there's no async_pendings anywhere. 
Probe tests only run # on single-node installs anyway, so this set should be small enough # that an exhaustive check doesn't take too long. - all_obj_nodes = {} - for policy in ENABLED_POLICIES: - for dev in policy.object_ring.devs: - all_obj_nodes[dev['device']] = dev - pendings_before = gather_async_pendings(all_obj_nodes.values()) + all_obj_nodes = self.get_all_object_nodes() + pendings_before = self.gather_async_pendings(all_obj_nodes) # expire the objects Manager(['object-expirer']).once() - pendings_after = gather_async_pendings(all_obj_nodes.values()) + pendings_after = self.gather_async_pendings(all_obj_nodes) self.assertEqual(pendings_after, pendings_before) def test_expirer_object_should_not_be_expired(self): diff --git a/test/unit/__init__.py b/test/unit/__init__.py index a07b1b2879..2e611806a4 100644 --- a/test/unit/__init__.py +++ b/test/unit/__init__.py @@ -751,6 +751,8 @@ class FakeStatus(object): :param response_sleep: float, time to eventlet sleep during response """ # connect exception + if inspect.isclass(status) and issubclass(status, Exception): + raise status('FakeStatus Error') if isinstance(status, (Exception, eventlet.Timeout)): raise status if isinstance(status, tuple): @@ -1063,6 +1065,15 @@ def make_timestamp_iter(offset=0): for t in itertools.count(int(time.time()) + offset)) +@contextmanager +def mock_timestamp_now(now=None): + if now is None: + now = Timestamp.now() + with mocklib.patch('swift.common.utils.Timestamp.now', + classmethod(lambda c: now)): + yield now + + class Timeout(object): def __init__(self, seconds): self.seconds = seconds @@ -1323,3 +1334,12 @@ def skip_if_no_xattrs(): if not xattr_supported_check(): raise SkipTest('Large xattrs not supported in `%s`. 
Skipping test' % gettempdir()) + + +def unlink_files(paths): + for path in paths: + try: + os.unlink(path) + except OSError as err: + if err.errno != errno.ENOENT: + raise diff --git a/test/unit/account/test_server.py b/test/unit/account/test_server.py index 2c00773441..4a8f58cb05 100644 --- a/test/unit/account/test_server.py +++ b/test/unit/account/test_server.py @@ -404,7 +404,7 @@ class TestAccountController(unittest.TestCase): elif state[0] == 'race': # Save the original db_file attribute value self._saved_db_file = self.db_file - self.db_file += '.doesnotexist' + self._db_file += '.doesnotexist' def initialize(self, *args, **kwargs): if state[0] == 'initial': @@ -413,7 +413,7 @@ class TestAccountController(unittest.TestCase): elif state[0] == 'race': # Restore the original db_file attribute to get the race # behavior - self.db_file = self._saved_db_file + self._db_file = self._saved_db_file return super(InterceptedAcBr, self).initialize(*args, **kwargs) with mock.patch("swift.account.server.AccountBroker", InterceptedAcBr): diff --git a/test/unit/common/test_db.py b/test/unit/common/test_db.py index f605d0acba..6f723e13a7 100644 --- a/test/unit/common/test_db.py +++ b/test/unit/common/test_db.py @@ -38,7 +38,7 @@ from swift.common.constraints import \ MAX_META_VALUE_LENGTH, MAX_META_COUNT, MAX_META_OVERALL_SIZE from swift.common.db import chexor, dict_factory, get_db_connection, \ DatabaseBroker, DatabaseConnectionError, DatabaseAlreadyExists, \ - GreenDBConnection, PICKLE_PROTOCOL + GreenDBConnection, PICKLE_PROTOCOL, zero_like from swift.common.utils import normalize_timestamp, mkdirs, Timestamp from swift.common.exceptions import LockTimeout from swift.common.swob import HTTPException @@ -46,6 +46,30 @@ from swift.common.swob import HTTPException from test.unit import with_tempdir +class TestHelperFunctions(unittest.TestCase): + + def test_zero_like(self): + expectations = { + # value => expected + None: True, + True: False, + '': True, + 'asdf': False, + 
0: True, + 1: False, + '0': True, + '1': False, + } + errors = [] + for value, expected in expectations.items(): + rv = zero_like(value) + if rv != expected: + errors.append('zero_like(%r) => %r expected %r' % ( + value, rv, expected)) + if errors: + self.fail('Some unexpected return values:\n' + '\n'.join(errors)) + + class TestDatabaseConnectionError(unittest.TestCase): def test_str(self): @@ -989,6 +1013,19 @@ class TestDatabaseBroker(unittest.TestCase): self.assertEqual(broker.get_sync(uuid3), 2) broker.merge_syncs([{'sync_point': 5, 'remote_id': uuid2}]) self.assertEqual(broker.get_sync(uuid2), 5) + # max sync point sticks + broker.merge_syncs([{'sync_point': 5, 'remote_id': uuid2}]) + self.assertEqual(broker.get_sync(uuid2), 5) + self.assertEqual(broker.get_sync(uuid3), 2) + broker.merge_syncs([{'sync_point': 4, 'remote_id': uuid2}]) + self.assertEqual(broker.get_sync(uuid2), 5) + self.assertEqual(broker.get_sync(uuid3), 2) + broker.merge_syncs([{'sync_point': -1, 'remote_id': uuid2}, + {'sync_point': 3, 'remote_id': uuid3}]) + self.assertEqual(broker.get_sync(uuid2), 5) + self.assertEqual(broker.get_sync(uuid3), 3) + self.assertEqual(broker.get_sync(uuid2, incoming=False), 3) + self.assertEqual(broker.get_sync(uuid3, incoming=False), 4) def test_get_replication_info(self): self.get_replication_info_tester(metadata=False) @@ -1089,11 +1126,9 @@ class TestDatabaseBroker(unittest.TestCase): 'max_row': 1, 'id': broker_uuid, 'metadata': broker_metadata}) return broker - def test_metadata(self): - def reclaim(broker, timestamp): - with broker.get() as conn: - broker._reclaim(conn, timestamp) - conn.commit() + # only testing _reclaim_metadata here + @patch.object(DatabaseBroker, '_reclaim') + def test_metadata(self, mock_reclaim): # Initializes a good broker for us broker = self.get_replication_info_tester(metadata=True) # Add our first item @@ -1134,7 +1169,7 @@ class TestDatabaseBroker(unittest.TestCase): self.assertEqual(broker.metadata['Second'], [second_value, 
second_timestamp]) # Reclaim at point before second item was deleted - reclaim(broker, normalize_timestamp(3)) + broker.reclaim(normalize_timestamp(3), normalize_timestamp(3)) self.assertIn('First', broker.metadata) self.assertEqual(broker.metadata['First'], [first_value, first_timestamp]) @@ -1142,7 +1177,7 @@ class TestDatabaseBroker(unittest.TestCase): self.assertEqual(broker.metadata['Second'], [second_value, second_timestamp]) # Reclaim at point second item was deleted - reclaim(broker, normalize_timestamp(4)) + broker.reclaim(normalize_timestamp(4), normalize_timestamp(4)) self.assertIn('First', broker.metadata) self.assertEqual(broker.metadata['First'], [first_value, first_timestamp]) @@ -1150,11 +1185,18 @@ class TestDatabaseBroker(unittest.TestCase): self.assertEqual(broker.metadata['Second'], [second_value, second_timestamp]) # Reclaim after point second item was deleted - reclaim(broker, normalize_timestamp(5)) + broker.reclaim(normalize_timestamp(5), normalize_timestamp(5)) self.assertIn('First', broker.metadata) self.assertEqual(broker.metadata['First'], [first_value, first_timestamp]) self.assertNotIn('Second', broker.metadata) + # Delete first item (by setting to empty string) + first_timestamp = normalize_timestamp(6) + broker.update_metadata({'First': ['', first_timestamp]}) + self.assertIn('First', broker.metadata) + # Check that sync_timestamp doesn't cause item to be reclaimed + broker.reclaim(normalize_timestamp(5), normalize_timestamp(99)) + self.assertIn('First', broker.metadata) def test_update_metadata_missing_container_info(self): # Test missing container_info/container_stat row @@ -1197,7 +1239,7 @@ class TestDatabaseBroker(unittest.TestCase): exc = None try: with broker.get() as conn: - broker._reclaim(conn, 0) + broker._reclaim_metadata(conn, 0) except Exception as err: exc = err self.assertEqual( @@ -1333,5 +1375,141 @@ class TestDatabaseBroker(unittest.TestCase): else: self.fail('Expected an exception to be raised') + def 
test_skip_commits(self): + broker = DatabaseBroker(':memory:') + self.assertTrue(broker._skip_commit_puts()) + broker._initialize = MagicMock() + broker.initialize(Timestamp.now()) + self.assertTrue(broker._skip_commit_puts()) + + # not initialized + db_file = os.path.join(self.testdir, '1.db') + broker = DatabaseBroker(db_file) + self.assertFalse(os.path.exists(broker.db_file)) # sanity check + self.assertTrue(broker._skip_commit_puts()) + + # no pending file + broker._initialize = MagicMock() + broker.initialize(Timestamp.now()) + self.assertTrue(os.path.exists(broker.db_file)) # sanity check + self.assertFalse(os.path.exists(broker.pending_file)) # sanity check + self.assertTrue(broker._skip_commit_puts()) + + # pending file exists + with open(broker.pending_file, 'wb'): + pass + self.assertTrue(os.path.exists(broker.pending_file)) # sanity check + self.assertFalse(broker._skip_commit_puts()) + + # skip_commits is True + broker.skip_commits = True + self.assertTrue(broker._skip_commit_puts()) + + # re-init + broker = DatabaseBroker(db_file) + self.assertFalse(broker._skip_commit_puts()) + + # constructor can override + broker = DatabaseBroker(db_file, skip_commits=True) + self.assertTrue(broker._skip_commit_puts()) + + def test_commit_puts(self): + db_file = os.path.join(self.testdir, '1.db') + broker = DatabaseBroker(db_file) + broker._initialize = MagicMock() + broker.initialize(Timestamp.now()) + with open(broker.pending_file, 'wb'): + pass + + # merge given list + with patch.object(broker, 'merge_items') as mock_merge_items: + broker._commit_puts(['test']) + mock_merge_items.assert_called_once_with(['test']) + + # load file and merge + with open(broker.pending_file, 'wb') as fd: + fd.write(':1:2:99') + with patch.object(broker, 'merge_items') as mock_merge_items: + broker._commit_puts_load = lambda l, e: l.append(e) + broker._commit_puts() + mock_merge_items.assert_called_once_with(['1', '2', '99']) + self.assertEqual(0, os.path.getsize(broker.pending_file)) 
+ + # load file and merge with given list + with open(broker.pending_file, 'wb') as fd: + fd.write(':bad') + with patch.object(broker, 'merge_items') as mock_merge_items: + broker._commit_puts_load = lambda l, e: l.append(e) + broker._commit_puts(['not']) + mock_merge_items.assert_called_once_with(['not', 'bad']) + self.assertEqual(0, os.path.getsize(broker.pending_file)) + + # skip_commits True - no merge + db_file = os.path.join(self.testdir, '2.db') + broker = DatabaseBroker(db_file, skip_commits=True) + broker._initialize = MagicMock() + broker.initialize(Timestamp.now()) + with open(broker.pending_file, 'wb') as fd: + fd.write(':ignored') + with patch.object(broker, 'merge_items') as mock_merge_items: + with self.assertRaises(DatabaseConnectionError) as cm: + broker._commit_puts(['hmmm']) + mock_merge_items.assert_not_called() + self.assertIn('commits not accepted', str(cm.exception)) + with open(broker.pending_file, 'rb') as fd: + self.assertEqual(':ignored', fd.read()) + + def test_put_record(self): + db_file = os.path.join(self.testdir, '1.db') + broker = DatabaseBroker(db_file) + broker._initialize = MagicMock() + broker.initialize(Timestamp.now()) + + # pending file created and record written + broker.make_tuple_for_pickle = lambda x: x.upper() + with patch.object(broker, '_commit_puts') as mock_commit_puts: + broker.put_record('pinky') + mock_commit_puts.assert_not_called() + with open(broker.pending_file, 'rb') as fd: + pending = fd.read() + items = pending.split(':') + self.assertEqual(['PINKY'], + [pickle.loads(i.decode('base64')) for i in items[1:]]) + + # record appended + with patch.object(broker, '_commit_puts') as mock_commit_puts: + broker.put_record('perky') + mock_commit_puts.assert_not_called() + with open(broker.pending_file, 'rb') as fd: + pending = fd.read() + items = pending.split(':') + self.assertEqual(['PINKY', 'PERKY'], + [pickle.loads(i.decode('base64')) for i in items[1:]]) + + # pending file above cap + cap = 
swift.common.db.PENDING_CAP + while os.path.getsize(broker.pending_file) < cap: + with open(broker.pending_file, 'ab') as fd: + fd.write('x' * 100000) + with patch.object(broker, '_commit_puts') as mock_commit_puts: + broker.put_record('direct') + mock_commit_puts.called_once_with(['direct']) + + # records shouldn't be put to brokers with skip_commits True because + # they cannot be accepted if the pending file is full + broker.skip_commits = True + with open(broker.pending_file, 'wb'): + # empty the pending file + pass + with patch.object(broker, '_commit_puts') as mock_commit_puts: + with self.assertRaises(DatabaseConnectionError) as cm: + broker.put_record('unwelcome') + self.assertIn('commits not accepted', str(cm.exception)) + mock_commit_puts.assert_not_called() + with open(broker.pending_file, 'rb') as fd: + pending = fd.read() + self.assertFalse(pending) + + if __name__ == '__main__': unittest.main() diff --git a/test/unit/common/test_db_replicator.py b/test/unit/common/test_db_replicator.py index 7c4143d641..20c5d6738a 100644 --- a/test/unit/common/test_db_replicator.py +++ b/test/unit/common/test_db_replicator.py @@ -16,6 +16,8 @@ from __future__ import print_function import unittest from contextlib import contextmanager + +import eventlet import os import logging import errno @@ -26,6 +28,7 @@ from tempfile import mkdtemp, NamedTemporaryFile import json import mock +from copy import deepcopy from mock import patch, call from six.moves import reload_module @@ -37,6 +40,7 @@ from swift.common.exceptions import DriveNotMounted from swift.common.swob import HTTPException from test import unit +from test.unit import FakeLogger from test.unit.common.test_db import ExampleBroker @@ -160,6 +164,11 @@ class ReplHttp(object): self.set_status = set_status replicated = False host = 'localhost' + node = { + 'ip': '127.0.0.1', + 'port': '6000', + 'device': 'sdb', + } def replicate(self, *args): self.replicated = True @@ -230,11 +239,27 @@ class FakeBroker(object): 
'put_timestamp': 1, 'created_at': 1, 'count': 0, + 'max_row': 99, + 'id': 'ID', + 'metadata': {} }) if self.stub_replication_info: info.update(self.stub_replication_info) return info + def get_max_row(self, table=None): + return self.get_replication_info()['max_row'] + + def is_reclaimable(self, now, reclaim_age): + info = self.get_replication_info() + return info['count'] == 0 and ( + (now - reclaim_age) > + info['delete_timestamp'] > + info['put_timestamp']) + + def get_other_replication_items(self): + return None + def reclaim(self, item_timestamp, sync_timestamp): pass @@ -273,6 +298,7 @@ class TestDBReplicator(unittest.TestCase): self.recon_cache = mkdtemp() rmtree(self.recon_cache, ignore_errors=1) os.mkdir(self.recon_cache) + self.logger = unit.debug_logger('test-replicator') def tearDown(self): for patcher in self._patchers: @@ -287,6 +313,7 @@ class TestDBReplicator(unittest.TestCase): def stub_delete_db(self, broker): self.delete_db_calls.append('/path/to/file') + return True def test_creation(self): # later config should be extended to assert more config options @@ -647,11 +674,107 @@ class TestDBReplicator(unittest.TestCase): }) def test_replicate_object(self): + # verify return values from replicate_object db_replicator.ring = FakeRingWithNodes() - replicator = TestReplicator({}) - replicator.delete_db = self.stub_delete_db - replicator._replicate_object('0', '/path/to/file', 'node_id') - self.assertEqual([], self.delete_db_calls) + db_path = '/path/to/file' + replicator = TestReplicator({}, logger=FakeLogger()) + info = FakeBroker().get_replication_info() + # make remote appear to be in sync + rinfo = {'point': info['max_row'], 'id': 'remote_id'} + + class FakeResponse(object): + def __init__(self, status, rinfo): + self._status = status + self.data = json.dumps(rinfo) + + @property + def status(self): + if isinstance(self._status, (Exception, eventlet.Timeout)): + raise self._status + return self._status + + # all requests fail + replicate = 
'swift.common.db_replicator.ReplConnection.replicate' + with mock.patch(replicate) as fake_replicate: + fake_replicate.side_effect = [ + FakeResponse(500, None), + FakeResponse(500, None), + FakeResponse(500, None)] + with mock.patch.object(replicator, 'delete_db') as mock_delete: + res = replicator._replicate_object('0', db_path, 'node_id') + self.assertRaises(StopIteration, next, fake_replicate.side_effect) + self.assertEqual((False, [False, False, False]), res) + self.assertEqual(0, mock_delete.call_count) + self.assertFalse(replicator.logger.get_lines_for_level('error')) + self.assertFalse(replicator.logger.get_lines_for_level('warning')) + replicator.logger.clear() + + with mock.patch(replicate) as fake_replicate: + fake_replicate.side_effect = [ + FakeResponse(Exception('ugh'), None), + FakeResponse(eventlet.Timeout(), None), + FakeResponse(200, rinfo)] + with mock.patch.object(replicator, 'delete_db') as mock_delete: + res = replicator._replicate_object('0', db_path, 'node_id') + self.assertRaises(StopIteration, next, fake_replicate.side_effect) + self.assertEqual((False, [False, False, True]), res) + self.assertEqual(0, mock_delete.call_count) + lines = replicator.logger.get_lines_for_level('error') + self.assertIn('ERROR syncing', lines[0]) + self.assertIn('ERROR syncing', lines[1]) + self.assertFalse(lines[2:]) + self.assertFalse(replicator.logger.get_lines_for_level('warning')) + replicator.logger.clear() + + # partial success + with mock.patch(replicate) as fake_replicate: + fake_replicate.side_effect = [ + FakeResponse(200, rinfo), + FakeResponse(200, rinfo), + FakeResponse(500, None)] + with mock.patch.object(replicator, 'delete_db') as mock_delete: + res = replicator._replicate_object('0', db_path, 'node_id') + self.assertRaises(StopIteration, next, fake_replicate.side_effect) + self.assertEqual((False, [True, True, False]), res) + self.assertEqual(0, mock_delete.call_count) + self.assertFalse(replicator.logger.get_lines_for_level('error')) + 
self.assertFalse(replicator.logger.get_lines_for_level('warning')) + replicator.logger.clear() + + # 507 triggers additional requests + with mock.patch(replicate) as fake_replicate: + fake_replicate.side_effect = [ + FakeResponse(200, rinfo), + FakeResponse(200, rinfo), + FakeResponse(507, None), + FakeResponse(507, None), + FakeResponse(200, rinfo)] + with mock.patch.object(replicator, 'delete_db') as mock_delete: + res = replicator._replicate_object('0', db_path, 'node_id') + self.assertRaises(StopIteration, next, fake_replicate.side_effect) + self.assertEqual((False, [True, True, False, False, True]), res) + self.assertEqual(0, mock_delete.call_count) + lines = replicator.logger.get_lines_for_level('error') + self.assertIn('Remote drive not mounted', lines[0]) + self.assertIn('Remote drive not mounted', lines[1]) + self.assertFalse(lines[2:]) + self.assertFalse(replicator.logger.get_lines_for_level('warning')) + replicator.logger.clear() + + # all requests succeed; node id == 'node_id' causes node to be + # considered a handoff so expect the db to be deleted + with mock.patch(replicate) as fake_replicate: + fake_replicate.side_effect = [ + FakeResponse(200, rinfo), + FakeResponse(200, rinfo), + FakeResponse(200, rinfo)] + with mock.patch.object(replicator, 'delete_db') as mock_delete: + res = replicator._replicate_object('0', db_path, 'node_id') + self.assertRaises(StopIteration, next, fake_replicate.side_effect) + self.assertEqual((True, [True, True, True]), res) + self.assertEqual(1, mock_delete.call_count) + self.assertFalse(replicator.logger.get_lines_for_level('error')) + self.assertFalse(replicator.logger.get_lines_for_level('warning')) def test_replicate_object_quarantine(self): replicator = TestReplicator({}) @@ -695,8 +818,122 @@ class TestDBReplicator(unittest.TestCase): replicator.brokerclass = FakeAccountBroker replicator._repl_to_node = lambda *args: True replicator.delete_db = self.stub_delete_db - replicator._replicate_object('0', '/path/to/file', 
'node_id') + orig_cleanup = replicator.cleanup_post_replicate + with mock.patch.object(replicator, 'cleanup_post_replicate', + side_effect=orig_cleanup) as mock_cleanup: + replicator._replicate_object('0', '/path/to/file', 'node_id') + mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3) + self.assertIsInstance(mock_cleanup.call_args[0][0], + replicator.brokerclass) self.assertEqual(['/path/to/file'], self.delete_db_calls) + self.assertEqual(0, replicator.stats['failure']) + + def test_replicate_object_delete_delegated_to_cleanup_post_replicate(self): + replicator = TestReplicator({}) + replicator.ring = FakeRingWithNodes().Ring('path') + replicator.brokerclass = FakeAccountBroker + replicator._repl_to_node = lambda *args: True + replicator.delete_db = self.stub_delete_db + + # cleanup succeeds + with mock.patch.object(replicator, 'cleanup_post_replicate', + return_value=True) as mock_cleanup: + replicator._replicate_object('0', '/path/to/file', 'node_id') + mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3) + self.assertIsInstance(mock_cleanup.call_args[0][0], + replicator.brokerclass) + self.assertFalse(self.delete_db_calls) + self.assertEqual(0, replicator.stats['failure']) + self.assertEqual(3, replicator.stats['success']) + + # cleanup fails + replicator._zero_stats() + with mock.patch.object(replicator, 'cleanup_post_replicate', + return_value=False) as mock_cleanup: + replicator._replicate_object('0', '/path/to/file', 'node_id') + mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3) + self.assertIsInstance(mock_cleanup.call_args[0][0], + replicator.brokerclass) + self.assertFalse(self.delete_db_calls) + self.assertEqual(3, replicator.stats['failure']) + self.assertEqual(0, replicator.stats['success']) + + # shouldbehere True - cleanup not required + replicator._zero_stats() + primary_node_id = replicator.ring.get_part_nodes('0')[0]['id'] + with mock.patch.object(replicator, 'cleanup_post_replicate', + 
return_value=True) as mock_cleanup: + replicator._replicate_object('0', '/path/to/file', primary_node_id) + mock_cleanup.assert_not_called() + self.assertFalse(self.delete_db_calls) + self.assertEqual(0, replicator.stats['failure']) + self.assertEqual(2, replicator.stats['success']) + + def test_cleanup_post_replicate(self): + replicator = TestReplicator({}, logger=self.logger) + replicator.ring = FakeRingWithNodes().Ring('path') + broker = FakeBroker() + replicator._repl_to_node = lambda *args: True + info = broker.get_replication_info() + + with mock.patch.object(replicator, 'delete_db') as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [False] * 3) + mock_delete_db.assert_not_called() + self.assertTrue(res) + self.assertEqual(['Not deleting db %s (0/3 success)' % broker.db_file], + replicator.logger.get_lines_for_level('debug')) + replicator.logger.clear() + + with mock.patch.object(replicator, 'delete_db') as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [True, False, True]) + mock_delete_db.assert_not_called() + self.assertTrue(res) + self.assertEqual(['Not deleting db %s (2/3 success)' % broker.db_file], + replicator.logger.get_lines_for_level('debug')) + replicator.logger.clear() + + broker.stub_replication_info = {'max_row': 101} + with mock.patch.object(replicator, 'delete_db') as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [True] * 3) + mock_delete_db.assert_not_called() + self.assertTrue(res) + self.assertEqual(['Not deleting db %s (2 new rows)' % broker.db_file], + replicator.logger.get_lines_for_level('debug')) + replicator.logger.clear() + + broker.stub_replication_info = {'max_row': 98} + with mock.patch.object(replicator, 'delete_db') as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [True] * 3) + mock_delete_db.assert_not_called() + self.assertTrue(res) + broker.stub_replication_info = None + self.assertEqual(['Not deleting db %s 
(negative max_row_delta: -1)' % + broker.db_file], + replicator.logger.get_lines_for_level('error')) + replicator.logger.clear() + + with mock.patch.object(replicator, 'delete_db') as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [True] * 3) + mock_delete_db.assert_called_once_with(broker) + self.assertTrue(res) + self.assertEqual(['Successfully deleted db %s' % broker.db_file], + replicator.logger.get_lines_for_level('debug')) + replicator.logger.clear() + + with mock.patch.object(replicator, 'delete_db', + return_value=False) as mock_delete_db: + res = replicator.cleanup_post_replicate( + broker, info, [True] * 3) + mock_delete_db.assert_called_once_with(broker) + self.assertFalse(res) + self.assertEqual(['Failed to delete db %s' % broker.db_file], + replicator.logger.get_lines_for_level('debug')) + replicator.logger.clear() def test_replicate_object_with_exception(self): replicator = TestReplicator({}) @@ -949,6 +1186,8 @@ class TestDBReplicator(unittest.TestCase): response = rpc.dispatch(('drive', 'part', 'hash'), ['rsync_then_merge', 'arg1', 'arg2']) expected_calls = [call('/part/ash/hash/hash.db'), + call('/drive/tmp/arg1'), + call(FakeBroker.db_file), call('/drive/tmp/arg1')] self.assertEqual(mock_os.path.exists.call_args_list, expected_calls) @@ -1010,7 +1249,8 @@ class TestDBReplicator(unittest.TestCase): def mock_renamer(old, new): self.assertEqual('/drive/tmp/arg1', old) - self.assertEqual('/data/db.db', new) + # FakeBroker uses module filename as db_file! 
+ self.assertEqual(__file__, new) self._patch(patch.object, db_replicator, 'renamer', mock_renamer) @@ -1023,7 +1263,7 @@ class TestDBReplicator(unittest.TestCase): self.assertEqual('204 No Content', response.status) self.assertEqual(204, response.status_int) - def test_complete_rsync_db_does_not_exist(self): + def test_complete_rsync_db_exists(self): rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, mount_check=False) @@ -1740,7 +1980,7 @@ class TestReplToNode(unittest.TestCase): def test_repl_to_node_300_status(self): self.http = ReplHttp('{"id": 3, "point": -1}', set_status=300) - self.assertIsNone(self.replicator._repl_to_node( + self.assertFalse(self.replicator._repl_to_node( self.fake_node, FakeBroker(), '0', self.fake_info)) def test_repl_to_node_not_response(self): @@ -1783,7 +2023,7 @@ class FakeHTTPResponse(object): return self.resp.body -def attach_fake_replication_rpc(rpc, replicate_hook=None): +def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None): class FakeReplConnection(object): def __init__(self, node, partition, hash_, logger): @@ -1795,12 +2035,16 @@ def attach_fake_replication_rpc(rpc, replicate_hook=None): def replicate(self, op, *sync_args): print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args)) - replicate_args = self.path.lstrip('/').split('/') - args = [op] + list(sync_args) - with unit.mock_check_drive(isdir=not rpc.mount_check, - ismount=rpc.mount_check): - swob_response = rpc.dispatch(replicate_args, args) - resp = FakeHTTPResponse(swob_response) + resp = None + if errors and op in errors and errors[op]: + resp = errors[op].pop(0) + if not resp: + replicate_args = self.path.lstrip('/').split('/') + args = [op] + deepcopy(list(sync_args)) + with unit.mock_check_drive(isdir=not rpc.mount_check, + ismount=rpc.mount_check): + swob_response = rpc.dispatch(replicate_args, args) + resp = FakeHTTPResponse(swob_response) if replicate_hook: replicate_hook(op, *sync_args) return resp @@ -1872,15 +2116,19 @@ class 
TestReplicatorSync(unittest.TestCase): conf.update(conf_updates) return self.replicator_daemon(conf, logger=self.logger) - def _run_once(self, node, conf_updates=None, daemon=None): - daemon = daemon or self._get_daemon(node, conf_updates) - + def _install_fake_rsync_file(self, daemon, captured_calls=None): def _rsync_file(db_file, remote_file, **kwargs): + if captured_calls is not None: + captured_calls.append((db_file, remote_file, kwargs)) remote_server, remote_path = remote_file.split('/', 1) dest_path = os.path.join(self.root, remote_path) copy(db_file, dest_path) return True daemon._rsync_file = _rsync_file + + def _run_once(self, node, conf_updates=None, daemon=None): + daemon = daemon or self._get_daemon(node, conf_updates) + self._install_fake_rsync_file(daemon) with mock.patch('swift.common.db_replicator.whataremyips', new=lambda *a, **kw: [node['replication_ip']]), \ unit.mock_check_drive(isdir=not daemon.mount_check, diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index b9caaabf34..48724eac33 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -1454,6 +1454,15 @@ class TestUtils(unittest.TestCase): with open(testcache_file) as fd: file_dict = json.loads(fd.readline()) self.assertEqual(expect_dict, file_dict) + # nested dict items are not sticky + submit_dict = {'key1': {'key2': {'value3': 3}}} + expect_dict = {'key0': 101, + 'key1': {'key2': {'value3': 3}, + 'value1': 1, 'value2': 2}} + utils.dump_recon_cache(submit_dict, testcache_file, logger) + with open(testcache_file) as fd: + file_dict = json.loads(fd.readline()) + self.assertEqual(expect_dict, file_dict) # cached entries are sticky submit_dict = {} utils.dump_recon_cache(submit_dict, testcache_file, logger) diff --git a/test/unit/common/test_wsgi.py b/test/unit/common/test_wsgi.py index 774fcf84e8..8e88d09b59 100644 --- a/test/unit/common/test_wsgi.py +++ b/test/unit/common/test_wsgi.py @@ -1270,9 +1270,10 @@ class 
TestWorkersStrategy(unittest.TestCase): pid += 1 sock_count += 1 + mypid = os.getpid() self.assertEqual([ - 'Started child %s' % 88, - 'Started child %s' % 89, + 'Started child %s from parent %s' % (88, mypid), + 'Started child %s from parent %s' % (89, mypid), ], self.logger.get_lines_for_level('notice')) self.assertEqual(2, sock_count) @@ -1282,7 +1283,7 @@ class TestWorkersStrategy(unittest.TestCase): self.strategy.register_worker_exit(88) self.assertEqual([ - 'Removing dead child %s' % 88, + 'Removing dead child %s from parent %s' % (88, mypid) ], self.logger.get_lines_for_level('error')) for s, i in self.strategy.new_worker_socks(): @@ -1294,9 +1295,9 @@ class TestWorkersStrategy(unittest.TestCase): self.assertEqual(1, sock_count) self.assertEqual([ - 'Started child %s' % 88, - 'Started child %s' % 89, - 'Started child %s' % 90, + 'Started child %s from parent %s' % (88, mypid), + 'Started child %s from parent %s' % (89, mypid), + 'Started child %s from parent %s' % (90, mypid), ], self.logger.get_lines_for_level('notice')) def test_post_fork_hook(self): diff --git a/test/unit/container/test_backend.py b/test/unit/container/test_backend.py index 1febf47cfb..9a3d86d4d4 100644 --- a/test/unit/container/test_backend.py +++ b/test/unit/container/test_backend.py @@ -20,7 +20,6 @@ import hashlib import unittest from time import sleep, time from uuid import uuid4 -import itertools import random from collections import defaultdict from contextlib import contextmanager @@ -30,7 +29,7 @@ import json from swift.container.backend import ContainerBroker, \ update_new_item_from_existing -from swift.common.utils import Timestamp, encode_timestamps +from swift.common.utils import Timestamp, encode_timestamps, hash_path from swift.common.storage_policy import POLICIES import mock @@ -46,7 +45,7 @@ class TestContainerBroker(unittest.TestCase): def test_creation(self): # Test ContainerBroker.__init__ broker = ContainerBroker(':memory:', account='a', container='c') - 
self.assertEqual(broker.db_file, ':memory:') + self.assertEqual(broker._db_file, ':memory:') broker.initialize(Timestamp('1').internal, 0) with broker.get() as conn: curs = conn.cursor() @@ -55,11 +54,11 @@ class TestContainerBroker(unittest.TestCase): @patch_policies def test_storage_policy_property(self): - ts = (Timestamp(t).internal for t in itertools.count(int(time()))) + ts = make_timestamp_iter() for policy in POLICIES: broker = ContainerBroker(':memory:', account='a', container='policy_%s' % policy.name) - broker.initialize(next(ts), policy.idx) + broker.initialize(next(ts).internal, policy.idx) with broker.get() as conn: try: conn.execute('''SELECT storage_policy_index @@ -165,17 +164,17 @@ class TestContainerBroker(unittest.TestCase): broker.delete_db(Timestamp.now().internal) def test_get_info_is_deleted(self): - start = int(time()) - ts = (Timestamp(t).internal for t in itertools.count(start)) + ts = make_timestamp_iter() + start = next(ts) broker = ContainerBroker(':memory:', account='test_account', container='test_container') # create it - broker.initialize(next(ts), POLICIES.default.idx) + broker.initialize(start.internal, POLICIES.default.idx) info, is_deleted = broker.get_info_is_deleted() self.assertEqual(is_deleted, broker.is_deleted()) self.assertEqual(is_deleted, False) # sanity self.assertEqual(info, broker.get_info()) - self.assertEqual(info['put_timestamp'], Timestamp(start).internal) + self.assertEqual(info['put_timestamp'], start.internal) self.assertTrue(Timestamp(info['created_at']) >= start) self.assertEqual(info['delete_timestamp'], '0') if self.__class__ in (TestContainerBrokerBeforeMetadata, @@ -184,28 +183,28 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['status_changed_at'], '0') else: self.assertEqual(info['status_changed_at'], - Timestamp(start).internal) + start.internal) # delete it delete_timestamp = next(ts) - broker.delete_db(delete_timestamp) + broker.delete_db(delete_timestamp.internal) info, 
is_deleted = broker.get_info_is_deleted() self.assertEqual(is_deleted, True) # sanity self.assertEqual(is_deleted, broker.is_deleted()) self.assertEqual(info, broker.get_info()) - self.assertEqual(info['put_timestamp'], Timestamp(start).internal) + self.assertEqual(info['put_timestamp'], start.internal) self.assertTrue(Timestamp(info['created_at']) >= start) self.assertEqual(info['delete_timestamp'], delete_timestamp) self.assertEqual(info['status_changed_at'], delete_timestamp) # bring back to life - broker.put_object('obj', next(ts), 0, 'text/plain', 'etag', + broker.put_object('obj', next(ts).internal, 0, 'text/plain', 'etag', storage_policy_index=broker.storage_policy_index) info, is_deleted = broker.get_info_is_deleted() self.assertEqual(is_deleted, False) # sanity self.assertEqual(is_deleted, broker.is_deleted()) self.assertEqual(info, broker.get_info()) - self.assertEqual(info['put_timestamp'], Timestamp(start).internal) + self.assertEqual(info['put_timestamp'], start.internal) self.assertTrue(Timestamp(info['created_at']) >= start) self.assertEqual(info['delete_timestamp'], delete_timestamp) self.assertEqual(info['status_changed_at'], delete_timestamp) @@ -559,7 +558,7 @@ class TestContainerBroker(unittest.TestCase): "SELECT deleted FROM object").fetchone()[0], deleted) def _test_put_object_multiple_encoded_timestamps(self, broker): - ts = (Timestamp(t) for t in itertools.count(int(time()))) + ts = make_timestamp_iter() broker.initialize(next(ts).internal, 0) t = [next(ts) for _ in range(9)] @@ -629,7 +628,7 @@ class TestContainerBroker(unittest.TestCase): self._test_put_object_multiple_encoded_timestamps(broker) def _test_put_object_multiple_explicit_timestamps(self, broker): - ts = (Timestamp(t) for t in itertools.count(int(time()))) + ts = make_timestamp_iter() broker.initialize(next(ts).internal, 0) t = [next(ts) for _ in range(11)] @@ -733,7 +732,7 @@ class TestContainerBroker(unittest.TestCase): def test_last_modified_time(self): # Test container 
listing reports the most recent of data or metadata # timestamp as last-modified time - ts = (Timestamp(t) for t in itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='a', container='c') broker.initialize(next(ts).internal, 0) @@ -786,18 +785,17 @@ class TestContainerBroker(unittest.TestCase): @patch_policies def test_put_misplaced_object_does_not_effect_container_stats(self): policy = random.choice(list(POLICIES)) - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='a', container='c') - broker.initialize(next(ts), policy.idx) + broker.initialize(next(ts).internal, policy.idx) # migration tests may not honor policy on initialize if isinstance(self, ContainerBrokerMigrationMixin): real_storage_policy_index = \ broker.get_info()['storage_policy_index'] policy = [p for p in POLICIES if p.idx == real_storage_policy_index][0] - broker.put_object('correct_o', next(ts), 123, 'text/plain', + broker.put_object('correct_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=policy.idx) info = broker.get_info() @@ -805,7 +803,7 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(123, info['bytes_used']) other_policy = random.choice([p for p in POLICIES if p is not policy]) - broker.put_object('wrong_o', next(ts), 123, 'text/plain', + broker.put_object('wrong_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=other_policy.idx) self.assertEqual(1, info['object_count']) @@ -814,23 +812,22 @@ class TestContainerBroker(unittest.TestCase): @patch_policies def test_has_multiple_policies(self): policy = random.choice(list(POLICIES)) - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='a', container='c') - broker.initialize(next(ts), policy.idx) 
+ broker.initialize(next(ts).internal, policy.idx) # migration tests may not honor policy on initialize if isinstance(self, ContainerBrokerMigrationMixin): real_storage_policy_index = \ broker.get_info()['storage_policy_index'] policy = [p for p in POLICIES if p.idx == real_storage_policy_index][0] - broker.put_object('correct_o', next(ts), 123, 'text/plain', + broker.put_object('correct_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=policy.idx) self.assertFalse(broker.has_multiple_policies()) other_policy = [p for p in POLICIES if p is not policy][0] - broker.put_object('wrong_o', next(ts), 123, 'text/plain', + broker.put_object('wrong_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=other_policy.idx) self.assertTrue(broker.has_multiple_policies()) @@ -838,11 +835,10 @@ class TestContainerBroker(unittest.TestCase): @patch_policies def test_get_policy_info(self): policy = random.choice(list(POLICIES)) - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='a', container='c') - broker.initialize(next(ts), policy.idx) + broker.initialize(next(ts).internal, policy.idx) # migration tests may not honor policy on initialize if isinstance(self, ContainerBrokerMigrationMixin): real_storage_policy_index = \ @@ -854,7 +850,7 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(policy_stats, expected) # add an object - broker.put_object('correct_o', next(ts), 123, 'text/plain', + broker.put_object('correct_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=policy.idx) policy_stats = broker.get_policy_stats() @@ -864,7 +860,7 @@ class TestContainerBroker(unittest.TestCase): # add a misplaced object other_policy = random.choice([p for p in POLICIES if p is not policy]) - broker.put_object('wrong_o', next(ts), 123, 'text/plain', + 
broker.put_object('wrong_o', next(ts).internal, 123, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=other_policy.idx) policy_stats = broker.get_policy_stats() @@ -876,15 +872,14 @@ class TestContainerBroker(unittest.TestCase): @patch_policies def test_policy_stat_tracking(self): - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='a', container='c') # Note: in subclasses of this TestCase that inherit the # ContainerBrokerMigrationMixin, passing POLICIES.default.idx here has # no effect and broker.get_policy_stats() returns a dict with a single # entry mapping policy index 0 to the container stats - broker.initialize(next(ts), POLICIES.default.idx) + broker.initialize(next(ts).internal, POLICIES.default.idx) stats = defaultdict(dict) def assert_empty_default_policy_stats(policy_stats): @@ -904,7 +899,7 @@ class TestContainerBroker(unittest.TestCase): policy_index = random.randint(0, iters * 0.1) name = 'object-%s' % random.randint(0, iters * 0.1) size = random.randint(0, iters) - broker.put_object(name, next(ts), size, 'text/plain', + broker.put_object(name, next(ts).internal, size, 'text/plain', '5af83e3196bf99f440f31f2e1a6c9afe', storage_policy_index=policy_index) # track the size of the latest timestamp put for each object @@ -1930,12 +1925,11 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(rec['content_type'], 'text/plain') def test_set_storage_policy_index(self): - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() broker = ContainerBroker(':memory:', account='test_account', container='test_container') timestamp = next(ts) - broker.initialize(timestamp, 0) + broker.initialize(timestamp.internal, 0) info = broker.get_info() self.assertEqual(0, info['storage_policy_index']) # sanity check @@ -1946,39 +1940,40 @@ class TestContainerBroker(unittest.TestCase): 
TestContainerBrokerBeforeSPI): self.assertEqual(info['status_changed_at'], '0') else: - self.assertEqual(timestamp, info['status_changed_at']) + self.assertEqual(timestamp.internal, info['status_changed_at']) expected = {0: {'object_count': 0, 'bytes_used': 0}} self.assertEqual(expected, broker.get_policy_stats()) timestamp = next(ts) - broker.set_storage_policy_index(111, timestamp) + broker.set_storage_policy_index(111, timestamp.internal) self.assertEqual(broker.storage_policy_index, 111) info = broker.get_info() self.assertEqual(111, info['storage_policy_index']) self.assertEqual(0, info['object_count']) self.assertEqual(0, info['bytes_used']) - self.assertEqual(timestamp, info['status_changed_at']) + self.assertEqual(timestamp.internal, info['status_changed_at']) expected[111] = {'object_count': 0, 'bytes_used': 0} self.assertEqual(expected, broker.get_policy_stats()) timestamp = next(ts) - broker.set_storage_policy_index(222, timestamp) + broker.set_storage_policy_index(222, timestamp.internal) self.assertEqual(broker.storage_policy_index, 222) info = broker.get_info() self.assertEqual(222, info['storage_policy_index']) self.assertEqual(0, info['object_count']) self.assertEqual(0, info['bytes_used']) - self.assertEqual(timestamp, info['status_changed_at']) + self.assertEqual(timestamp.internal, info['status_changed_at']) expected[222] = {'object_count': 0, 'bytes_used': 0} self.assertEqual(expected, broker.get_policy_stats()) old_timestamp, timestamp = timestamp, next(ts) - broker.set_storage_policy_index(222, timestamp) # it's idempotent + # setting again is idempotent + broker.set_storage_policy_index(222, timestamp.internal) info = broker.get_info() self.assertEqual(222, info['storage_policy_index']) self.assertEqual(0, info['object_count']) self.assertEqual(0, info['bytes_used']) - self.assertEqual(old_timestamp, info['status_changed_at']) + self.assertEqual(old_timestamp.internal, info['status_changed_at']) self.assertEqual(expected, 
broker.get_policy_stats()) def test_set_storage_policy_index_empty(self): @@ -2004,19 +1999,18 @@ class TestContainerBroker(unittest.TestCase): @with_tempdir def test_legacy_pending_files(self, tempdir): - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() db_path = os.path.join(tempdir, 'container.db') # first init an acct DB without the policy_stat table present broker = ContainerBroker(db_path, account='a', container='c') - broker.initialize(next(ts), 1) + broker.initialize(next(ts).internal, 1) # manually make some pending entries lacking storage_policy_index with open(broker.pending_file, 'a+b') as fp: for i in range(10): name, timestamp, size, content_type, etag, deleted = ( - 'o%s' % i, next(ts), 0, 'c', 'e', 0) + 'o%s' % i, next(ts).internal, 0, 'c', 'e', 0) fp.write(':') fp.write(pickle.dumps( (name, timestamp, size, content_type, etag, deleted), @@ -2033,7 +2027,7 @@ class TestContainerBroker(unittest.TestCase): else: size = 2 storage_policy_index = 1 - broker.put_object(name, next(ts), size, 'c', 'e', 0, + broker.put_object(name, next(ts).internal, size, 'c', 'e', 0, storage_policy_index=storage_policy_index) broker._commit_puts_stale_ok() @@ -2049,8 +2043,7 @@ class TestContainerBroker(unittest.TestCase): @with_tempdir def test_get_info_no_stale_reads(self, tempdir): - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() db_path = os.path.join(tempdir, 'container.db') def mock_commit_puts(): @@ -2058,13 +2051,13 @@ class TestContainerBroker(unittest.TestCase): broker = ContainerBroker(db_path, account='a', container='c', stale_reads_ok=False) - broker.initialize(next(ts), 1) + broker.initialize(next(ts).internal, 1) # manually make some pending entries with open(broker.pending_file, 'a+b') as fp: for i in range(10): name, timestamp, size, content_type, etag, deleted = ( - 'o%s' % i, next(ts), 0, 'c', 'e', 0) + 'o%s' % i, next(ts).internal, 0, 'c', 'e', 0) 
fp.write(':') fp.write(pickle.dumps( (name, timestamp, size, content_type, etag, deleted), @@ -2079,8 +2072,7 @@ class TestContainerBroker(unittest.TestCase): @with_tempdir def test_get_info_stale_read_ok(self, tempdir): - ts = (Timestamp(t).internal for t in - itertools.count(int(time()))) + ts = make_timestamp_iter() db_path = os.path.join(tempdir, 'container.db') def mock_commit_puts(): @@ -2088,13 +2080,13 @@ class TestContainerBroker(unittest.TestCase): broker = ContainerBroker(db_path, account='a', container='c', stale_reads_ok=True) - broker.initialize(next(ts), 1) + broker.initialize(next(ts).internal, 1) # manually make some pending entries with open(broker.pending_file, 'a+b') as fp: for i in range(10): name, timestamp, size, content_type, etag, deleted = ( - 'o%s' % i, next(ts), 0, 'c', 'e', 0) + 'o%s' % i, next(ts).internal, 0, 'c', 'e', 0) fp.write(':') fp.write(pickle.dumps( (name, timestamp, size, content_type, etag, deleted), @@ -2104,6 +2096,26 @@ class TestContainerBroker(unittest.TestCase): broker._commit_puts = mock_commit_puts broker.get_info() + @with_tempdir + def test_create_broker(self, tempdir): + broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c') + hsh = hash_path('a', 'c') + expected_path = os.path.join( + tempdir, 'containers', '0', hsh[-3:], hsh, hsh + '.db') + self.assertEqual(expected_path, broker.db_file) + self.assertTrue(os.path.isfile(expected_path)) + + ts = Timestamp.now() + broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c1', + put_timestamp=ts.internal) + hsh = hash_path('a', 'c1') + expected_path = os.path.join( + tempdir, 'containers', '0', hsh[-3:], hsh, hsh + '.db') + self.assertEqual(expected_path, broker.db_file) + self.assertTrue(os.path.isfile(expected_path)) + self.assertEqual(ts.internal, broker.get_info()['put_timestamp']) + self.assertEqual(0, broker.get_info()['storage_policy_index']) + class TestCommonContainerBroker(test_db.TestExampleBroker): diff --git a/test/unit/container/test_server.py 
b/test/unit/container/test_server.py index 54ce6d973b..8327e8754b 100644 --- a/test/unit/container/test_server.py +++ b/test/unit/container/test_server.py @@ -424,7 +424,7 @@ class TestContainerController(unittest.TestCase): elif state[0] == 'race': # Save the original db_file attribute value self._saved_db_file = self.db_file - self.db_file += '.doesnotexist' + self._db_file += '.doesnotexist' def initialize(self, *args, **kwargs): if state[0] == 'initial': @@ -433,7 +433,7 @@ class TestContainerController(unittest.TestCase): elif state[0] == 'race': # Restore the original db_file attribute to get the race # behavior - self.db_file = self._saved_db_file + self._db_file = self._saved_db_file return super(InterceptedCoBr, self).initialize(*args, **kwargs) with mock.patch("swift.container.server.ContainerBroker", From a962340dd86109b1e3204d3c31fb7c5f6ac0cfc4 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Tue, 1 May 2018 15:28:10 +0100 Subject: [PATCH 3/9] Add ShardRange class A ShardRange represents the part of the object namespace that is managed by a container. 
It encapsulates: - the namespace range, from an excluded lower bound to an included upper bound - the object count and bytes used in the range - the current state of the range, including whether it is deleted or not Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Co-Authored-By: Kazuhiro MIYAHARA Change-Id: Iae090dc170843f15fd2a3ea8f167bec2848e928d --- swift/common/utils.py | 550 ++++++++++++++++++++ test/unit/common/test_utils.py | 887 ++++++++++++++++++++++++++++++++- 2 files changed, 1436 insertions(+), 1 deletion(-) diff --git a/swift/common/utils.py b/swift/common/utils.py index 54efdf2b18..6641bce268 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -19,10 +19,12 @@ from __future__ import print_function import base64 import binascii +import bisect import collections import errno import fcntl import grp +import hashlib import hmac import json import math @@ -76,6 +78,7 @@ from six.moves import range, http_client from six.moves.urllib.parse import ParseResult from six.moves.urllib.parse import quote as _quote from six.moves.urllib.parse import urlparse as stdlib_urlparse +from six import string_types from swift import gettext_ as _ import swift.common.exceptions @@ -4370,6 +4373,553 @@ def get_md5_socket(): return md5_sockfd +class ShardRange(object): + """ + A ShardRange encapsulates sharding state related to a container including + lower and upper bounds that define the object namespace for which the + container is responsible. + + Shard ranges may be persisted in a container database. Timestamps + associated with subsets of the shard range attributes are used to resolve + conflicts when a shard range needs to be merged with an existing shard + range record and the most recent version of an attribute should be + persisted. + + :param name: the name of the shard range; this should take the form of a + path to a container i.e. <account>/<container>.
+ :param timestamp: a timestamp that represents the time at which the + shard range's ``lower``, ``upper`` or ``deleted`` attributes were + last modified. + :param lower: the lower bound of object names contained in the shard range; + the lower bound *is not* included in the shard range namespace. + :param upper: the upper bound of object names contained in the shard range; + the upper bound *is* included in the shard range namespace. + :param object_count: the number of objects in the shard range; defaults to + zero. + :param bytes_used: the number of bytes in the shard range; defaults to + zero. + :param meta_timestamp: a timestamp that represents the time at which the + shard range's ``object_count`` and ``bytes_used`` were last updated; + defaults to the value of ``timestamp``. + :param deleted: a boolean; if True the shard range is considered to be + deleted. + :param state: the state; must be one of ShardRange.STATES; defaults to + CREATED. + :param state_timestamp: a timestamp that represents the time at which + ``state`` was forced to its current value; defaults to the value of + ``timestamp``. This timestamp is typically not updated with every + change of ``state`` because in general conflicts in ``state`` + attributes are resolved by choosing the larger ``state`` value. + However, when this rule does not apply, for example when changing state + from ``SHARDED`` to ``ACTIVE``, the ``state_timestamp`` may be advanced + so that the new ``state`` value is preferred over any older ``state`` + value. + :param epoch: optional epoch timestamp which represents the time at which + sharding was enabled for a container. 
+ """ + FOUND = 10 + CREATED = 20 + CLEAVED = 30 + ACTIVE = 40 + SHRINKING = 50 + SHARDING = 60 + SHARDED = 70 + STATES = {FOUND: 'found', + CREATED: 'created', + CLEAVED: 'cleaved', + ACTIVE: 'active', + SHRINKING: 'shrinking', + SHARDING: 'sharding', + SHARDED: 'sharded'} + STATES_BY_NAME = dict((v, k) for k, v in STATES.items()) + + class OuterBound(object): + def __eq__(self, other): + return isinstance(other, type(self)) + + def __ne__(self, other): + return not self.__eq__(other) + + def __str__(self): + return '' + + def __repr__(self): + return type(self).__name__ + + def __bool__(self): + return False + + __nonzero__ = __bool__ + + @functools.total_ordering + class MaxBound(OuterBound): + def __ge__(self, other): + return True + + @functools.total_ordering + class MinBound(OuterBound): + def __le__(self, other): + return True + + MIN = MinBound() + MAX = MaxBound() + + def __init__(self, name, timestamp, lower=MIN, upper=MAX, + object_count=0, bytes_used=0, meta_timestamp=None, + deleted=False, state=None, state_timestamp=None, epoch=None): + self.account = self.container = self._timestamp = \ + self._meta_timestamp = self._state_timestamp = self._epoch = None + self._lower = ShardRange.MIN + self._upper = ShardRange.MAX + self._deleted = False + self._state = None + + self.name = name + self.timestamp = timestamp + self.lower = lower + self.upper = upper + self.deleted = deleted + self.object_count = object_count + self.bytes_used = bytes_used + self.meta_timestamp = meta_timestamp + self.state = self.FOUND if state is None else state + self.state_timestamp = state_timestamp + self.epoch = epoch + + @classmethod + def _encode(cls, value): + if six.PY2 and isinstance(value, six.text_type): + return value.encode('utf-8') + return value + + def _encode_bound(self, bound): + if isinstance(bound, ShardRange.OuterBound): + return bound + if not isinstance(bound, string_types): + raise TypeError('must be a string type') + return self._encode(bound) + + 
@classmethod + def _make_container_name(cls, root_container, parent_container, timestamp, + index): + if not isinstance(parent_container, bytes): + parent_container = parent_container.encode('utf-8') + return "%s-%s-%s-%s" % (root_container, + hashlib.md5(parent_container).hexdigest(), + cls._to_timestamp(timestamp).internal, + index) + + @classmethod + def make_path(cls, shards_account, root_container, parent_container, + timestamp, index): + """ + Returns a path for a shard container that is valid to use as a name + when constructing a :class:`~swift.common.utils.ShardRange`. + + :param shards_account: the hidden internal account to which the shard + container belongs. + :param root_container: the name of the root container for the shard. + :param parent_container: the name of the parent container for the + shard; for initial first generation shards this should be the same + as ``root_container``; for shards of shards this should be the name + of the sharding shard container. + :param timestamp: an instance of :class:`~swift.common.utils.Timestamp` + :param index: a unique index that will distinguish the path from any + other path generated using the same combination of + ``shards_account``, ``root_container``, ``parent_container`` and + ``timestamp``. 
+ :return: a string of the form <account>/<container> + """ + shard_container = cls._make_container_name( + root_container, parent_container, timestamp, index) + return '%s/%s' % (shards_account, shard_container) + + @classmethod + def _to_timestamp(cls, timestamp): + if timestamp is None or isinstance(timestamp, Timestamp): + return timestamp + return Timestamp(timestamp) + + @property + def name(self): + return '%s/%s' % (self.account, self.container) + + @name.setter + def name(self, path): + path = self._encode(path) + if not path or len(path.split('/')) != 2 or not all(path.split('/')): + raise ValueError( + "Name must be of the form '/', got %r" % + path) + self.account, self.container = path.split('/') + + @property + def timestamp(self): + return self._timestamp + + @timestamp.setter + def timestamp(self, ts): + if ts is None: + raise TypeError('timestamp cannot be None') + self._timestamp = self._to_timestamp(ts) + + @property + def meta_timestamp(self): + if self._meta_timestamp is None: + return self.timestamp + return self._meta_timestamp + + @meta_timestamp.setter + def meta_timestamp(self, ts): + self._meta_timestamp = self._to_timestamp(ts) + + @property + def lower(self): + return self._lower + + @property + def lower_str(self): + return str(self.lower) + + @lower.setter + def lower(self, value): + if value in (None, ''): + value = ShardRange.MIN + try: + value = self._encode_bound(value) + except TypeError as err: + raise TypeError('lower %s' % err) + if value > self._upper: + raise ValueError( + 'lower (%r) must be less than or equal to upper (%r)' % + (value, self.upper)) + self._lower = value + + @property + def end_marker(self): + return self.upper_str + '\x00' if self.upper else '' + + @property + def upper(self): + return self._upper + + @property + def upper_str(self): + return str(self.upper) + + @upper.setter + def upper(self, value): + if value in (None, ''): + value = ShardRange.MAX + try: + value = self._encode_bound(value) + except TypeError as err: + 
raise TypeError('upper %s' % err) + if value < self._lower: + raise ValueError( + 'upper (%r) must be greater than or equal to lower (%r)' % + (value, self.lower)) + self._upper = value + + @property + def object_count(self): + return self._count + + @object_count.setter + def object_count(self, count): + count = int(count) + if count < 0: + raise ValueError('object_count cannot be < 0') + self._count = count + + @property + def bytes_used(self): + return self._bytes + + @bytes_used.setter + def bytes_used(self, bytes_used): + bytes_used = int(bytes_used) + if bytes_used < 0: + raise ValueError('bytes_used cannot be < 0') + self._bytes = bytes_used + + def update_meta(self, object_count, bytes_used, meta_timestamp=None): + """ + Set the object stats metadata to the given values and update the + meta_timestamp to the current time. + + :param object_count: should be an integer + :param bytes_used: should be an integer + :param meta_timestamp: timestamp for metadata; if not given the + current time will be set. + :raises ValueError: if ``object_count`` or ``bytes_used`` cannot be + cast to an int, or if meta_timestamp is neither None nor can be + cast to a :class:`~swift.common.utils.Timestamp`. + """ + self.object_count = int(object_count) + self.bytes_used = int(bytes_used) + if meta_timestamp is None: + self.meta_timestamp = Timestamp.now() + else: + self.meta_timestamp = meta_timestamp + + def increment_meta(self, object_count, bytes_used): + """ + Increment the object stats metadata by the given values and update the + meta_timestamp to the current time. + + :param object_count: should be an integer + :param bytes_used: should be an integer + :raises ValueError: if ``object_count`` or ``bytes_used`` cannot be + cast to an int. 
+ """ + self.update_meta(self.object_count + int(object_count), + self.bytes_used + int(bytes_used)) + + @classmethod + def resolve_state(cls, state): + """ + Given a value that may be either the name or the number of a state + return a tuple of (state number, state name). + + :param state: Either a string state name or an integer state number. + :return: A tuple (state number, state name) + :raises ValueError: if ``state`` is neither a valid state name nor a + valid state number. + """ + try: + state = state.lower() + state_num = cls.STATES_BY_NAME[state] + except (KeyError, AttributeError): + try: + state_name = cls.STATES[state] + except KeyError: + raise ValueError('Invalid state %r' % state) + else: + state_num = state + else: + state_name = state + return state_num, state_name + + @property + def state(self): + return self._state + + @state.setter + def state(self, state): + try: + float_state = float(state) + int_state = int(float_state) + except (ValueError, TypeError): + raise ValueError('Invalid state %r' % state) + if int_state != float_state or int_state not in self.STATES: + raise ValueError('Invalid state %r' % state) + self._state = int_state + + @property + def state_text(self): + return self.STATES[self.state] + + @property + def state_timestamp(self): + if self._state_timestamp is None: + return self.timestamp + return self._state_timestamp + + @state_timestamp.setter + def state_timestamp(self, ts): + self._state_timestamp = self._to_timestamp(ts) + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, epoch): + self._epoch = self._to_timestamp(epoch) + + def update_state(self, state, state_timestamp=None): + """ + Set state to the given value and optionally update the state_timestamp + to the given time. + + :param state: new state, should be an integer + :param state_timestamp: timestamp for state; if not given the + state_timestamp will not be changed. 
+ :return: True if the state or state_timestamp was changed, False + otherwise + """ + if state_timestamp is None and self.state == state: + return False + self.state = state + if state_timestamp is not None: + self.state_timestamp = state_timestamp + return True + + @property + def deleted(self): + return self._deleted + + @deleted.setter + def deleted(self, value): + self._deleted = bool(value) + + def set_deleted(self, timestamp=None): + """ + Mark the shard range deleted and set timestamp to the current time. + + :param timestamp: optional timestamp to set; if not given the + current time will be set. + :return: True if the deleted attribute or timestamp was changed, False + otherwise + """ + if timestamp is None and self.deleted: + return False + self.deleted = True + self.timestamp = timestamp or Timestamp.now() + return True + + def __contains__(self, item): + # test if the given item is within the namespace + if item == '': + return False + item = self._encode_bound(item) + return self.lower < item <= self.upper + + def __lt__(self, other): + # a ShardRange is less than other if its entire namespace is less than + # other; if other is another ShardRange that implies that this + # ShardRange's upper must be less than or equal to the other + # ShardRange's lower + if self.upper == ShardRange.MAX: + return False + if isinstance(other, ShardRange): + return self.upper <= other.lower + elif other is None: + return True + else: + return self.upper < other + + def __gt__(self, other): + # a ShardRange is greater than other if its entire namespace is greater + # than other; if other is another ShardRange that implies that this + # ShardRange's lower must be greater than or equal to the other + # ShardRange's upper + if self.lower == ShardRange.MIN: + return False + if isinstance(other, ShardRange): + return self.lower >= other.upper + elif other is None: + return False + else: + return self.lower >= other + + def __eq__(self, other): + # test for equality of 
range bounds only + if not isinstance(other, ShardRange): + return False + return self.lower == other.lower and self.upper == other.upper + + def __ne__(self, other): + return not (self == other) + + def __repr__(self): + return '%s<%r to %r as of %s, (%d, %d) as of %s, %s as of %s>' % ( + self.__class__.__name__, self.lower, self.upper, + self.timestamp.internal, self.object_count, self.bytes_used, + self.meta_timestamp.internal, self.state_text, + self.state_timestamp.internal) + + def entire_namespace(self): + """ + Returns True if the ShardRange includes the entire namespace, False + otherwise. + """ + return (self.lower == ShardRange.MIN and + self.upper == ShardRange.MAX) + + def overlaps(self, other): + """ + Returns True if the ShardRange namespace overlaps with the other + ShardRange's namespace. + + :param other: an instance of :class:`~swift.common.utils.ShardRange` + """ + if not isinstance(other, ShardRange): + return False + return max(self.lower, other.lower) < min(self.upper, other.upper) + + def includes(self, other): + """ + Returns True if this namespace includes the whole of the other + namespace, False otherwise. + + :param other: an instance of :class:`~swift.common.utils.ShardRange` + """ + return (self.lower <= other.lower) and (other.upper <= self.upper) + + def __iter__(self): + yield 'name', self.name + yield 'timestamp', self.timestamp.internal + yield 'lower', str(self.lower) + yield 'upper', str(self.upper) + yield 'object_count', self.object_count + yield 'bytes_used', self.bytes_used + yield 'meta_timestamp', self.meta_timestamp.internal + yield 'deleted', 1 if self.deleted else 0 + yield 'state', self.state + yield 'state_timestamp', self.state_timestamp.internal + yield 'epoch', self.epoch.internal if self.epoch is not None else None + + def copy(self, timestamp=None, **kwargs): + """ + Creates a copy of the ShardRange. 
+ + :param timestamp: (optional) If given, the returned ShardRange will + have all of its timestamps set to this value. Otherwise the + returned ShardRange will have the original timestamps. + :return: an instance of :class:`~swift.common.utils.ShardRange` + """ + new = ShardRange.from_dict(dict(self, **kwargs)) + if timestamp: + new.timestamp = timestamp + new.meta_timestamp = new.state_timestamp = None + return new + + @classmethod + def from_dict(cls, params): + """ + Return an instance constructed using the given dict of params. This + method is deliberately less flexible than the class `__init__()` method + and requires all of the `__init__()` args to be given in the dict of + params. + + :param params: a dict of parameters + :return: an instance of this class + """ + return cls( + params['name'], params['timestamp'], params['lower'], + params['upper'], params['object_count'], params['bytes_used'], + params['meta_timestamp'], params['deleted'], params['state'], + params['state_timestamp'], params['epoch']) + + +def find_shard_range(item, ranges): + """ + Find a ShardRange in given list of ``ranges`` whose namespace + contains ``item``. + + :param item: The item for which a ShardRange is to be found. + :param ranges: a sorted list of ShardRanges. + :return: the ShardRange whose namespace contains ``item``, or None if + no suitable range is found. + """ + index = bisect.bisect_left(ranges, item) + if index != len(ranges) and item in ranges[index]: + return ranges[index] + return None + + def modify_priority(conf, logger): """ Modify priority by nice and ionice. 
diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index 48724eac33..1f495ac876 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -15,7 +15,11 @@ """Tests for swift.common.utils""" from __future__ import print_function -from test.unit import temptree, debug_logger, make_timestamp_iter, with_tempdir + +import hashlib + +from test.unit import temptree, debug_logger, make_timestamp_iter, \ + with_tempdir, mock_timestamp_now import ctypes import contextlib @@ -3816,6 +3820,64 @@ cluster_dfw1 = http://dfw1.host/v1/ if tempdir: shutil.rmtree(tempdir) + def test_find_shard_range(self): + ts = utils.Timestamp.now().internal + start = utils.ShardRange('a/-a', ts, '', 'a') + atof = utils.ShardRange('a/a-f', ts, 'a', 'f') + ftol = utils.ShardRange('a/f-l', ts, 'f', 'l') + ltor = utils.ShardRange('a/l-r', ts, 'l', 'r') + rtoz = utils.ShardRange('a/r-z', ts, 'r', 'z') + end = utils.ShardRange('a/z-', ts, 'z', '') + ranges = [start, atof, ftol, ltor, rtoz, end] + + found = utils.find_shard_range('', ranges) + self.assertEqual(found, None) + found = utils.find_shard_range(' ', ranges) + self.assertEqual(found, start) + found = utils.find_shard_range(' ', ranges[1:]) + self.assertEqual(found, None) + found = utils.find_shard_range('b', ranges) + self.assertEqual(found, atof) + found = utils.find_shard_range('f', ranges) + self.assertEqual(found, atof) + found = utils.find_shard_range('f\x00', ranges) + self.assertEqual(found, ftol) + found = utils.find_shard_range('x', ranges) + self.assertEqual(found, rtoz) + found = utils.find_shard_range('r', ranges) + self.assertEqual(found, ltor) + found = utils.find_shard_range('}', ranges) + self.assertEqual(found, end) + found = utils.find_shard_range('}', ranges[:-1]) + self.assertEqual(found, None) + # remove l-r from list of ranges and try and find a shard range for an + # item in that range. 
+ found = utils.find_shard_range('p', ranges[:-3] + ranges[-2:]) + self.assertEqual(found, None) + + # add some sub-shards; a sub-shard's state is less than its parent + # while the parent is undeleted, so insert these ahead of the + # overlapping parent in the list of ranges + ftoh = utils.ShardRange('a/f-h', ts, 'f', 'h') + htok = utils.ShardRange('a/h-k', ts, 'h', 'k') + + overlapping_ranges = ranges[:2] + [ftoh, htok] + ranges[2:] + found = utils.find_shard_range('g', overlapping_ranges) + self.assertEqual(found, ftoh) + found = utils.find_shard_range('h', overlapping_ranges) + self.assertEqual(found, ftoh) + found = utils.find_shard_range('k', overlapping_ranges) + self.assertEqual(found, htok) + found = utils.find_shard_range('l', overlapping_ranges) + self.assertEqual(found, ftol) + found = utils.find_shard_range('m', overlapping_ranges) + self.assertEqual(found, ltor) + + ktol = utils.ShardRange('a/k-l', ts, 'k', 'l') + overlapping_ranges = ranges[:2] + [ftoh, htok, ktol] + ranges[2:] + found = utils.find_shard_range('l', overlapping_ranges) + self.assertEqual(found, ktol) + def test_modify_priority(self): pid = os.getpid() logger = debug_logger() @@ -6665,5 +6727,828 @@ class TestDistributeEvenly(unittest.TestCase): self.assertEqual(out, [[0], [1], [2], [3], [4], [], []]) +class TestShardRange(unittest.TestCase): + def setUp(self): + self.ts_iter = make_timestamp_iter() + + def test_min_max_bounds(self): + # max + self.assertEqual(utils.ShardRange.MAX, utils.ShardRange.MAX) + self.assertFalse(utils.ShardRange.MAX > utils.ShardRange.MAX) + self.assertFalse(utils.ShardRange.MAX < utils.ShardRange.MAX) + + for val in 'z', u'\u00e4': + self.assertFalse(utils.ShardRange.MAX == val) + self.assertFalse(val > utils.ShardRange.MAX) + self.assertTrue(val < utils.ShardRange.MAX) + self.assertTrue(utils.ShardRange.MAX > val) + self.assertFalse(utils.ShardRange.MAX < val) + + self.assertEqual('', str(utils.ShardRange.MAX)) + self.assertFalse(utils.ShardRange.MAX) + 
self.assertTrue(utils.ShardRange.MAX == utils.ShardRange.MAX) + self.assertFalse(utils.ShardRange.MAX != utils.ShardRange.MAX) + self.assertTrue( + utils.ShardRange.MaxBound() == utils.ShardRange.MaxBound()) + self.assertFalse( + utils.ShardRange.MaxBound() != utils.ShardRange.MaxBound()) + + # min + self.assertEqual(utils.ShardRange.MIN, utils.ShardRange.MIN) + self.assertFalse(utils.ShardRange.MIN > utils.ShardRange.MIN) + self.assertFalse(utils.ShardRange.MIN < utils.ShardRange.MIN) + + for val in 'z', u'\u00e4': + self.assertFalse(utils.ShardRange.MIN == val) + self.assertFalse(val < utils.ShardRange.MIN) + self.assertTrue(val > utils.ShardRange.MIN) + self.assertTrue(utils.ShardRange.MIN < val) + self.assertFalse(utils.ShardRange.MIN > val) + self.assertFalse(utils.ShardRange.MIN) + + self.assertEqual('', str(utils.ShardRange.MIN)) + self.assertFalse(utils.ShardRange.MIN) + self.assertTrue(utils.ShardRange.MIN == utils.ShardRange.MIN) + self.assertFalse(utils.ShardRange.MIN != utils.ShardRange.MIN) + self.assertTrue( + utils.ShardRange.MinBound() == utils.ShardRange.MinBound()) + self.assertFalse( + utils.ShardRange.MinBound() != utils.ShardRange.MinBound()) + + self.assertFalse(utils.ShardRange.MAX == utils.ShardRange.MIN) + self.assertFalse(utils.ShardRange.MIN == utils.ShardRange.MAX) + self.assertTrue(utils.ShardRange.MAX != utils.ShardRange.MIN) + self.assertTrue(utils.ShardRange.MIN != utils.ShardRange.MAX) + + self.assertEqual(utils.ShardRange.MAX, + max(utils.ShardRange.MIN, utils.ShardRange.MAX)) + self.assertEqual(utils.ShardRange.MIN, + min(utils.ShardRange.MIN, utils.ShardRange.MAX)) + + def test_shard_range_initialisation(self): + def assert_initialisation_ok(params, expected): + pr = utils.ShardRange(**params) + self.assertDictEqual(dict(pr), expected) + + def assert_initialisation_fails(params, err_type=ValueError): + with self.assertRaises(err_type): + utils.ShardRange(**params) + + ts_1 = next(self.ts_iter) + ts_2 = next(self.ts_iter) + ts_3 = 
next(self.ts_iter) + ts_4 = next(self.ts_iter) + empty_run = dict(name=None, timestamp=None, lower=None, + upper=None, object_count=0, bytes_used=0, + meta_timestamp=None, deleted=0, + state=utils.ShardRange.FOUND, state_timestamp=None, + epoch=None) + # name, timestamp must be given + assert_initialisation_fails(empty_run.copy()) + assert_initialisation_fails(dict(empty_run, name='a/c'), TypeError) + assert_initialisation_fails(dict(empty_run, timestamp=ts_1)) + # name must be form a/c + assert_initialisation_fails(dict(empty_run, name='c', timestamp=ts_1)) + assert_initialisation_fails(dict(empty_run, name='', timestamp=ts_1)) + assert_initialisation_fails(dict(empty_run, name='/a/c', + timestamp=ts_1)) + assert_initialisation_fails(dict(empty_run, name='/c', + timestamp=ts_1)) + # lower, upper can be None + expect = dict(name='a/c', timestamp=ts_1.internal, lower='', + upper='', object_count=0, bytes_used=0, + meta_timestamp=ts_1.internal, deleted=0, + state=utils.ShardRange.FOUND, + state_timestamp=ts_1.internal, epoch=None) + assert_initialisation_ok(dict(empty_run, name='a/c', timestamp=ts_1), + expect) + assert_initialisation_ok(dict(name='a/c', timestamp=ts_1), expect) + + good_run = dict(name='a/c', timestamp=ts_1, lower='l', + upper='u', object_count=2, bytes_used=10, + meta_timestamp=ts_2, deleted=0, + state=utils.ShardRange.CREATED, + state_timestamp=ts_3.internal, epoch=ts_4) + expect.update({'lower': 'l', 'upper': 'u', 'object_count': 2, + 'bytes_used': 10, 'meta_timestamp': ts_2.internal, + 'state': utils.ShardRange.CREATED, + 'state_timestamp': ts_3.internal, 'epoch': ts_4}) + assert_initialisation_ok(good_run.copy(), expect) + + # obj count and bytes used as int strings + good_str_run = good_run.copy() + good_str_run.update({'object_count': '2', 'bytes_used': '10'}) + assert_initialisation_ok(good_str_run, expect) + + good_no_meta = good_run.copy() + good_no_meta.pop('meta_timestamp') + assert_initialisation_ok(good_no_meta, + dict(expect, 
meta_timestamp=ts_1.internal)) + + good_deleted = good_run.copy() + good_deleted['deleted'] = 1 + assert_initialisation_ok(good_deleted, + dict(expect, deleted=1)) + + assert_initialisation_fails(dict(good_run, timestamp='water balloon')) + + assert_initialisation_fails( + dict(good_run, meta_timestamp='water balloon')) + + assert_initialisation_fails(dict(good_run, lower='water balloon')) + + assert_initialisation_fails(dict(good_run, upper='balloon')) + + assert_initialisation_fails( + dict(good_run, object_count='water balloon')) + + assert_initialisation_fails(dict(good_run, bytes_used='water ballon')) + + assert_initialisation_fails(dict(good_run, object_count=-1)) + + assert_initialisation_fails(dict(good_run, bytes_used=-1)) + assert_initialisation_fails(dict(good_run, state=-1)) + assert_initialisation_fails(dict(good_run, state_timestamp='not a ts')) + assert_initialisation_fails(dict(good_run, name='/a/c')) + assert_initialisation_fails(dict(good_run, name='/a/c/')) + assert_initialisation_fails(dict(good_run, name='a/c/')) + assert_initialisation_fails(dict(good_run, name='a')) + assert_initialisation_fails(dict(good_run, name='')) + + def _check_to_from_dict(self, lower, upper): + ts_1 = next(self.ts_iter) + ts_2 = next(self.ts_iter) + ts_3 = next(self.ts_iter) + ts_4 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, lower, upper, 10, 100, ts_2, + state=None, state_timestamp=ts_3, epoch=ts_4) + sr_dict = dict(sr) + expected = { + 'name': 'a/test', 'timestamp': ts_1.internal, 'lower': lower, + 'upper': upper, 'object_count': 10, 'bytes_used': 100, + 'meta_timestamp': ts_2.internal, 'deleted': 0, + 'state': utils.ShardRange.FOUND, 'state_timestamp': ts_3.internal, + 'epoch': ts_4} + self.assertEqual(expected, sr_dict) + self.assertIsInstance(sr_dict['lower'], six.string_types) + self.assertIsInstance(sr_dict['upper'], six.string_types) + sr_new = utils.ShardRange.from_dict(sr_dict) + self.assertEqual(sr, sr_new) + self.assertEqual(sr_dict, 
dict(sr_new)) + + sr_new = utils.ShardRange(**sr_dict) + self.assertEqual(sr, sr_new) + self.assertEqual(sr_dict, dict(sr_new)) + + for key in sr_dict: + bad_dict = dict(sr_dict) + bad_dict.pop(key) + with self.assertRaises(KeyError): + utils.ShardRange.from_dict(bad_dict) + # But __init__ still (generally) works! + if key not in ('name', 'timestamp'): + utils.ShardRange(**bad_dict) + else: + with self.assertRaises(TypeError): + utils.ShardRange(**bad_dict) + + def test_to_from_dict(self): + self._check_to_from_dict('l', 'u') + self._check_to_from_dict('', '') + + def test_timestamp_setter(self): + ts_1 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, 'l', 'u', 0, 0, None) + self.assertEqual(ts_1, sr.timestamp) + + ts_2 = next(self.ts_iter) + sr.timestamp = ts_2 + self.assertEqual(ts_2, sr.timestamp) + + sr.timestamp = 0 + self.assertEqual(utils.Timestamp(0), sr.timestamp) + + with self.assertRaises(TypeError): + sr.timestamp = None + + def test_meta_timestamp_setter(self): + ts_1 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, 'l', 'u', 0, 0, None) + self.assertEqual(ts_1, sr.timestamp) + self.assertEqual(ts_1, sr.meta_timestamp) + + ts_2 = next(self.ts_iter) + sr.meta_timestamp = ts_2 + self.assertEqual(ts_1, sr.timestamp) + self.assertEqual(ts_2, sr.meta_timestamp) + + ts_3 = next(self.ts_iter) + sr.timestamp = ts_3 + self.assertEqual(ts_3, sr.timestamp) + self.assertEqual(ts_2, sr.meta_timestamp) + + # meta_timestamp defaults to tracking timestamp + sr.meta_timestamp = None + self.assertEqual(ts_3, sr.timestamp) + self.assertEqual(ts_3, sr.meta_timestamp) + ts_4 = next(self.ts_iter) + sr.timestamp = ts_4 + self.assertEqual(ts_4, sr.timestamp) + self.assertEqual(ts_4, sr.meta_timestamp) + + sr.meta_timestamp = 0 + self.assertEqual(ts_4, sr.timestamp) + self.assertEqual(utils.Timestamp(0), sr.meta_timestamp) + + def test_update_meta(self): + ts_1 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, 'l', 'u', 0, 0, None) + with 
mock_timestamp_now(next(self.ts_iter)) as now: + sr.update_meta(9, 99) + self.assertEqual(9, sr.object_count) + self.assertEqual(99, sr.bytes_used) + self.assertEqual(now, sr.meta_timestamp) + + with mock_timestamp_now(next(self.ts_iter)) as now: + sr.update_meta(99, 999, None) + self.assertEqual(99, sr.object_count) + self.assertEqual(999, sr.bytes_used) + self.assertEqual(now, sr.meta_timestamp) + + ts_2 = next(self.ts_iter) + sr.update_meta(21, 2112, ts_2) + self.assertEqual(21, sr.object_count) + self.assertEqual(2112, sr.bytes_used) + self.assertEqual(ts_2, sr.meta_timestamp) + + sr.update_meta('11', '12') + self.assertEqual(11, sr.object_count) + self.assertEqual(12, sr.bytes_used) + + def check_bad_args(*args): + with self.assertRaises(ValueError): + sr.update_meta(*args) + check_bad_args('bad', 10) + check_bad_args(10, 'bad') + check_bad_args(10, 11, 'bad') + + def test_increment_meta(self): + ts_1 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, 'l', 'u', 1, 2, None) + with mock_timestamp_now(next(self.ts_iter)) as now: + sr.increment_meta(9, 99) + self.assertEqual(10, sr.object_count) + self.assertEqual(101, sr.bytes_used) + self.assertEqual(now, sr.meta_timestamp) + + sr.increment_meta('11', '12') + self.assertEqual(21, sr.object_count) + self.assertEqual(113, sr.bytes_used) + + def check_bad_args(*args): + with self.assertRaises(ValueError): + sr.increment_meta(*args) + check_bad_args('bad', 10) + check_bad_args(10, 'bad') + + def test_state_timestamp_setter(self): + ts_1 = next(self.ts_iter) + sr = utils.ShardRange('a/test', ts_1, 'l', 'u', 0, 0, None) + self.assertEqual(ts_1, sr.timestamp) + self.assertEqual(ts_1, sr.state_timestamp) + + ts_2 = next(self.ts_iter) + sr.state_timestamp = ts_2 + self.assertEqual(ts_1, sr.timestamp) + self.assertEqual(ts_2, sr.state_timestamp) + + ts_3 = next(self.ts_iter) + sr.timestamp = ts_3 + self.assertEqual(ts_3, sr.timestamp) + self.assertEqual(ts_2, sr.state_timestamp) + + # state_timestamp defaults to 
tracking timestamp + sr.state_timestamp = None + self.assertEqual(ts_3, sr.timestamp) + self.assertEqual(ts_3, sr.state_timestamp) + ts_4 = next(self.ts_iter) + sr.timestamp = ts_4 + self.assertEqual(ts_4, sr.timestamp) + self.assertEqual(ts_4, sr.state_timestamp) + + sr.state_timestamp = 0 + self.assertEqual(ts_4, sr.timestamp) + self.assertEqual(utils.Timestamp(0), sr.state_timestamp) + + def test_state_setter(self): + for state in utils.ShardRange.STATES: + for test_value in (state, str(state)): + sr = utils.ShardRange('a/test', next(self.ts_iter), 'l', 'u') + sr.state = test_value + actual = sr.state + self.assertEqual( + state, actual, + 'Expected %s but got %s for %s' % + (state, actual, test_value) + ) + + for bad_state in (max(utils.ShardRange.STATES) + 1, + -1, 99, None, 'stringy', 1.1): + sr = utils.ShardRange('a/test', next(self.ts_iter), 'l', 'u') + with self.assertRaises(ValueError) as cm: + sr.state = bad_state + self.assertIn('Invalid state', str(cm.exception)) + + def test_update_state(self): + sr = utils.ShardRange('a/c', next(self.ts_iter)) + old_sr = sr.copy() + self.assertEqual(utils.ShardRange.FOUND, sr.state) + self.assertEqual(dict(sr), dict(old_sr)) # sanity check + + for state in utils.ShardRange.STATES: + if state == utils.ShardRange.FOUND: + continue + self.assertTrue(sr.update_state(state)) + self.assertEqual(dict(old_sr, state=state), dict(sr)) + self.assertFalse(sr.update_state(state)) + self.assertEqual(dict(old_sr, state=state), dict(sr)) + + sr = utils.ShardRange('a/c', next(self.ts_iter)) + old_sr = sr.copy() + for state in utils.ShardRange.STATES: + ts = next(self.ts_iter) + self.assertTrue(sr.update_state(state, state_timestamp=ts)) + self.assertEqual(dict(old_sr, state=state, state_timestamp=ts), + dict(sr)) + + def test_resolve_state(self): + for name, number in utils.ShardRange.STATES_BY_NAME.items(): + self.assertEqual( + (number, name), utils.ShardRange.resolve_state(name)) + self.assertEqual( + (number, name), 
utils.ShardRange.resolve_state(name.upper())) + self.assertEqual( + (number, name), utils.ShardRange.resolve_state(name.title())) + self.assertEqual( + (number, name), utils.ShardRange.resolve_state(number)) + + def check_bad_value(value): + with self.assertRaises(ValueError) as cm: + utils.ShardRange.resolve_state(value) + self.assertIn('Invalid state %r' % value, str(cm.exception)) + + check_bad_value(min(utils.ShardRange.STATES) - 1) + check_bad_value(max(utils.ShardRange.STATES) + 1) + check_bad_value('badstate') + + def test_epoch_setter(self): + sr = utils.ShardRange('a/c', next(self.ts_iter)) + self.assertIsNone(sr.epoch) + ts = next(self.ts_iter) + sr.epoch = ts + self.assertEqual(ts, sr.epoch) + ts = next(self.ts_iter) + sr.epoch = ts.internal + self.assertEqual(ts, sr.epoch) + sr.epoch = None + self.assertIsNone(sr.epoch) + with self.assertRaises(ValueError): + sr.epoch = 'bad' + + def test_deleted_setter(self): + sr = utils.ShardRange('a/c', next(self.ts_iter)) + for val in (True, 1): + sr.deleted = val + self.assertIs(True, sr.deleted) + for val in (False, 0, None): + sr.deleted = val + self.assertIs(False, sr.deleted) + + def test_set_deleted(self): + sr = utils.ShardRange('a/c', next(self.ts_iter)) + # initialise other timestamps + sr.update_state(utils.ShardRange.ACTIVE, + state_timestamp=utils.Timestamp.now()) + sr.update_meta(1, 2) + old_sr = sr.copy() + self.assertIs(False, sr.deleted) # sanity check + self.assertEqual(dict(sr), dict(old_sr)) # sanity check + + with mock_timestamp_now(next(self.ts_iter)) as now: + self.assertTrue(sr.set_deleted()) + self.assertEqual(now, sr.timestamp) + self.assertIs(True, sr.deleted) + old_sr_dict = dict(old_sr) + old_sr_dict.pop('deleted') + old_sr_dict.pop('timestamp') + sr_dict = dict(sr) + sr_dict.pop('deleted') + sr_dict.pop('timestamp') + self.assertEqual(old_sr_dict, sr_dict) + + # no change + self.assertFalse(sr.set_deleted()) + self.assertEqual(now, sr.timestamp) + self.assertIs(True, sr.deleted) + + # 
force timestamp change + with mock_timestamp_now(next(self.ts_iter)) as now: + self.assertTrue(sr.set_deleted(timestamp=now)) + self.assertEqual(now, sr.timestamp) + self.assertIs(True, sr.deleted) + + def test_lower_setter(self): + sr = utils.ShardRange('a/c', utils.Timestamp.now(), 'b', '') + # sanity checks + self.assertEqual('b', sr.lower) + self.assertEqual(sr.MAX, sr.upper) + + def do_test(good_value, expected): + sr.lower = good_value + self.assertEqual(expected, sr.lower) + self.assertEqual(sr.MAX, sr.upper) + + do_test(utils.ShardRange.MIN, utils.ShardRange.MIN) + do_test(utils.ShardRange.MAX, utils.ShardRange.MAX) + do_test('', utils.ShardRange.MIN) + do_test(u'', utils.ShardRange.MIN) + do_test(None, utils.ShardRange.MIN) + do_test('a', 'a') + do_test('y', 'y') + + sr = utils.ShardRange('a/c', utils.Timestamp.now(), 'b', 'y') + sr.lower = '' + self.assertEqual(sr.MIN, sr.lower) + + sr = utils.ShardRange('a/c', utils.Timestamp.now(), 'b', 'y') + with self.assertRaises(ValueError) as cm: + sr.lower = 'z' + self.assertIn("lower ('z') must be less than or equal to upper ('y')", + str(cm.exception)) + self.assertEqual('b', sr.lower) + self.assertEqual('y', sr.upper) + + def do_test(bad_value): + with self.assertRaises(TypeError) as cm: + sr.lower = bad_value + self.assertIn("lower must be a string", str(cm.exception)) + self.assertEqual('b', sr.lower) + self.assertEqual('y', sr.upper) + + do_test(1) + do_test(1.234) + + def test_upper_setter(self): + sr = utils.ShardRange('a/c', utils.Timestamp.now(), '', 'y') + # sanity checks + self.assertEqual(sr.MIN, sr.lower) + self.assertEqual('y', sr.upper) + + def do_test(good_value, expected): + sr.upper = good_value + self.assertEqual(expected, sr.upper) + self.assertEqual(sr.MIN, sr.lower) + + do_test(utils.ShardRange.MIN, utils.ShardRange.MIN) + do_test(utils.ShardRange.MAX, utils.ShardRange.MAX) + do_test('', utils.ShardRange.MAX) + do_test(u'', utils.ShardRange.MAX) + do_test(None, utils.ShardRange.MAX) + 
do_test('z', 'z') + do_test('b', 'b') + + sr = utils.ShardRange('a/c', utils.Timestamp.now(), 'b', 'y') + sr.upper = '' + self.assertEqual(sr.MAX, sr.upper) + + sr = utils.ShardRange('a/c', utils.Timestamp.now(), 'b', 'y') + with self.assertRaises(ValueError) as cm: + sr.upper = 'a' + self.assertIn( + "upper ('a') must be greater than or equal to lower ('b')", + str(cm.exception)) + self.assertEqual('b', sr.lower) + self.assertEqual('y', sr.upper) + + def do_test(bad_value): + with self.assertRaises(TypeError) as cm: + sr.upper = bad_value + self.assertIn("upper must be a string", str(cm.exception)) + self.assertEqual('b', sr.lower) + self.assertEqual('y', sr.upper) + + do_test(1) + do_test(1.234) + + def test_end_marker(self): + sr = utils.ShardRange('a/c', utils.Timestamp.now(), '', 'y') + self.assertEqual('y\x00', sr.end_marker) + sr = utils.ShardRange('a/c', utils.Timestamp.now(), '', '') + self.assertEqual('', sr.end_marker) + + def test_bounds_serialization(self): + sr = utils.ShardRange('a/c', utils.Timestamp.now()) + self.assertEqual('a/c', sr.name) + self.assertEqual(utils.ShardRange.MIN, sr.lower) + self.assertEqual('', sr.lower_str) + self.assertEqual(utils.ShardRange.MAX, sr.upper) + self.assertEqual('', sr.upper_str) + self.assertEqual('', sr.end_marker) + + lower = u'\u00e4' + upper = u'\u00fb' + sr = utils.ShardRange('a/%s-%s' % (lower, upper), + utils.Timestamp.now(), lower, upper) + if six.PY3: + self.assertEqual(u'\u00e4', sr.lower) + self.assertEqual(u'\u00e4', sr.lower_str) + self.assertEqual(u'\u00fb', sr.upper) + self.assertEqual(u'\u00fb', sr.upper_str) + self.assertEqual(u'\u00fb\x00', sr.end_marker) + else: + self.assertEqual(u'\u00e4'.encode('utf8'), sr.lower) + self.assertEqual(u'\u00e4'.encode('utf8'), sr.lower_str) + self.assertEqual(u'\u00fb'.encode('utf8'), sr.upper) + self.assertEqual(u'\u00fb'.encode('utf8'), sr.upper_str) + self.assertEqual(u'\u00fb\x00'.encode('utf8'), sr.end_marker) + + def test_entire_namespace(self): + # test 
entire range (no boundaries) + entire = utils.ShardRange('a/test', utils.Timestamp.now()) + self.assertEqual(utils.ShardRange.MAX, entire.upper) + self.assertEqual(utils.ShardRange.MIN, entire.lower) + self.assertIs(True, entire.entire_namespace()) + + for x in range(100): + self.assertTrue(str(x) in entire) + self.assertTrue(chr(x) in entire) + + for x in ('a', 'z', 'zzzz', '124fsdf', u'\u00e4'): + self.assertTrue(x in entire, '%r should be in %r' % (x, entire)) + + entire.lower = 'a' + self.assertIs(False, entire.entire_namespace()) + + def test_comparisons(self): + ts = utils.Timestamp.now().internal + + # upper (if provided) *must* be greater than lower + with self.assertRaises(ValueError): + utils.ShardRange('f-a', ts, 'f', 'a') + + # test basic boundaries + btoc = utils.ShardRange('a/b-c', ts, 'b', 'c') + atof = utils.ShardRange('a/a-f', ts, 'a', 'f') + ftol = utils.ShardRange('a/f-l', ts, 'f', 'l') + ltor = utils.ShardRange('a/l-r', ts, 'l', 'r') + rtoz = utils.ShardRange('a/r-z', ts, 'r', 'z') + lower = utils.ShardRange('a/lower', ts, '', 'mid') + upper = utils.ShardRange('a/upper', ts, 'mid', '') + entire = utils.ShardRange('a/test', utils.Timestamp.now()) + + # overlapping ranges + dtof = utils.ShardRange('a/d-f', ts, 'd', 'f') + dtom = utils.ShardRange('a/d-m', ts, 'd', 'm') + + # test range > and < + # non-adjacent + self.assertFalse(rtoz < atof) + self.assertTrue(atof < ltor) + self.assertTrue(ltor > atof) + self.assertFalse(ftol > rtoz) + + # adjacent + self.assertFalse(rtoz < ltor) + self.assertTrue(ltor < rtoz) + self.assertFalse(ltor > rtoz) + self.assertTrue(rtoz > ltor) + + # wholly within + self.assertFalse(btoc < atof) + self.assertFalse(btoc > atof) + self.assertFalse(atof < btoc) + self.assertFalse(atof > btoc) + + self.assertFalse(atof < dtof) + self.assertFalse(dtof > atof) + self.assertFalse(atof > dtof) + self.assertFalse(dtof < atof) + + self.assertFalse(dtof < dtom) + self.assertFalse(dtof > dtom) + self.assertFalse(dtom > dtof) + 
self.assertFalse(dtom < dtof) + + # overlaps + self.assertFalse(atof < dtom) + self.assertFalse(atof > dtom) + self.assertFalse(ltor > dtom) + + # ranges including min/max bounds + self.assertTrue(upper > lower) + self.assertTrue(lower < upper) + self.assertFalse(upper < lower) + self.assertFalse(lower > upper) + + self.assertFalse(lower < entire) + self.assertFalse(entire > lower) + self.assertFalse(lower > entire) + self.assertFalse(entire < lower) + + self.assertFalse(upper < entire) + self.assertFalse(entire > upper) + self.assertFalse(upper > entire) + self.assertFalse(entire < upper) + + self.assertFalse(entire < entire) + self.assertFalse(entire > entire) + + # test range < and > to an item + # range is > lower and <= upper to lower boundary isn't + # actually included + self.assertTrue(ftol > 'f') + self.assertFalse(atof < 'f') + self.assertTrue(ltor < 'y') + + self.assertFalse(ftol < 'f') + self.assertFalse(atof > 'f') + self.assertFalse(ltor > 'y') + + self.assertTrue('f' < ftol) + self.assertFalse('f' > atof) + self.assertTrue('y' > ltor) + + self.assertFalse('f' > ftol) + self.assertFalse('f' < atof) + self.assertFalse('y' < ltor) + + # Now test ranges with only 1 boundary + start_to_l = utils.ShardRange('a/None-l', ts, '', 'l') + l_to_end = utils.ShardRange('a/l-None', ts, 'l', '') + + for x in ('l', 'm', 'z', 'zzz1231sd'): + if x == 'l': + self.assertFalse(x in l_to_end) + self.assertFalse(start_to_l < x) + self.assertFalse(x > start_to_l) + else: + self.assertTrue(x in l_to_end) + self.assertTrue(start_to_l < x) + self.assertTrue(x > start_to_l) + + # Now test some of the range to range checks with missing boundaries + self.assertFalse(atof < start_to_l) + self.assertFalse(start_to_l < entire) + + # Now test ShardRange.overlaps(other) + self.assertTrue(atof.overlaps(atof)) + self.assertFalse(atof.overlaps(ftol)) + self.assertFalse(ftol.overlaps(atof)) + self.assertTrue(atof.overlaps(dtof)) + self.assertTrue(dtof.overlaps(atof)) + 
self.assertFalse(dtof.overlaps(ftol)) + self.assertTrue(dtom.overlaps(ftol)) + self.assertTrue(ftol.overlaps(dtom)) + self.assertFalse(start_to_l.overlaps(l_to_end)) + + def test_contains(self): + ts = utils.Timestamp.now().internal + lower = utils.ShardRange('a/-h', ts, '', 'h') + mid = utils.ShardRange('a/h-p', ts, 'h', 'p') + upper = utils.ShardRange('a/p-', ts, 'p', '') + entire = utils.ShardRange('a/all', ts, '', '') + + self.assertTrue('a' in entire) + self.assertTrue('x' in entire) + + # the empty string is not a valid object name, so it cannot be in any + # range + self.assertFalse('' in lower) + self.assertFalse('' in upper) + self.assertFalse('' in entire) + + self.assertTrue('a' in lower) + self.assertTrue('h' in lower) + self.assertFalse('i' in lower) + + self.assertFalse('h' in mid) + self.assertTrue('p' in mid) + + self.assertFalse('p' in upper) + self.assertTrue('x' in upper) + + self.assertIn(utils.ShardRange.MAX, entire) + self.assertNotIn(utils.ShardRange.MAX, lower) + self.assertIn(utils.ShardRange.MAX, upper) + + # lower bound is excluded so MIN cannot be in any range. 
+ self.assertNotIn(utils.ShardRange.MIN, entire) + self.assertNotIn(utils.ShardRange.MIN, upper) + self.assertNotIn(utils.ShardRange.MIN, lower) + + def test_includes(self): + ts = utils.Timestamp.now().internal + _to_h = utils.ShardRange('a/-h', ts, '', 'h') + d_to_t = utils.ShardRange('a/d-t', ts, 'd', 't') + d_to_k = utils.ShardRange('a/d-k', ts, 'd', 'k') + e_to_l = utils.ShardRange('a/e-l', ts, 'e', 'l') + k_to_t = utils.ShardRange('a/k-t', ts, 'k', 't') + p_to_ = utils.ShardRange('a/p-', ts, 'p', '') + t_to_ = utils.ShardRange('a/t-', ts, 't', '') + entire = utils.ShardRange('a/all', ts, '', '') + + self.assertTrue(entire.includes(entire)) + self.assertTrue(d_to_t.includes(d_to_t)) + self.assertTrue(_to_h.includes(_to_h)) + self.assertTrue(p_to_.includes(p_to_)) + + self.assertTrue(entire.includes(_to_h)) + self.assertTrue(entire.includes(d_to_t)) + self.assertTrue(entire.includes(p_to_)) + + self.assertTrue(d_to_t.includes(d_to_k)) + self.assertTrue(d_to_t.includes(e_to_l)) + self.assertTrue(d_to_t.includes(k_to_t)) + self.assertTrue(p_to_.includes(t_to_)) + + self.assertFalse(_to_h.includes(d_to_t)) + self.assertFalse(p_to_.includes(d_to_t)) + self.assertFalse(k_to_t.includes(d_to_k)) + self.assertFalse(d_to_k.includes(e_to_l)) + self.assertFalse(k_to_t.includes(e_to_l)) + self.assertFalse(t_to_.includes(p_to_)) + + self.assertFalse(_to_h.includes(entire)) + self.assertFalse(p_to_.includes(entire)) + self.assertFalse(d_to_t.includes(entire)) + + def test_repr(self): + ts = next(self.ts_iter) + ts.offset = 1234 + meta_ts = next(self.ts_iter) + state_ts = next(self.ts_iter) + sr = utils.ShardRange('a/c', ts, 'l', 'u', 100, 1000, + meta_timestamp=meta_ts, + state=utils.ShardRange.ACTIVE, + state_timestamp=state_ts) + self.assertEqual( + "ShardRange<'l' to 'u' as of %s, (100, 1000) as of %s, " + "active as of %s>" + % (ts.internal, meta_ts.internal, state_ts.internal), str(sr)) + + ts.offset = 0 + meta_ts.offset = 2 + state_ts.offset = 3 + sr = 
utils.ShardRange('a/c', ts, '', '', 100, 1000, + meta_timestamp=meta_ts, + state=utils.ShardRange.FOUND, + state_timestamp=state_ts) + self.assertEqual( + "ShardRange" + % (ts.internal, meta_ts.internal, state_ts.internal), str(sr)) + + def test_copy(self): + sr = utils.ShardRange('a/c', next(self.ts_iter), 'x', 'y', 99, 99000, + meta_timestamp=next(self.ts_iter), + state=utils.ShardRange.CREATED, + state_timestamp=next(self.ts_iter)) + new = sr.copy() + self.assertEqual(dict(sr), dict(new)) + + new = sr.copy(deleted=1) + self.assertEqual(dict(sr, deleted=1), dict(new)) + + new_timestamp = next(self.ts_iter) + new = sr.copy(timestamp=new_timestamp) + self.assertEqual(dict(sr, timestamp=new_timestamp.internal, + meta_timestamp=new_timestamp.internal, + state_timestamp=new_timestamp.internal), + dict(new)) + + new = sr.copy(timestamp=new_timestamp, object_count=99) + self.assertEqual(dict(sr, timestamp=new_timestamp.internal, + meta_timestamp=new_timestamp.internal, + state_timestamp=new_timestamp.internal, + object_count=99), + dict(new)) + + def test_make_path(self): + ts = utils.Timestamp.now() + actual = utils.ShardRange.make_path('a', 'root', 'parent', ts, 0) + parent_hash = hashlib.md5(b'parent').hexdigest() + self.assertEqual('a/root-%s-%s-0' % (parent_hash, ts.internal), actual) + actual = utils.ShardRange.make_path('a', 'root', 'parent', ts, 3) + self.assertEqual('a/root-%s-%s-3' % (parent_hash, ts.internal), actual) + actual = utils.ShardRange.make_path('a', 'root', 'parent', ts, '3') + self.assertEqual('a/root-%s-%s-3' % (parent_hash, ts.internal), actual) + actual = utils.ShardRange.make_path( + 'a', 'root', 'parent', ts.internal, '3') + self.assertEqual('a/root-%s-%s-3' % (parent_hash, ts.internal), actual) + actual = utils.ShardRange.make_path('a', 'root', 'parent', ts, 'foo') + self.assertEqual('a/root-%s-%s-foo' % (parent_hash, ts.internal), + actual) + + if __name__ == '__main__': unittest.main() From 14af38a8992c55925a7e6ed74eb9ba1b8d360ff1 Mon Sep 
17 00:00:00 2001 From: Alistair Coles Date: Tue, 1 May 2018 15:44:18 +0100 Subject: [PATCH 4/9] Add support for sharding in ContainerBroker With this patch the ContainerBroker gains several new features: 1. A shard_ranges table to persist ShardRange data, along with methods to merge and access ShardRange instances to that table, and to remove expired shard ranges. 2. The ability to create a fresh db file to replace the existing db file. Fresh db files are named using the hash of the container path plus an epoch which is a serialized Timestamp value, in the form: <hash>_<epoch>.db During sharding both the fresh and retiring db files co-exist on disk. The ContainerBroker is now able to choose the newest on disk db file when instantiated. It also provides a method (get_brokers()) to gain access to broker instance for either on disk file. 3. Methods to access the current state of the on disk db files i.e. UNSHARDED (old file only), SHARDING (fresh and retiring files), or SHARDED (fresh file only with shard ranges). Container replication is also modified: 1. shard ranges are replicated between container db peers. Unlike objects, shard ranges are both pushed and pulled during a REPLICATE event. 2. If a container db is capable of being sharded (i.e. it has a set of shard ranges) then it will no longer attempt to replicate objects to its peers. Object record durability is achieved by sharding rather than peer to peer replication. 
Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Change-Id: Ie4d2816259e6c25c346976e181fb9d350f947190 --- swift/cli/info.py | 29 +- swift/common/db_replicator.py | 45 +- swift/common/utils.py | 83 + swift/container/backend.py | 1273 ++++++++++++- swift/container/replicator.py | 92 +- test/unit/cli/test_info.py | 118 +- test/unit/common/test_db_replicator.py | 77 +- test/unit/common/test_utils.py | 105 ++ test/unit/container/test_backend.py | 2346 +++++++++++++++++++++++- test/unit/container/test_replicator.py | 1049 ++++++++++- test/unit/container/test_server.py | 4 +- 11 files changed, 5064 insertions(+), 157 deletions(-) diff --git a/swift/cli/info.py b/swift/cli/info.py index 0eee781ba6..1969435285 100644 --- a/swift/cli/info.py +++ b/swift/cli/info.py @@ -298,6 +298,27 @@ def print_db_info_metadata(db_type, info, metadata, drop_prefixes=False): else: print('No user metadata found in db file') + if db_type == 'container': + print('Sharding Metadata:') + shard_type = 'root' if info['is_root'] else 'shard' + print(' Type: %s' % shard_type) + print(' State: %s' % info['db_state']) + if info.get('shard_ranges'): + print('Shard Ranges (%d):' % len(info['shard_ranges'])) + for srange in info['shard_ranges']: + srange = dict(srange, state_text=srange.state_text) + print(' Name: %(name)s' % srange) + print(' lower: %(lower)r, upper: %(upper)r' % srange) + print(' Object Count: %(object_count)d, Bytes Used: ' + '%(bytes_used)d, State: %(state_text)s (%(state)d)' + % srange) + print(' Created at: %s (%s)' + % (Timestamp(srange['timestamp']).isoformat, + srange['timestamp'])) + print(' Meta Timestamp: %s (%s)' + % (Timestamp(srange['meta_timestamp']).isoformat, + srange['meta_timestamp'])) + def print_obj_metadata(metadata, drop_prefixes=False): """ @@ -406,7 +427,13 @@ def print_info(db_type, db_file, swift_dir='/etc/swift', stale_reads_ok=False, raise InfoSystemExit() raise account = info['account'] - container = info['container'] if 
db_type == 'container' else None + container = None + if db_type == 'container': + container = info['container'] + info['is_root'] = broker.is_root_container() + sranges = broker.get_shard_ranges() + if sranges: + info['shard_ranges'] = sranges print_db_info_metadata(db_type, info, broker.metadata, drop_prefixes) try: ring = Ring(swift_dir, ring_name=db_type) diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index c27914bffb..0d063cd455 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -33,7 +33,8 @@ from swift.common.direct_client import quote from swift.common.utils import get_logger, whataremyips, storage_directory, \ renamer, mkdirs, lock_parent_directory, config_true_value, \ unlink_older_than, dump_recon_cache, rsync_module_interpolation, \ - json, parse_override_options, round_robin_iter, Everything + json, parse_override_options, round_robin_iter, Everything, get_db_files, \ + parse_db_filename from swift.common import ring from swift.common.ring.utils import is_local_device from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \ @@ -120,14 +121,20 @@ def roundrobin_datadirs(datadirs): if not os.path.isdir(hash_dir): continue object_file = os.path.join(hash_dir, hsh + '.db') + # common case if os.path.exists(object_file): yield (partition, object_file, context) - else: - try: - os.rmdir(hash_dir) - except OSError as e: - if e.errno != errno.ENOTEMPTY: - raise + continue + # look for any alternate db filenames + db_files = get_db_files(object_file) + if db_files: + yield (partition, db_files[-1], context) + continue + try: + os.rmdir(hash_dir) + except OSError as e: + if e.errno != errno.ENOTEMPTY: + raise its = [walk_datadir(datadir, context, filt) for datadir, context, filt in datadirs] @@ -216,7 +223,7 @@ class Replicator(Daemon): self.stats = {'attempted': 0, 'success': 0, 'failure': 0, 'ts_repl': 0, 'no_change': 0, 'hashmatch': 0, 'rsync': 0, 'diff': 0, 'remove': 0, 'empty': 0, 
'remote_merge': 0, - 'start': time.time(), 'diff_capped': 0, + 'start': time.time(), 'diff_capped': 0, 'deferred': 0, 'failure_nodes': {}} def _report_stats(self): @@ -313,12 +320,13 @@ class Replicator(Daemon): different_region=different_region): return False with Timeout(replicate_timeout or self.node_timeout): - response = http.replicate(replicate_method, local_id) + response = http.replicate(replicate_method, local_id, + os.path.basename(broker.db_file)) return response and 200 <= response.status < 300 - def _send_merge_items(self, http, local_id, items): + def _send_replicate_request(self, http, *repl_args): with Timeout(self.node_timeout): - response = http.replicate('merge_items', items, local_id) + response = http.replicate(*repl_args) if not response or not is_success(response.status): if response: self.logger.error('ERROR Bad response %s from %s', @@ -350,7 +358,8 @@ class Replicator(Daemon): diffs = 0 while len(objects) and diffs < self.max_diffs: diffs += 1 - if not self._send_merge_items(http, local_id, objects): + if not self._send_replicate_request( + http, 'merge_items', objects, local_id): return False # replication relies on db order to send the next merge batch in # order with no gaps @@ -413,9 +422,8 @@ class Replicator(Daemon): :returns: ReplConnection object """ - return ReplConnection(node, partition, - os.path.basename(db_file).split('.', 1)[0], - self.logger) + hsh, other, ext = parse_db_filename(db_file) + return ReplConnection(node, partition, hsh, self.logger) def _gather_sync_args(self, info): """ @@ -931,6 +939,8 @@ class ReplicatorRpc(object): def complete_rsync(self, drive, db_file, args): old_filename = os.path.join(self.root, drive, 'tmp', args[0]) + if args[1:]: + db_file = os.path.join(os.path.dirname(db_file), args[1]) if os.path.exists(db_file): return HTTPNotFound() if not os.path.exists(old_filename): @@ -944,6 +954,10 @@ class ReplicatorRpc(object): return not (self._db_file_exists(db_file) and os.path.exists(tmp_filename)) 
+ def _post_rsync_then_merge_hook(self, existing_broker, new_broker): + # subclasses may override to make custom changes to the new broker + pass + def rsync_then_merge(self, drive, db_file, args): tmp_filename = os.path.join(self.root, drive, 'tmp', args[0]) if self._abort_rsync_then_merge(db_file, tmp_filename): @@ -959,6 +973,7 @@ class ReplicatorRpc(object): objects = existing_broker.get_items_since(point, 1000) sleep() new_broker.merge_syncs(existing_broker.get_syncs()) + self._post_rsync_then_merge_hook(existing_broker, new_broker) new_broker.newid(args[0]) new_broker.update_metadata(existing_broker.metadata) if self._abort_rsync_then_merge(db_file, tmp_filename): diff --git a/swift/common/utils.py b/swift/common/utils.py index 6641bce268..40c2cb7a8f 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -5300,3 +5300,86 @@ def distribute_evenly(items, num_buckets): for index, item in enumerate(items): out[index % num_buckets].append(item) return out + + +def parse_db_filename(filename): + """ + Splits a db filename into three parts: the hash, the epoch, and the + extension. + + >>> parse_db_filename("ab2134.db") + ('ab2134', None, '.db') + >>> parse_db_filename("ab2134_1234567890.12345.db") + ('ab2134', '1234567890.12345', '.db') + + :param filename: A db file basename or path to a db file. + :return: A tuple of (hash , epoch, extension). ``epoch`` may be None. + :raises ValueError: if ``filename`` is not a path to a file. + """ + filename = os.path.basename(filename) + if not filename: + raise ValueError('Path to a file required.') + name, ext = os.path.splitext(filename) + parts = name.split('_') + hash_ = parts.pop(0) + epoch = parts[0] if parts else None + return hash_, epoch, ext + + +def make_db_file_path(db_path, epoch): + """ + Given a path to a db file, return a modified path whose filename part has + the given epoch. 
+ + A db filename takes the form [_].db; this method replaces the + part of the given ``db_path`` with the given ``epoch`` value. + + :param db_path: Path to a db file that does not necessarily exist. + :param epoch: A string that will be used as the epoch in the new path's + filename; the value will be normalized to the normal string + representation of a :class:`~swift.common.utils.Timestamp`. + :return: A modified path to a db file. + :raises ValueError: if the ``epoch`` is not valid for constructing a + :class:`~swift.common.utils.Timestamp`. + """ + if epoch is None: + raise ValueError('epoch must not be None') + epoch = Timestamp(epoch).normal + hash_, _, ext = parse_db_filename(db_path) + db_dir = os.path.dirname(db_path) + return os.path.join(db_dir, '%s_%s%s' % (hash_, epoch, ext)) + + +def get_db_files(db_path): + """ + Given the path to a db file, return a sorted list of all valid db files + that actually exist in that path's dir. A valid db filename has the form: + + [_].db + + where matches the part of the given db_path as would be + parsed by :meth:`~swift.utils.common.parse_db_filename`. + + :param db_path: Path to a db file that does not necessarily exist. + :return: List of valid db files that do exist in the dir of the + ``db_path``. This list may be empty. 
+ """ + db_dir, db_file = os.path.split(db_path) + try: + files = os.listdir(db_dir) + except OSError as err: + if err.errno == errno.ENOENT: + return [] + raise + if not files: + return [] + match_hash, epoch, ext = parse_db_filename(db_file) + results = [] + for f in files: + hash_, epoch, ext = parse_db_filename(f) + if ext != '.db': + continue + if hash_ != match_hash: + continue + results.append(os.path.join(db_dir, f)) + return sorted(results) diff --git a/swift/container/backend.py b/swift/container/backend.py index c61c633739..9d75d0f680 100644 --- a/swift/container/backend.py +++ b/swift/container/backend.py @@ -15,6 +15,8 @@ """ Pluggable Back-ends for Container Server """ +import errno + import os from uuid import uuid4 @@ -22,16 +24,45 @@ import six import six.moves.cPickle as pickle from six.moves import range import sqlite3 +from eventlet import tpool +from swift.common.constraints import CONTAINER_LISTING_LIMIT +from swift.common.exceptions import LockTimeout from swift.common.utils import Timestamp, encode_timestamps, \ - decode_timestamps, extract_swift_bytes, storage_directory, hash_path -from swift.common.db import DatabaseBroker, utf8encode, \ + decode_timestamps, extract_swift_bytes, storage_directory, hash_path, \ + ShardRange, renamer, find_shard_range, MD5_OF_EMPTY_STRING, mkdirs, \ + get_db_files, parse_db_filename, make_db_file_path, split_path +from swift.common.db import DatabaseBroker, utf8encode, BROKER_TIMEOUT, \ zero_like, DatabaseAlreadyExists SQLITE_ARG_LIMIT = 999 DATADIR = 'containers' +RECORD_TYPE_OBJECT = 'object' +RECORD_TYPE_SHARD = 'shard' +SHARD_RANGE_TABLE = 'shard_range' + +NOTFOUND = 'not_found' +UNSHARDED = 'unsharded' +SHARDING = 'sharding' +SHARDED = 'sharded' +COLLAPSED = 'collapsed' + + +SHARD_STATS_STATES = [ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING] +SHARD_LISTING_STATES = SHARD_STATS_STATES + [ShardRange.CLEAVED] +SHARD_UPDATE_STATES = [ShardRange.CREATED, ShardRange.CLEAVED, + 
ShardRange.ACTIVE, ShardRange.SHARDING] + + +# attribute names in order used when transforming shard ranges from dicts to +# tuples and vice-versa +SHARD_RANGE_KEYS = ('name', 'timestamp', 'lower', 'upper', 'object_count', + 'bytes_used', 'meta_timestamp', 'deleted', 'state', + 'state_timestamp', 'epoch') + POLICY_STAT_TABLE_CREATE = ''' CREATE TABLE policy_stat ( storage_policy_index INTEGER PRIMARY KEY, @@ -220,15 +251,99 @@ def update_new_item_from_existing(new_item, existing): return any(newer_than_existing) +def merge_shards(shard_data, existing): + """ + Compares ``shard_data`` with ``existing`` and updates ``shard_data`` with + any items of ``existing`` that take precedence over the corresponding item + in ``shard_data``. + + :param shard_data: a dict representation of shard range that may be + modified by this method. + :param existing: a dict representation of shard range. + :returns: True if ``shard data`` has any item(s) that are considered to + take precedence over the corresponding item in ``existing`` + """ + if not existing: + return True + if existing['timestamp'] < shard_data['timestamp']: + # note that currently we do not roll forward any meta or state from + # an item that was created at older time, newer created time trumps + return True + elif existing['timestamp'] > shard_data['timestamp']: + return False + + new_content = False + # timestamp must be the same, so preserve existing range bounds and deleted + for k in ('lower', 'upper', 'deleted'): + shard_data[k] = existing[k] + + # now we need to look for meta data updates + if existing['meta_timestamp'] >= shard_data['meta_timestamp']: + for k in ('object_count', 'bytes_used', 'meta_timestamp'): + shard_data[k] = existing[k] + else: + new_content = True + + if (existing['state_timestamp'] == shard_data['state_timestamp'] + and shard_data['state'] > existing['state']): + new_content = True + elif existing['state_timestamp'] >= shard_data['state_timestamp']: + for k in ('state', 
'state_timestamp', 'epoch'): + shard_data[k] = existing[k] + else: + new_content = True + return new_content + + class ContainerBroker(DatabaseBroker): - """Encapsulates working with a container database.""" + """ + Encapsulates working with a container database. + + Note that this may involve multiple on-disk DB files if the container + becomes sharded: + + * :attr:`_db_file` is the path to the legacy container DB name, i.e. + ``.db``. This file should exist for an initialised broker that + has never been sharded, but will not exist once a container has been + sharded. + * :attr:`db_files` is a list of existing db files for the broker. This + list should have at least one entry for an initialised broker, and + should have two entries while a broker is in SHARDING state. + * :attr:`db_file` is the path to whichever db is currently authoritative + for the container. Depending on the container's state, this may not be + the same as the ``db_file`` argument given to :meth:`~__init__`, unless + ``force_db_file`` is True in which case :attr:`db_file` is always equal + to the ``db_file`` argument given to :meth:`~__init__`. + * :attr:`pending_file` is always equal to :attr:`_db_file` extended with + ``.pending``, i.e. ``.db.pending``. 
+ """ db_type = 'container' db_contains_type = 'object' db_reclaim_timestamp = 'created_at' + def __init__(self, db_file, timeout=BROKER_TIMEOUT, logger=None, + account=None, container=None, pending_timeout=None, + stale_reads_ok=False, skip_commits=False, + force_db_file=False): + self._init_db_file = db_file + if db_file == ':memory:': + base_db_file = db_file + else: + db_dir = os.path.dirname(db_file) + hash_, other, ext = parse_db_filename(db_file) + base_db_file = os.path.join(db_dir, hash_ + ext) + super(ContainerBroker, self).__init__( + base_db_file, timeout, logger, account, container, pending_timeout, + stale_reads_ok, skip_commits=skip_commits) + # the root account and container are populated on demand + self._root_account = self._root_container = None + self._force_db_file = force_db_file + self._db_files = None + @classmethod def create_broker(self, device_path, part, account, container, logger=None, - put_timestamp=None, storage_policy_index=None): + epoch=None, put_timestamp=None, + storage_policy_index=None): """ Create a ContainerBroker instance. If the db doesn't exist, initialize the db file. 
@@ -238,6 +353,7 @@ class ContainerBroker(DatabaseBroker): :param account: account name string :param container: container name string :param logger: a logger instance + :param epoch: a timestamp to include in the db filename :param put_timestamp: initial timestamp if broker needs to be initialized :param storage_policy_index: the storage policy index @@ -246,6 +362,8 @@ class ContainerBroker(DatabaseBroker): hsh = hash_path(account, container) db_dir = storage_directory(DATADIR, part, hsh) db_path = os.path.join(device_path, db_dir, hsh + '.db') + if epoch: + db_path = make_db_file_path(db_path, epoch) broker = ContainerBroker(db_path, account=account, container=container, logger=logger) if not os.path.exists(broker.db_file): @@ -255,6 +373,98 @@ class ContainerBroker(DatabaseBroker): pass return broker + def get_db_state(self): + """ + Returns the current state of on disk db files. + """ + if self._db_file == ':memory:': + return UNSHARDED + if not self.db_files: + return NOTFOUND + if len(self.db_files) > 1: + return SHARDING + if self.db_epoch is None: + # never been sharded + return UNSHARDED + if self.db_epoch != self._own_shard_range().epoch: + return UNSHARDED + if not self.get_shard_ranges(): + return COLLAPSED + return SHARDED + + def sharding_initiated(self): + """ + Returns True if a broker has shard range state that would be necessary + for sharding to have been initiated, False otherwise. + """ + own_shard_range = self.get_own_shard_range() + if own_shard_range.state in (ShardRange.SHARDING, + ShardRange.SHRINKING, + ShardRange.SHARDED): + return bool(self.get_shard_ranges()) + return False + + def sharding_required(self): + """ + Returns True if a broker has shard range state that would be necessary + for sharding to have been initiated but has not yet completed sharding, + False otherwise. 
+ """ + db_state = self.get_db_state() + return (db_state == SHARDING or + (db_state == UNSHARDED and self.sharding_initiated())) + + def is_sharded(self): + return self.get_db_state() == SHARDED + + def reload_db_files(self): + """ + Reloads the cached list of valid on disk db files for this broker. + """ + if self._db_file == ':memory:': + return + # reset connection so the next access will use the correct DB file + self.conn = None + self._db_files = get_db_files(self._init_db_file) + + @property + def db_files(self): + """ + Gets the cached list of valid db files that exist on disk for this + broker. + + The cached list may be refreshed by calling + :meth:`~swift.container.backend.ContainerBroker.reload_db_files`. + + :return: A list of paths to db files ordered by ascending epoch; + the list may be empty. + """ + if not self._db_files: + self.reload_db_files() + return self._db_files + + @property + def db_file(self): + """ + Get the path to the primary db file for this broker. This is typically + the db file for the most recent sharding epoch. However, if no db files + exist on disk, or if ``force_db_file`` was True when the broker was + constructed, then the primary db file is the file passed to the broker + constructor. + + :return: A path to a db file; the file does not necessarily exist. 
+ """ + if self._force_db_file: + return self._init_db_file + if self.db_files: + return self.db_files[-1] + return self._init_db_file + + @property + def db_epoch(self): + hash_, epoch, ext = parse_db_filename(self.db_file) + return epoch + @property def storage_policy_index(self): if not hasattr(self, '_storage_policy_index'): @@ -262,6 +472,11 @@ class ContainerBroker(DatabaseBroker): self.get_info()['storage_policy_index'] return self._storage_policy_index + @property + def path(self): + self._populate_instance_cache() + return '%s/%s' % (self.account, self.container) + def _initialize(self, conn, put_timestamp, storage_policy_index): """ Create a brand new container database (tables, indices, triggers, etc.) @@ -278,6 +493,8 @@ class ContainerBroker(DatabaseBroker): self.create_policy_stat_table(conn, storage_policy_index) self.create_container_info_table(conn, put_timestamp, storage_policy_index) + self.create_shard_range_table(conn) + self._db_files = None def create_object_table(self, conn): """ @@ -359,6 +576,40 @@ class ContainerBroker(DatabaseBroker): VALUES (?) """, (storage_policy_index,)) + def create_shard_range_table(self, conn): + """ + Create the shard_range table which is specific to the container DB. + + :param conn: DB connection object + """ + # Use execute (not executescript) so we get the benefits of our + # GreenDBConnection. Creating a table requires a whole-DB lock; + # *any* in-progress cursor will otherwise trip a "database is locked" + # error. 
+ conn.execute(""" + CREATE TABLE %s ( + ROWID INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT, + timestamp TEXT, + lower TEXT, + upper TEXT, + object_count INTEGER DEFAULT 0, + bytes_used INTEGER DEFAULT 0, + meta_timestamp TEXT, + deleted INTEGER DEFAULT 0, + state INTEGER, + state_timestamp TEXT, + epoch TEXT + ); + """ % SHARD_RANGE_TABLE) + + conn.execute(""" + CREATE TRIGGER shard_range_update BEFORE UPDATE ON %s + BEGIN + SELECT RAISE(FAIL, 'UPDATE not allowed; DELETE and INSERT'); + END; + """ % SHARD_RANGE_TABLE) + def get_db_version(self, conn): if self._db_version == -1: self._db_version = 0 @@ -368,6 +619,11 @@ class ContainerBroker(DatabaseBroker): self._db_version = 1 return self._db_version + def _get_deleted_key(self, connection): + if self.get_db_version(connection) < 1: + return '+deleted' + return 'deleted' + def _newid(self, conn): conn.execute(''' UPDATE container_stat @@ -411,12 +667,7 @@ class ContainerBroker(DatabaseBroker): 'ctype_timestamp': content_type_timestamp, 'meta_timestamp': meta_timestamp}) - def empty(self): - """ - Check if container DB is empty. - - :returns: True if the database has no active objects, False otherwise - """ + def _empty(self): self._commit_puts_stale_ok() with self.get() as conn: try: @@ -431,6 +682,26 @@ class ContainerBroker(DatabaseBroker): 'SELECT object_count from container_stat').fetchone() return zero_like(row[0]) + def empty(self): + """ + Check if container DB is empty. + + This method uses more stringent checks on object count than + :meth:`is_deleted`: this method checks that there are no objects in any + policy; if the container is in the process of sharding then both fresh + and retiring databases are checked to be empty; if a root container has + shard ranges then they are checked to be empty. 
+ + :returns: True if the database has no active objects, False otherwise + """ + if not all(broker._empty() for broker in self.get_brokers()): + return False + if self.is_root_container() and self.sharding_initiated(): + # sharded shards don't get updates from their shards so their shard + # usage should not be relied upon + return self.get_shard_usage()['object_count'] <= 0 + return True + def delete_object(self, name, timestamp, storage_policy_index=0): """ Mark an object deleted. @@ -490,7 +761,12 @@ class ContainerBroker(DatabaseBroker): def _is_deleted(self, conn): """ - Check container_stat view and evaluate info. + Check if the DB is considered to be deleted. + + This object count used in this check is the same as the container + object count that would be returned in the result of :meth:`get_info` + and exposed to a client i.e. it is based on the container_stat view for + the current storage policy index or relevant shard range usage. :param conn: database conn @@ -499,6 +775,8 @@ class ContainerBroker(DatabaseBroker): info = conn.execute(''' SELECT put_timestamp, delete_timestamp, object_count FROM container_stat''').fetchone() + info = dict(info) + info.update(self._get_alternate_object_stats()[1]) return self._is_deleted_info(**info) def is_reclaimable(self, now, reclaim_age): @@ -524,6 +802,73 @@ class ContainerBroker(DatabaseBroker): info = self.get_info() return info, self._is_deleted_info(**info) + def get_replication_info(self): + info = super(ContainerBroker, self).get_replication_info() + info['shard_max_row'] = self.get_max_row('shard_ranges') + return info + + def _do_get_info_query(self, conn): + data = None + trailing_sync = 'x_container_sync_point1, x_container_sync_point2' + trailing_pol = 'storage_policy_index' + errors = set() + while not data: + try: + data = conn.execute((''' + SELECT account, container, created_at, put_timestamp, + delete_timestamp, status_changed_at, + object_count, bytes_used, + reported_put_timestamp, 
reported_delete_timestamp, + reported_object_count, reported_bytes_used, hash, + id, %s, %s + FROM container_stat + ''') % (trailing_sync, trailing_pol)).fetchone() + except sqlite3.OperationalError as err: + err_msg = str(err) + if err_msg in errors: + # only attempt migration once + raise + errors.add(err_msg) + if 'no such column: storage_policy_index' in err_msg: + trailing_pol = '0 AS storage_policy_index' + elif 'no such column: x_container_sync_point' in err_msg: + trailing_sync = '-1 AS x_container_sync_point1, ' \ + '-1 AS x_container_sync_point2' + else: + raise + data = dict(data) + # populate instance cache + self._storage_policy_index = data['storage_policy_index'] + self.account = data['account'] + self.container = data['container'] + return data + + def _get_info(self): + self._commit_puts_stale_ok() + with self.get() as conn: + return self._do_get_info_query(conn) + + def _populate_instance_cache(self, conn=None): + # load cached instance attributes from the database if necessary + if self.container is None: + if conn: + self._do_get_info_query(conn) + else: + with self.get() as conn: + self._do_get_info_query(conn) + + def _get_alternate_object_stats(self): + state = self.get_db_state() + if state == SHARDING: + other_info = self.get_brokers()[0]._get_info() + stats = {'object_count': other_info['object_count'], + 'bytes_used': other_info['bytes_used']} + elif state == SHARDED and self.is_root_container(): + stats = self.get_shard_usage() + else: + stats = {} + return state, stats + def get_info(self): """ Get global data for the container. @@ -533,44 +878,14 @@ class ContainerBroker(DatabaseBroker): object_count, bytes_used, reported_put_timestamp, reported_delete_timestamp, reported_object_count, reported_bytes_used, hash, id, x_container_sync_point1, - x_container_sync_point2, and storage_policy_index. + x_container_sync_point2, and storage_policy_index, + db_state. 
""" - self._commit_puts_stale_ok() - with self.get() as conn: - data = None - trailing_sync = 'x_container_sync_point1, x_container_sync_point2' - trailing_pol = 'storage_policy_index' - errors = set() - while not data: - try: - data = conn.execute((''' - SELECT account, container, created_at, put_timestamp, - delete_timestamp, status_changed_at, - object_count, bytes_used, - reported_put_timestamp, reported_delete_timestamp, - reported_object_count, reported_bytes_used, hash, - id, %s, %s - FROM container_stat - ''') % (trailing_sync, trailing_pol)).fetchone() - except sqlite3.OperationalError as err: - err_msg = str(err) - if err_msg in errors: - # only attempt migration once - raise - errors.add(err_msg) - if 'no such column: storage_policy_index' in err_msg: - trailing_pol = '0 AS storage_policy_index' - elif 'no such column: x_container_sync_point' in err_msg: - trailing_sync = '-1 AS x_container_sync_point1, ' \ - '-1 AS x_container_sync_point2' - else: - raise - data = dict(data) - # populate instance cache - self._storage_policy_index = data['storage_policy_index'] - self.account = data['account'] - self.container = data['container'] - return data + data = self._get_info() + state, stats = self._get_alternate_object_stats() + data.update(stats) + data['db_state'] = state + return data def set_x_container_sync_points(self, sync_point1, sync_point2): with self.get() as conn: @@ -696,7 +1011,9 @@ class ContainerBroker(DatabaseBroker): conn.commit() def list_objects_iter(self, limit, marker, end_marker, prefix, delimiter, - path=None, storage_policy_index=0, reverse=False): + path=None, storage_policy_index=0, reverse=False, + include_deleted=False, since_row=None, + transform_func=None, all_policies=False): """ Get a list of objects sorted by name starting at marker onward, up to limit entries. 
Entries will begin with the prefix and will not @@ -711,10 +1028,29 @@ class ContainerBroker(DatabaseBroker): the path :param storage_policy_index: storage policy index for query :param reverse: reverse the result order. - + :param include_deleted: if True, include only deleted objects; if + False (default), include only undeleted objects; otherwise, include + both deleted and undeleted objects. + :param since_row: include only items whose ROWID is greater than + the given row id; by default all rows are included. + :param transform_func: an optional function that if given will be + called for each object to get a transformed version of the object + to include in the listing; should have same signature as + :meth:`~_transform_record`; defaults to :meth:`~_transform_record`. + :param all_policies: if True, include objects for all storage policies + ignoring any value given for ``storage_policy_index`` :returns: list of tuples of (name, created_at, size, content_type, - etag) + etag, deleted) """ + if include_deleted is True: + deleted_arg = ' = 1' + elif include_deleted is False: + deleted_arg = ' = 0' + else: + deleted_arg = ' in (0, 1)' + + if transform_func is None: + transform_func = self._transform_record delim_force_gte = False (marker, end_marker, prefix, delimiter, path) = utf8encode( marker, end_marker, prefix, delimiter, path) @@ -734,60 +1070,71 @@ class ContainerBroker(DatabaseBroker): orig_marker = marker with self.get() as conn: results = [] + deleted_key = self._get_deleted_key(conn) + query_keys = ['name', 'created_at', 'size', 'content_type', + 'etag', deleted_key] while len(results) < limit: - query = '''SELECT name, created_at, size, content_type, etag - FROM object WHERE''' query_args = [] + query_conditions = [] if end_marker and (not prefix or end_marker < end_prefix): - query += ' name < ? AND' + query_conditions.append('name < ?') query_args.append(end_marker) elif prefix: - query += ' name < ? 
AND' + query_conditions.append('name < ?') query_args.append(end_prefix) if delim_force_gte: - query += ' name >= ? AND' + query_conditions.append('name >= ?') query_args.append(marker) # Always set back to False delim_force_gte = False elif marker and marker >= prefix: - query += ' name > ? AND' + query_conditions.append('name > ?') query_args.append(marker) elif prefix: - query += ' name >= ? AND' + query_conditions.append('name >= ?') query_args.append(prefix) - if self.get_db_version(conn) < 1: - query += ' +deleted = 0' - else: - query += ' deleted = 0' - orig_tail_query = ''' - ORDER BY name %s LIMIT ? - ''' % ('DESC' if reverse else '') - orig_tail_args = [limit - len(results)] + query_conditions.append(deleted_key + deleted_arg) + if since_row: + query_conditions.append('ROWID > ?') + query_args.append(since_row) + + def build_query(keys, conditions, args): + query = 'SELECT ' + ', '.join(keys) + ' FROM object ' + if conditions: + query += 'WHERE ' + ' AND '.join(conditions) + tail_query = ''' + ORDER BY name %s LIMIT ? + ''' % ('DESC' if reverse else '') + return query + tail_query, args + [limit - len(results)] + # storage policy filter - policy_tail_query = ''' - AND storage_policy_index = ? 
- ''' + orig_tail_query - policy_tail_args = [storage_policy_index] + orig_tail_args - tail_query, tail_args = \ - policy_tail_query, policy_tail_args + if all_policies: + query, args = build_query( + query_keys + ['storage_policy_index'], + query_conditions, + query_args) + else: + query, args = build_query( + query_keys + ['storage_policy_index'], + query_conditions + ['storage_policy_index = ?'], + query_args + [storage_policy_index]) try: - curs = conn.execute(query + tail_query, - tuple(query_args + tail_args)) + curs = conn.execute(query, tuple(args)) except sqlite3.OperationalError as err: if 'no such column: storage_policy_index' not in str(err): raise - tail_query, tail_args = \ - orig_tail_query, orig_tail_args - curs = conn.execute(query + tail_query, - tuple(query_args + tail_args)) + query, args = build_query( + query_keys + ['0 as storage_policy_index'], + query_conditions, query_args) + curs = conn.execute(query, tuple(args)) curs.row_factory = None # Delimiters without a prefix is ignored, further if there # is no delimiter then we can simply return the result as # prefixes are now handled in the SQL statement. if prefix is None or not delimiter: - return [self._transform_record(r) for r in curs] + return [transform_func(r) for r in curs] # We have a delimiter and a prefix (possibly empty string) to # handle @@ -826,19 +1173,51 @@ class ContainerBroker(DatabaseBroker): results.append([dir_name, '0', 0, None, '']) curs.close() break - results.append(self._transform_record(row)) + results.append(transform_func(row)) if not rowcount: break return results + def get_objects(self, limit=None, marker='', end_marker='', + include_deleted=None, since_row=None): + """ + Returns a list of objects, including deleted objects, in all policies. + Each object in the list is described by a dict with keys {'name', + 'created_at', 'size', 'content_type', 'etag', 'deleted', + 'storage_policy_index'}. 
+ + :param limit: maximum number of entries to get + :param marker: if set, objects with names less than or equal to this + value will not be included in the list. + :param end_marker: if set, objects with names greater than or equal to + this value will not be included in the list. + :param include_deleted: if True, include only deleted objects; if + False, include only undeleted objects; otherwise (default), include + both deleted and undeleted objects. + :param since_row: include only items whose ROWID is greater than + the given row id; by default all rows are included. + :return: a list of dicts, each describing an object. + """ + + limit = CONTAINER_LISTING_LIMIT if limit is None else limit + return self.list_objects_iter( + limit, marker, end_marker, prefix=None, delimiter=None, path=None, + reverse=False, include_deleted=include_deleted, + transform_func=self._record_to_dict, since_row=since_row, + all_policies=True + ) + def _transform_record(self, record): """ - Decode the created_at timestamp into separate data, content-type and - meta timestamps and replace the created_at timestamp with the - metadata timestamp i.e. the last-modified time. + Returns a tuple of (name, last-modified time, size, content_type and + etag) for the given record. + + The given record's created_at timestamp is decoded into separate data, + content-type and meta timestamps and the metadata timestamp is used as + the last-modified time value. 
""" t_data, t_ctype, t_meta = decode_timestamps(record[1]) - return (record[0], t_meta.internal) + record[2:] + return (record[0], t_meta.internal) + record[2:5] def _record_to_dict(self, rec): if rec: @@ -861,7 +1240,7 @@ class ContainerBroker(DatabaseBroker): if isinstance(item['name'], six.text_type): item['name'] = item['name'].encode('utf-8') - def _really_merge_items(conn): + def _really_really_merge_items(conn): curs = conn.cursor() if self.get_db_version(conn) >= 1: query_mod = ' deleted IN (0, 1) AND ' @@ -924,6 +1303,9 @@ class ContainerBroker(DatabaseBroker): ''', (sync_point, source)) conn.commit() + def _really_merge_items(conn): + return tpool.execute(_really_really_merge_items, conn) + with self.get() as conn: try: return _really_merge_items(conn) @@ -933,6 +1315,86 @@ class ContainerBroker(DatabaseBroker): self._migrate_add_storage_policy(conn) return _really_merge_items(conn) + def merge_shard_ranges(self, shard_ranges): + """ + Merge shard ranges into the shard range table. + + :param shard_ranges: a shard range or a list of shard ranges; each + shard range should be an instance of + :class:`~swift.common.utils.ShardRange` or a dict representation of + a shard range having ``SHARD_RANGE_KEYS``. + """ + if not shard_ranges: + return + if not isinstance(shard_ranges, list): + shard_ranges = [shard_ranges] + + item_list = [] + for item in shard_ranges: + if isinstance(item, ShardRange): + item = dict(item) + for col in ('name', 'lower', 'upper'): + if isinstance(item[col], six.text_type): + item[col] = item[col].encode('utf-8') + item_list.append(item) + + def _really_merge_items(conn): + curs = conn.cursor() + curs.execute('BEGIN IMMEDIATE') + + # Get rows for items that already exist. + # We must chunk it up to avoid sqlite's limit of 999 args. 
+ records = {} + for offset in range(0, len(item_list), SQLITE_ARG_LIMIT): + chunk = [record['name'] for record + in item_list[offset:offset + SQLITE_ARG_LIMIT]] + records.update( + (rec[0], rec) for rec in curs.execute( + 'SELECT %s FROM %s ' + 'WHERE deleted IN (0, 1) AND name IN (%s)' % + (', '.join(SHARD_RANGE_KEYS), SHARD_RANGE_TABLE, + ','.join('?' * len(chunk))), chunk)) + + # Sort item_list into things that need adding and deleting + to_delete = {} + to_add = {} + for item in item_list: + item_ident = item['name'] + existing = records.get(item_ident) + if existing: + existing = dict(zip(SHARD_RANGE_KEYS, existing)) + if merge_shards(item, existing): + # exists with older timestamp + if item_ident in records: + to_delete[item_ident] = item + # duplicate entries in item_list + if (item_ident not in to_add or + merge_shards(item, to_add[item_ident])): + to_add[item_ident] = item + + if to_delete: + curs.executemany( + 'DELETE FROM %s WHERE deleted in (0, 1) ' + 'AND name = ?' % SHARD_RANGE_TABLE, + ((item_ident,) for item_ident in to_delete)) + if to_add: + vals = ','.join('?' 
* len(SHARD_RANGE_KEYS)) + curs.executemany( + 'INSERT INTO %s (%s) VALUES (%s)' % + (SHARD_RANGE_TABLE, ','.join(SHARD_RANGE_KEYS), vals), + tuple([item[k] for k in SHARD_RANGE_KEYS] + for item in to_add.values())) + conn.commit() + + with self.get() as conn: + try: + return _really_merge_items(conn) + except sqlite3.OperationalError as err: + if ('no such table: %s' % SHARD_RANGE_TABLE) not in str(err): + raise + self.create_shard_range_table(conn) + return _really_merge_items(conn) + def get_reconciler_sync(self): with self.get() as conn: try: @@ -1078,3 +1540,644 @@ class ContainerBroker(DatabaseBroker): ''' % (column_names, column_names) + CONTAINER_STAT_VIEW_SCRIPT + 'COMMIT;') + + def _reclaim(self, conn, age_timestamp, sync_timestamp): + super(ContainerBroker, self)._reclaim(conn, age_timestamp, + sync_timestamp) + # populate instance cache, but use existing conn to avoid deadlock + # when it has a pending update + self._populate_instance_cache(conn=conn) + try: + conn.execute(''' + DELETE FROM %s WHERE deleted = 1 AND timestamp < ? + AND name != ? + ''' % SHARD_RANGE_TABLE, (sync_timestamp, self.path)) + except sqlite3.OperationalError as err: + if ('no such table: %s' % SHARD_RANGE_TABLE) not in str(err): + raise + + def _get_shard_range_rows(self, connection=None, include_deleted=False, + states=None, exclude_states=None, + include_own=False, exclude_others=False): + """ + Returns a list of shard range rows. + + To get all shard ranges use ``include_own=True``. To get only the + broker's own shard range use ``include_own=True`` and + ``exclude_others=True``. + + :param connection: db connection + :param include_deleted: include rows marked as deleted + :param states: include only rows matching the given state(s); can be an + int or a list of ints. + :param exclude_states: exclude rows matching the given state(s); can be + an int or a list of ints; takes precedence over ``state``. 
+ :param include_own: boolean that governs whether the row whose name + matches the broker's path is included in the returned list. If + True, that row is included, otherwise it is not included. Default + is False. + :param exclude_others: boolean that governs whether the rows whose + names do not match the broker's path are included in the returned + list. If True, those rows are not included, otherwise they are + included. Default is False. + :return: a list of tuples. + """ + + if exclude_others and not include_own: + return [] + + def prep_states(states): + state_set = set() + if isinstance(states, (list, tuple, set)): + state_set.update(states) + elif states is not None: + state_set.add(states) + return state_set + + excluded_states = prep_states(exclude_states) + included_states = prep_states(states) + included_states -= excluded_states + + def do_query(conn): + try: + condition = '' + conditions = [] + params = [] + if not include_deleted: + conditions.append('deleted=0') + if included_states: + conditions.append('state in (%s)' % ','.join( + '?' * len(included_states))) + params.extend(included_states) + if excluded_states: + conditions.append('state not in (%s)' % ','.join( + '?' 
* len(excluded_states))) + params.extend(excluded_states) + if not include_own: + conditions.append('name != ?') + params.append(self.path) + if exclude_others: + conditions.append('name = ?') + params.append(self.path) + if conditions: + condition = ' WHERE ' + ' AND '.join(conditions) + sql = ''' + SELECT %s + FROM %s%s; + ''' % (', '.join(SHARD_RANGE_KEYS), SHARD_RANGE_TABLE, + condition) + data = conn.execute(sql, params) + data.row_factory = None + return [row for row in data] + except sqlite3.OperationalError as err: + if ('no such table: %s' % SHARD_RANGE_TABLE) not in str(err): + raise + return [] + + if connection: + return do_query(connection) + else: + with self.get() as conn: + return do_query(conn) + + @classmethod + def resolve_shard_range_states(cls, states): + """ + Given a list of values each of which may be the name of a state, the + number of a state, or an alias, return the set of state numbers + described by the list. + + The following alias values are supported: 'listing' maps to all states + that are considered valid when listing objects; 'updating' maps to all + states that are considered valid for redirecting an object update. 
+ + :param states: a list of values each of which may be the name of a + state, the number of a state, or an alias + :return: a set of integer state numbers, or None if no states are given + :raises ValueError: if any value in the given list is neither a valid + state nor a valid alias + """ + if states: + resolved_states = set() + for state in states: + if state == 'listing': + resolved_states.update(SHARD_LISTING_STATES) + elif state == 'updating': + resolved_states.update(SHARD_UPDATE_STATES) + else: + resolved_states.add(ShardRange.resolve_state(state)[0]) + return resolved_states + return None + + def get_shard_ranges(self, marker=None, end_marker=None, includes=None, + reverse=False, include_deleted=False, states=None, + exclude_states=None, include_own=False, + exclude_others=False, fill_gaps=False): + """ + Returns a list of persisted shard ranges. + + :param marker: restricts the returned list to shard ranges whose + namespace includes or is greater than the marker value. + :param end_marker: restricts the returned list to shard ranges whose + namespace includes or is less than the end_marker value. + :param includes: restricts the returned list to the shard range that + includes the given value; if ``includes`` is specified then + ``marker`` and ``end_marker`` are ignored. + :param reverse: reverse the result order. + :param include_deleted: include items that have the delete marker set + :param states: if specified, restricts the returned list to shard + ranges that have the given state(s); can be a list of ints or a + single int. + :param exclude_states: exclude rows matching the given state(s); can be + an int or a list of ints; takes precedence over ``state``. + :param include_own: boolean that governs whether the row whose name + matches the broker's path is included in the returned list. If + True, that row is included, otherwise it is not included. Default + is False. 
+ :param exclude_others: boolean that governs whether the rows whose + names do not match the broker's path are included in the returned + list. If True, those rows are not included, otherwise they are + included. Default is False. + :param fill_gaps: if True, insert own shard range to fill any gaps in + at the tail of other shard ranges. + :return: a list of instances of :class:`swift.common.utils.ShardRange` + """ + def shard_range_filter(sr): + end = start = True + if end_marker: + end = end_marker > sr.lower + if marker: + start = marker < sr.upper + return start and end + + if reverse: + marker, end_marker = end_marker, marker + if marker and end_marker and marker >= end_marker: + return [] + + shard_ranges = [ + ShardRange(*row) + for row in self._get_shard_range_rows( + include_deleted=include_deleted, states=states, + exclude_states=exclude_states, include_own=include_own, + exclude_others=exclude_others)] + # note if this ever changes to *not* sort by upper first then it breaks + # a key assumption for bisect, which is used by utils.find_shard_ranges + shard_ranges.sort(key=lambda sr: (sr.upper, sr.state, sr.lower)) + if includes: + shard_range = find_shard_range(includes, shard_ranges) + return [shard_range] if shard_range else [] + + if reverse: + shard_ranges.reverse() + if marker or end_marker: + shard_ranges = list(filter(shard_range_filter, shard_ranges)) + + if fill_gaps: + if reverse: + if shard_ranges: + last_upper = shard_ranges[0].upper + else: + last_upper = marker or ShardRange.MIN + required_upper = end_marker or ShardRange.MAX + filler_index = 0 + else: + if shard_ranges: + last_upper = shard_ranges[-1].upper + else: + last_upper = marker or ShardRange.MIN + required_upper = end_marker or ShardRange.MAX + filler_index = len(shard_ranges) + if required_upper > last_upper: + filler_sr = self.get_own_shard_range() + filler_sr.lower = last_upper + filler_sr.upper = required_upper + shard_ranges.insert(filler_index, filler_sr) + + return 
shard_ranges + + def _own_shard_range(self, no_default=False): + shard_ranges = self.get_shard_ranges(include_own=True, + include_deleted=True, + exclude_others=True) + if shard_ranges: + own_shard_range = shard_ranges[0] + elif no_default: + return None + else: + own_shard_range = ShardRange( + self.path, Timestamp.now(), ShardRange.MIN, ShardRange.MAX, + state=ShardRange.ACTIVE) + return own_shard_range + + def get_own_shard_range(self, no_default=False): + """ + Returns a shard range representing this broker's own shard range. If no + such range has been persisted in the broker's shard ranges table then a + default shard range representing the entire namespace will be returned. + + The returned shard range will be updated with the current object stats + for this broker and a meta timestamp set to the current time. For these + values to be persisted the caller must merge the shard range. + + :param no_default: if True and the broker's own shard range is not + found in the shard ranges table then None is returned, otherwise a + default shard range is returned. + :return: an instance of :class:`~swift.common.utils.ShardRange` + """ + own_shard_range = self._own_shard_range(no_default=no_default) + if own_shard_range: + info = self.get_info() + own_shard_range.update_meta( + info['object_count'], info['bytes_used']) + return own_shard_range + + def is_own_shard_range(self, shard_range): + return shard_range.name == self.path + + def enable_sharding(self, epoch): + """ + Updates this broker's own shard range with the given epoch, sets its + state to SHARDING and persists it in the DB. + + :param epoch: a :class:`~swift.utils.common.Timestamp` + :return: the broker's updated own shard range. 
+ """ + own_shard_range = self._own_shard_range() + own_shard_range.update_state(ShardRange.SHARDING, epoch) + own_shard_range.epoch = epoch + self.merge_shard_ranges(own_shard_range) + return own_shard_range + + def get_shard_usage(self): + """ + Get the aggregate object stats for all shard ranges in states ACTIVE, + SHARDING or SHRINKING. + + :return: a dict with keys {bytes_used, object_count} + """ + shard_ranges = self.get_shard_ranges(states=SHARD_STATS_STATES) + return {'bytes_used': sum(sr.bytes_used for sr in shard_ranges), + 'object_count': sum(sr.object_count for sr in shard_ranges)} + + def get_all_shard_range_data(self): + """ + Returns a list of all shard range data, including own shard range and + deleted shard ranges. + + :return: A list of dict representations of a ShardRange. + """ + shard_ranges = self.get_shard_ranges(include_deleted=True, + include_own=True) + return [dict(sr) for sr in shard_ranges] + + def set_sharding_state(self): + """ + Creates and initializes a fresh DB file in preparation for sharding a + retiring DB. The broker's own shard range must have an epoch timestamp + for this method to succeed. + + :return: True if the fresh DB was successfully created, False + otherwise. + """ + epoch = self.get_own_shard_range().epoch + if not epoch: + self.logger.warning("Container '%s' cannot be set to sharding " + "state: missing epoch", self.path) + return False + state = self.get_db_state() + if not state == UNSHARDED: + self.logger.warning("Container '%s' cannot be set to sharding " + "state while in %s state", self.path, state) + return False + + info = self.get_info() + # The tmp_dir is cleaned up by the replicators after reclaim_age, so if + # we initially create the fresh DB there, we will already have cleanup + # covered if there is an error. 
+ tmp_dir = os.path.join(self.get_device_path(), 'tmp') + if not os.path.exists(tmp_dir): + mkdirs(tmp_dir) + tmp_db_file = os.path.join(tmp_dir, "fresh%s.db" % str(uuid4())) + fresh_broker = ContainerBroker(tmp_db_file, self.timeout, self.logger, + self.account, self.container) + fresh_broker.initialize(info['put_timestamp'], + info['storage_policy_index']) + # copy relevant data from the retiring db to the fresh db + fresh_broker.update_metadata(self.metadata) + fresh_broker.merge_shard_ranges(self.get_all_shard_range_data()) + # copy sync points so that any peer in sync with retiring db will + # appear to be in sync with the fresh db, although the peer shouldn't + # attempt to replicate objects to a db with shard ranges. + for incoming in (True, False): + syncs = self.get_syncs(incoming) + fresh_broker.merge_syncs(syncs, incoming) + + max_row = self.get_max_row() + with fresh_broker.get() as fresh_broker_conn: + # Initialise the rowid to continue from where the retiring db ended + try: + sql = "INSERT into object " \ + "(ROWID, name, created_at, size, content_type, etag) " \ + "values (?, 'tmp_sharding', ?, 0, '', ?)" + fresh_broker_conn.execute( + sql, (max_row, Timestamp.now().internal, + MD5_OF_EMPTY_STRING)) + fresh_broker_conn.execute( + 'DELETE FROM object WHERE ROWID = ?', (max_row,)) + fresh_broker_conn.commit() + except sqlite3.OperationalError as err: + self.logger.error( + 'Failed to set the ROWID of the fresh database for %s: %s', + self.path, err) + return False + + # Set the created_at and hash in the container_info table the same + # in both brokers + try: + fresh_broker_conn.execute( + 'UPDATE container_stat SET created_at=?', + (info['created_at'],)) + fresh_broker_conn.commit() + except sqlite3.OperationalError as err: + self.logger.error('Failed to set matching created_at time in ' + 'the fresh database for %s: %s', + self.path, err) + return False + + # Rename to the new database + fresh_db_filename = make_db_file_path(self._db_file, epoch) + 
renamer(tmp_db_file, fresh_db_filename) + self.reload_db_files() + return True + + def set_sharded_state(self): + """ + Unlinks the broker's retiring DB file. + + :return: True if the retiring DB was successfully unlinked, False + otherwise. + """ + state = self.get_db_state() + if not state == SHARDING: + self.logger.warning("Container %r cannot be set to sharded " + "state while in %s state", + self.path, state) + return False + + self.reload_db_files() + if len(self.db_files) < 2: + self.logger.warning( + 'Refusing to delete db file for %r: no fresher db file found ' + 'in %r.', self.path, self.db_files) + return False + + retiring_file = self.db_files[-2] + try: + os.unlink(retiring_file) + self.logger.debug('Unlinked retiring db %r', retiring_file) + except OSError as err: + if err.errno != errno.ENOENT: + self.logger.exception('Failed to unlink %r' % self._db_file) + return False + + self.reload_db_files() + if len(self.db_files) >= 2: + self.logger.warning( + 'Still have multiple db files after unlinking %r: %r', + retiring_file, self.db_files) + return False + + return True + + def get_brokers(self): + """ + Return a list of brokers for component dbs. The list has two entries + while the db state is sharding: the first entry is a broker for the + retiring db with ``skip_commits`` set to ``True``; the second entry is + a broker for the fresh db with ``skip_commits`` set to ``False``. For + any other db state the list has one entry. 
+ + :return: a list of :class:`~swift.container.backend.ContainerBroker` + """ + if len(self.db_files) > 2: + self.logger.warning('Unexpected db files will be ignored: %s' % + self.db_files[:-2]) + brokers = [] + db_files = self.db_files[-2:] + while db_files: + db_file = db_files.pop(0) + sub_broker = ContainerBroker( + db_file, self.timeout, self.logger, self.account, + self.container, self.pending_timeout, self.stale_reads_ok, + force_db_file=True, skip_commits=bool(db_files)) + brokers.append(sub_broker) + return brokers + + def set_sharding_sysmeta(self, key, value): + """ + Updates the broker's metadata stored under the given key + prefixed with a sharding specific namespace. + + :param key: metadata key in the sharding metadata namespace. + :param value: metadata value + """ + self.update_metadata({'X-Container-Sysmeta-Shard-' + key: + (value, Timestamp.now().internal)}) + + def get_sharding_sysmeta(self, key=None): + """ + Returns sharding specific info from the broker's metadata. + + :param key: if given the value stored under ``key`` in the sharding + info will be returned. + :return: either a dict of sharding info or the value stored under + ``key`` in that dict. + """ + prefix = 'X-Container-Sysmeta-Shard-' + metadata = self.metadata + info = dict((k[len(prefix):], v[0]) for + k, v in metadata.items() if k.startswith(prefix)) + if key: + return info.get(key) + return info + + def _load_root_info(self): + """ + Load the root container name and account for the container represented + by this broker. + + The root container path, if set, is stored in sysmeta under the key + ``X-Container-Sysmeta-Shard-Root``. If this sysmeta is not set then the + container is considered to be a root container and ``_root_account`` + and ``_root_container`` are set equal to the broker ``account`` and + ``container`` attributes respectively. 
+ + """ + path = self.get_sharding_sysmeta('Root') + if not path: + # Ensure account/container get populated + self._populate_instance_cache() + self._root_account = self.account + self._root_container = self.container + return + + try: + self._root_account, self._root_container = split_path( + '/' + path, 2, 2) + except ValueError: + raise ValueError("Expected X-Container-Sysmeta-Shard-Root to be " + "of the form 'account/container', got %r" % path) + + @property + def root_account(self): + if not self._root_account: + self._load_root_info() + return self._root_account + + @property + def root_container(self): + if not self._root_container: + self._load_root_info() + return self._root_container + + @property + def root_path(self): + return '%s/%s' % (self.root_account, self.root_container) + + def is_root_container(self): + """ + Returns True if this container is a root container, False otherwise. + + A root container is a container that is not a shard of another + container. + """ + self._populate_instance_cache() + return (self.root_account == self.account and + self.root_container == self.container) + + def _get_next_shard_range_upper(self, shard_size, last_upper=None): + """ + Returns the name of the object that is ``shard_size`` rows beyond + ``last_upper`` in the object table ordered by name. If ``last_upper`` + is not given then it defaults to the start of object table ordered by + name. + + :param last_upper: the upper bound of the last found shard range. + :return: an object name, or None if the number of rows beyond + ``last_upper`` is less than ``shard_size``. + """ + self._commit_puts_stale_ok() + with self.get() as connection: + sql = ('SELECT name FROM object WHERE %s=0 ' % + self._get_deleted_key(connection)) + args = [] + if last_upper: + sql += "AND name > ? 
" + args.append(str(last_upper)) + sql += "ORDER BY name LIMIT 1 OFFSET %d" % (shard_size - 1) + row = connection.execute(sql, args).fetchone() + return row['name'] if row else None + + def find_shard_ranges(self, shard_size, limit=-1, existing_ranges=None): + """ + Scans the container db for shard ranges. Scanning will start at the + upper bound of the any ``existing_ranges`` that are given, otherwise + at ``ShardRange.MIN``. Scanning will stop when ``limit`` shard ranges + have been found or when no more shard ranges can be found. In the + latter case, the upper bound of the final shard range will be equal to + the upper bound of the container namespace. + + This method does not modify the state of the db; callers are + responsible for persisting any shard range data in the db. + + :param shard_size: the size of each shard range + :param limit: the maximum number of shard points to be found; a + negative value (default) implies no limit. + :param existing_ranges: an optional list of existing ShardRanges; if + given, this list should be sorted in order of upper bounds; the + scan for new shard ranges will start at the upper bound of the last + existing ShardRange. + :return: a tuple; the first value in the tuple is a list of + dicts each having keys {'index', 'lower', 'upper', 'object_count'} + in order of ascending 'upper'; the second value in the tuple is a + boolean which is True if the last shard range has been found, False + otherwise. 
+ """ + existing_ranges = existing_ranges or [] + object_count = self.get_info().get('object_count', 0) + if shard_size >= object_count: + # container not big enough to shard + return [], False + + own_shard_range = self.get_own_shard_range() + progress = 0 + progress_reliable = True + # update initial state to account for any existing shard ranges + if existing_ranges: + if all([sr.state == ShardRange.FOUND + for sr in existing_ranges]): + progress = sum([sr.object_count for sr in existing_ranges]) + else: + # else: object count in existing shard ranges may have changed + # since they were found so progress cannot be reliably + # calculated; use default progress of zero - that's ok, + # progress is used for optimisation not correctness + progress_reliable = False + last_shard_upper = existing_ranges[-1].upper + if last_shard_upper >= own_shard_range.upper: + # == implies all ranges were previously found + # > implies an acceptor range has been set into which this + # shard should cleave itself + return [], True + else: + last_shard_upper = own_shard_range.lower + + found_ranges = [] + sub_broker = self.get_brokers()[0] + index = len(existing_ranges) + while limit < 0 or len(found_ranges) < limit: + if progress + shard_size >= object_count: + # next shard point is at or beyond final object name so don't + # bother with db query + next_shard_upper = None + else: + try: + next_shard_upper = sub_broker._get_next_shard_range_upper( + shard_size, last_shard_upper) + except (sqlite3.OperationalError, LockTimeout): + self.logger.exception( + "Problem finding shard upper in %r: " % self.db_file) + break + + if (next_shard_upper is None or + next_shard_upper > own_shard_range.upper): + # We reached the end of the container namespace, or possibly + # beyond if the container has misplaced objects. In either case + # limit the final shard range to own_shard_range.upper. 
+ next_shard_upper = own_shard_range.upper + if progress_reliable: + # object count may include misplaced objects so the final + # shard size may not be accurate until cleaved, but at + # least the sum of shard sizes will equal the unsharded + # object_count + shard_size = object_count - progress + + # NB shard ranges are created with a non-zero object count so that + # the apparent container object count remains constant, and the + # container is non-deletable while shards have been found but not + # yet cleaved + found_ranges.append( + {'index': index, + 'lower': str(last_shard_upper), + 'upper': str(next_shard_upper), + 'object_count': shard_size}) + + if next_shard_upper == own_shard_range.upper: + return found_ranges, True + + progress += shard_size + last_shard_upper = next_shard_upper + index += 1 + + return found_ranges, False diff --git a/swift/container/replicator.py b/swift/container/replicator.py index 9f3fdb53c7..b326ab70e3 100644 --- a/swift/container/replicator.py +++ b/swift/container/replicator.py @@ -26,9 +26,10 @@ from swift.container.reconciler import ( get_reconciler_container_name, get_row_to_q_entry_translator) from swift.common import db_replicator from swift.common.storage_policy import POLICIES +from swift.common.swob import HTTPOk, HTTPAccepted from swift.common.exceptions import DeviceUnavailable from swift.common.http import is_success -from swift.common.utils import Timestamp, majority_size +from swift.common.utils import Timestamp, majority_size, get_db_files class ContainerReplicator(db_replicator.Replicator): @@ -76,9 +77,51 @@ class ContainerReplicator(db_replicator.Replicator): if any(info[key] != remote_info[key] for key in sync_timestamps): broker.merge_timestamps(*(remote_info[key] for key in sync_timestamps)) + + # Grab remote's shard ranges, too + self._fetch_and_merge_shard_ranges(http, broker) + return super(ContainerReplicator, self)._handle_sync_response( node, response, info, broker, http, different_region) + def 
_sync_shard_ranges(self, broker, http, local_id): + # TODO: currently the number of shard ranges is expected to be _much_ + # less than normal objects so all are sync'd on each cycle. However, in + # future there should be sync points maintained much like for object + # syncing so that only new shard range rows are sync'd. + shard_range_data = broker.get_all_shard_range_data() + if shard_range_data: + if not self._send_replicate_request( + http, 'merge_shard_ranges', shard_range_data, local_id): + return False + self.logger.debug('%s synced %s shard ranges to %s', + broker.db_file, len(shard_range_data), + '%(ip)s:%(port)s/%(device)s' % http.node) + return True + + def _choose_replication_mode(self, node, rinfo, info, local_sync, broker, + http, different_region): + # Always replicate shard ranges + shard_range_success = self._sync_shard_ranges(broker, http, info['id']) + if broker.sharding_initiated(): + self.logger.warning( + '%s is able to shard -- refusing to replicate objects to peer ' + '%s; have shard ranges and will wait for cleaving', + broker.db_file, + '%(ip)s:%(port)s/%(device)s' % node) + self.stats['deferred'] += 1 + return shard_range_success + + success = super(ContainerReplicator, self)._choose_replication_mode( + node, rinfo, info, local_sync, broker, http, + different_region) + return shard_range_success and success + + def _fetch_and_merge_shard_ranges(self, http, broker): + response = http.replicate('get_shard_ranges') + if is_success(response.status): + broker.merge_shard_ranges(json.loads(response.data)) + def find_local_handoff_for_part(self, part): """ Look through devices in the ring for the first handoff device that was @@ -202,6 +245,18 @@ class ContainerReplicator(db_replicator.Replicator): # replication broker.update_reconciler_sync(max_sync) + def cleanup_post_replicate(self, broker, orig_info, responses): + debug_template = 'Not deleting db %s (%%s)' % broker.db_file + if broker.sharding_required(): + # despite being a handoff, since 
we're sharding we're not going to + # do any cleanup so we can continue cleaving - this is still + # considered "success" + reason = 'requires sharding, state %s' % broker.get_db_state() + self.logger.debug(debug_template, reason) + return True + return super(ContainerReplicator, self).cleanup_post_replicate( + broker, orig_info, responses) + def delete_db(self, broker): """ Ensure that reconciler databases are only cleaned up at the end of the @@ -255,9 +310,20 @@ class ContainerReplicator(db_replicator.Replicator): self.replicate_reconcilers() return rv + def _in_sync(self, rinfo, info, broker, local_sync): + # TODO: don't always sync shard ranges! + if broker.get_shard_ranges(include_own=True, include_deleted=True): + return False + + return super(ContainerReplicator, self)._in_sync( + rinfo, info, broker, local_sync) + class ContainerReplicatorRpc(db_replicator.ReplicatorRpc): + def _db_file_exists(self, db_path): + return bool(get_db_files(db_path)) + def _parse_sync_args(self, args): parent = super(ContainerReplicatorRpc, self) remote_info = parent._parse_sync_args(args) @@ -285,3 +351,27 @@ class ContainerReplicatorRpc(db_replicator.ReplicatorRpc): timestamp=status_changed_at) info = broker.get_replication_info() return info + + def _abort_rsync_then_merge(self, db_file, old_filename): + if super(ContainerReplicatorRpc, self)._abort_rsync_then_merge( + db_file, old_filename): + return True + # if the local db has started sharding since the original 'sync' + # request then abort object replication now; instantiate a fresh broker + # each time this check is performed so as to get latest state + broker = ContainerBroker(db_file) + return broker.sharding_initiated() + + def _post_rsync_then_merge_hook(self, existing_broker, new_broker): + # Note the following hook will need to change to using a pointer and + # limit in the future. 
+ new_broker.merge_shard_ranges( + existing_broker.get_all_shard_range_data()) + + def merge_shard_ranges(self, broker, args): + broker.merge_shard_ranges(args[0]) + return HTTPAccepted() + + def get_shard_ranges(self, broker, args): + return HTTPOk(headers={'Content-Type': 'application/json'}, + body=json.dumps(broker.get_all_shard_range_data())) diff --git a/test/unit/cli/test_info.py b/test/unit/cli/test_info.py index d1ea79cff3..1d5c56e9f4 100644 --- a/test/unit/cli/test_info.py +++ b/test/unit/cli/test_info.py @@ -31,6 +31,7 @@ from swift.cli.info import (print_db_info_metadata, print_ring_locations, parse_get_node_args) from swift.account.server import AccountController from swift.container.server import ContainerController +from swift.container.backend import UNSHARDED, SHARDED from swift.obj.diskfile import write_metadata @@ -103,17 +104,18 @@ class TestCliInfo(TestCliInfoBase): self.assertRaisesMessage(ValueError, 'Info is incomplete', print_db_info_metadata, 'container', {}, {}) - info = dict( - account='acct', - created_at=100.1, - put_timestamp=106.3, - delete_timestamp=107.9, - status_changed_at=108.3, - container_count='3', - object_count='20', - bytes_used='42') - info['hash'] = 'abaddeadbeefcafe' - info['id'] = 'abadf100d0ddba11' + info = { + 'account': 'acct', + 'created_at': 100.1, + 'put_timestamp': 106.3, + 'delete_timestamp': 107.9, + 'status_changed_at': 108.3, + 'container_count': '3', + 'object_count': '20', + 'bytes_used': '42', + 'hash': 'abaddeadbeefcafe', + 'id': 'abadf100d0ddba11', + } md = {'x-account-meta-mydata': ('swift', '0000000000.00000'), 'x-other-something': ('boo', '0000000000.00000')} out = StringIO() @@ -154,7 +156,9 @@ No system metadata found in db file reported_object_count='20', reported_bytes_used='42', x_container_foo='bar', - x_container_bar='goo') + x_container_bar='goo', + db_state=UNSHARDED, + is_root=True) info['hash'] = 'abaddeadbeefcafe' info['id'] = 'abadf100d0ddba11' md = {'x-container-sysmeta-mydata': 
('swift', '0000000000.00000')} @@ -182,10 +186,88 @@ Metadata: X-Container-Bar: goo X-Container-Foo: bar System Metadata: {'mydata': 'swift'} -No user metadata found in db file''' % POLICIES[0].name +No user metadata found in db file +Sharding Metadata: + Type: root + State: unsharded''' % POLICIES[0].name self.assertEqual(sorted(out.getvalue().strip().split('\n')), sorted(exp_out.split('\n'))) + def test_print_db_info_metadata_with_shard_ranges(self): + + shard_ranges = [utils.ShardRange( + name='.sharded_a/shard_range_%s' % i, + timestamp=utils.Timestamp(i), lower='%da' % i, + upper='%dz' % i, object_count=i, bytes_used=i, + meta_timestamp=utils.Timestamp(i)) for i in range(1, 4)] + shard_ranges[0].state = utils.ShardRange.CLEAVED + shard_ranges[1].state = utils.ShardRange.CREATED + + info = dict( + account='acct', + container='cont', + storage_policy_index=0, + created_at='0000000100.10000', + put_timestamp='0000000106.30000', + delete_timestamp='0000000107.90000', + status_changed_at='0000000108.30000', + object_count='20', + bytes_used='42', + reported_put_timestamp='0000010106.30000', + reported_delete_timestamp='0000010107.90000', + reported_object_count='20', + reported_bytes_used='42', + db_state=SHARDED, + is_root=True, + shard_ranges=shard_ranges) + info['hash'] = 'abaddeadbeefcafe' + info['id'] = 'abadf100d0ddba11' + out = StringIO() + with mock.patch('sys.stdout', out): + print_db_info_metadata('container', info, {}) + exp_out = '''Path: /acct/cont + Account: acct + Container: cont + Container Hash: d49d0ecbb53be1fcc49624f2f7c7ccae +Metadata: + Created at: 1970-01-01T00:01:40.100000 (0000000100.10000) + Put Timestamp: 1970-01-01T00:01:46.300000 (0000000106.30000) + Delete Timestamp: 1970-01-01T00:01:47.900000 (0000000107.90000) + Status Timestamp: 1970-01-01T00:01:48.300000 (0000000108.30000) + Object Count: 20 + Bytes Used: 42 + Storage Policy: %s (0) + Reported Put Timestamp: 1970-01-01T02:48:26.300000 (0000010106.30000) + Reported Delete Timestamp: 
1970-01-01T02:48:27.900000 (0000010107.90000) + Reported Object Count: 20 + Reported Bytes Used: 42 + Chexor: abaddeadbeefcafe + UUID: abadf100d0ddba11 +No system metadata found in db file +No user metadata found in db file +Sharding Metadata: + Type: root + State: sharded +Shard Ranges (3): + Name: .sharded_a/shard_range_1 + lower: '1a', upper: '1z' + Object Count: 1, Bytes Used: 1, State: cleaved (30) + Created at: 1970-01-01T00:00:01.000000 (0000000001.00000) + Meta Timestamp: 1970-01-01T00:00:01.000000 (0000000001.00000) + Name: .sharded_a/shard_range_2 + lower: '2a', upper: '2z' + Object Count: 2, Bytes Used: 2, State: created (20) + Created at: 1970-01-01T00:00:02.000000 (0000000002.00000) + Meta Timestamp: 1970-01-01T00:00:02.000000 (0000000002.00000) + Name: .sharded_a/shard_range_3 + lower: '3a', upper: '3z' + Object Count: 3, Bytes Used: 3, State: found (10) + Created at: 1970-01-01T00:00:03.000000 (0000000003.00000) + Meta Timestamp: 1970-01-01T00:00:03.000000 (0000000003.00000)''' %\ + POLICIES[0].name + self.assertEqual(sorted(out.getvalue().strip().split('\n')), + sorted(exp_out.strip().split('\n'))) + def test_print_ring_locations_invalid_args(self): self.assertRaises(ValueError, print_ring_locations, None, 'dir', 'acct') @@ -423,14 +505,8 @@ No user metadata found in db file''' % POLICIES[0].name '1', 'b47', 'dc5be2aa4347a22a0fee6bc7de505b47', 'dc5be2aa4347a22a0fee6bc7de505b47.db') - try: - print_info('account', db_file, swift_dir=self.testdir) - except Exception: - exp_raised = True - if exp_raised: - self.fail("Unexpected exception raised") - else: - self.assertGreater(len(out.getvalue().strip()), 800) + print_info('account', db_file, swift_dir=self.testdir) + self.assertGreater(len(out.getvalue().strip()), 800) controller = ContainerController( {'devices': self.testdir, 'mount_check': 'false'}) diff --git a/test/unit/common/test_db_replicator.py b/test/unit/common/test_db_replicator.py index 20c5d6738a..e4fdce8e91 100644 --- 
a/test/unit/common/test_db_replicator.py +++ b/test/unit/common/test_db_replicator.py @@ -274,6 +274,9 @@ class FakeBroker(object): self.put_timestamp = put_timestamp self.delete_timestamp = delete_timestamp + def get_brokers(self): + return [self] + class FakeAccountBroker(FakeBroker): db_type = 'account' @@ -1205,7 +1208,7 @@ class TestDBReplicator(unittest.TestCase): unit.mock_check_drive(isdir=True): mock_os.path.exists.side_effect = [False, True] response = rpc.dispatch(('drive', 'part', 'hash'), - ['complete_rsync', 'arg1', 'arg2']) + ['complete_rsync', 'arg1']) expected_calls = [call('/part/ash/hash/hash.db'), call('/drive/tmp/arg1')] self.assertEqual(mock_os.path.exists.call_args_list, @@ -1213,6 +1216,19 @@ class TestDBReplicator(unittest.TestCase): self.assertEqual('204 No Content', response.status) self.assertEqual(204, response.status_int) + with patch('swift.common.db_replicator.os', + new=mock.MagicMock(wraps=os)) as mock_os, \ + unit.mock_check_drive(isdir=True): + mock_os.path.exists.side_effect = [False, True] + response = rpc.dispatch(('drive', 'part', 'hash'), + ['complete_rsync', 'arg1', 'arg2']) + expected_calls = [call('/part/ash/hash/arg2'), + call('/drive/tmp/arg1')] + self.assertEqual(mock_os.path.exists.call_args_list, + expected_calls) + self.assertEqual('204 No Content', response.status) + self.assertEqual(204, response.status_int) + def test_rsync_then_merge_db_does_not_exist(self): rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, mount_check=False) @@ -1267,13 +1283,22 @@ class TestDBReplicator(unittest.TestCase): rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, mount_check=False) + with patch('swift.common.db_replicator.os', + new=mock.MagicMock(wraps=os)) as mock_os, \ + unit.mock_check_drive(isdir=True): + mock_os.path.exists.return_value = True + response = rpc.complete_rsync('drive', '/data/db.db', ['arg1']) + mock_os.path.exists.assert_called_with('/data/db.db') + self.assertEqual('404 Not Found', response.status) + 
self.assertEqual(404, response.status_int) + with patch('swift.common.db_replicator.os', new=mock.MagicMock(wraps=os)) as mock_os, \ unit.mock_check_drive(isdir=True): mock_os.path.exists.return_value = True response = rpc.complete_rsync('drive', '/data/db.db', ['arg1', 'arg2']) - mock_os.path.exists.assert_called_with('/data/db.db') + mock_os.path.exists.assert_called_with('/data/arg2') self.assertEqual('404 Not Found', response.status) self.assertEqual(404, response.status_int) @@ -1286,37 +1311,57 @@ class TestDBReplicator(unittest.TestCase): unit.mock_check_drive(isdir=True): mock_os.path.exists.return_value = False response = rpc.complete_rsync('drive', '/data/db.db', - ['arg1', 'arg2']) + ['arg1']) expected_calls = [call('/data/db.db'), call('/drive/tmp/arg1')] self.assertEqual(expected_calls, mock_os.path.exists.call_args_list) self.assertEqual('404 Not Found', response.status) self.assertEqual(404, response.status_int) + with patch('swift.common.db_replicator.os', + new=mock.MagicMock(wraps=os)) as mock_os, \ + unit.mock_check_drive(isdir=True): + mock_os.path.exists.return_value = False + response = rpc.complete_rsync('drive', '/data/db.db', + ['arg1', 'arg2']) + expected_calls = [call('/data/arg2'), call('/drive/tmp/arg1')] + self.assertEqual(expected_calls, + mock_os.path.exists.call_args_list) + self.assertEqual('404 Not Found', response.status) + self.assertEqual(404, response.status_int) + def test_complete_rsync_rename(self): rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, mount_check=False) - def mock_exists(path): - if path == '/data/db.db': - return False - self.assertEqual('/drive/tmp/arg1', path) - return True - def mock_renamer(old, new): - self.assertEqual('/drive/tmp/arg1', old) - self.assertEqual('/data/db.db', new) + renamer_calls.append((old, new)) self._patch(patch.object, db_replicator, 'renamer', mock_renamer) + renamer_calls = [] + with patch('swift.common.db_replicator.os', + new=mock.MagicMock(wraps=os)) as mock_os, \ + 
unit.mock_check_drive(isdir=True): + mock_os.path.exists.side_effect = [False, True] + response = rpc.complete_rsync('drive', '/data/db.db', + ['arg1']) + self.assertEqual('204 No Content', response.status) + self.assertEqual(204, response.status_int) + self.assertEqual(('/drive/tmp/arg1', '/data/db.db'), renamer_calls[0]) + self.assertFalse(renamer_calls[1:]) + + renamer_calls = [] with patch('swift.common.db_replicator.os', new=mock.MagicMock(wraps=os)) as mock_os, \ unit.mock_check_drive(isdir=True): mock_os.path.exists.side_effect = [False, True] response = rpc.complete_rsync('drive', '/data/db.db', ['arg1', 'arg2']) - self.assertEqual('204 No Content', response.status) - self.assertEqual(204, response.status_int) + self.assertEqual('204 No Content', response.status) + self.assertEqual(204, response.status_int) + self.assertEqual(('/drive/tmp/arg1', '/data/arg2'), renamer_calls[0]) + self.assertFalse(renamer_calls[1:]) def test_replicator_sync_with_broker_replication_missing_table(self): rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, @@ -1675,10 +1720,10 @@ class TestDBReplicator(unittest.TestCase): db_file = __file__ replicator = TestReplicator({}) replicator._http_connect(node, partition, db_file) + expected_hsh = os.path.basename(db_file).split('.', 1)[0] + expected_hsh = expected_hsh.split('_', 1)[0] db_replicator.ReplConnection.assert_has_calls([ - mock.call(node, partition, - os.path.basename(db_file).split('.', 1)[0], - replicator.logger)]) + mock.call(node, partition, expected_hsh, replicator.logger)]) class TestHandoffsOnly(unittest.TestCase): diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index 1f495ac876..33a437262a 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -3878,6 +3878,47 @@ cluster_dfw1 = http://dfw1.host/v1/ found = utils.find_shard_range('l', overlapping_ranges) self.assertEqual(found, ktol) + def test_parse_db_filename(self): + actual = 
utils.parse_db_filename('hash.db') + self.assertEqual(('hash', None, '.db'), actual) + actual = utils.parse_db_filename('hash_1234567890.12345.db') + self.assertEqual(('hash', '1234567890.12345', '.db'), actual) + actual = utils.parse_db_filename( + '/dev/containers/part/ash/hash/hash_1234567890.12345.db') + self.assertEqual(('hash', '1234567890.12345', '.db'), actual) + self.assertRaises(ValueError, utils.parse_db_filename, '/path/to/dir/') + # These shouldn't come up in practice; included for completeness + self.assertEqual(utils.parse_db_filename('hashunder_.db'), + ('hashunder', '', '.db')) + self.assertEqual(utils.parse_db_filename('lots_of_underscores.db'), + ('lots', 'of', '.db')) + + def test_make_db_file_path(self): + epoch = utils.Timestamp.now() + actual = utils.make_db_file_path('hash.db', epoch) + self.assertEqual('hash_%s.db' % epoch.internal, actual) + + actual = utils.make_db_file_path('hash_oldepoch.db', epoch) + self.assertEqual('hash_%s.db' % epoch.internal, actual) + + actual = utils.make_db_file_path('/path/to/hash.db', epoch) + self.assertEqual('/path/to/hash_%s.db' % epoch.internal, actual) + + epoch = utils.Timestamp.now() + actual = utils.make_db_file_path(actual, epoch) + self.assertEqual('/path/to/hash_%s.db' % epoch.internal, actual) + + # epochs shouldn't have offsets + epoch = utils.Timestamp.now(offset=10) + actual = utils.make_db_file_path(actual, epoch) + self.assertEqual('/path/to/hash_%s.db' % epoch.normal, actual) + + self.assertRaises(ValueError, utils.make_db_file_path, + '/path/to/hash.db', 'bad epoch') + + self.assertRaises(ValueError, utils.make_db_file_path, + '/path/to/hash.db', None) + def test_modify_priority(self): pid = os.getpid() logger = debug_logger() @@ -4168,6 +4209,70 @@ cluster_dfw1 = http://dfw1.host/v1/ # iterators self.assertListEqual([1, 4, 6, 2, 5, 7, 3, 8, 9], got) + @with_tempdir + def test_get_db_files(self, tempdir): + dbdir = os.path.join(tempdir, 'dbdir') + self.assertEqual([], 
utils.get_db_files(dbdir)) + path_1 = os.path.join(dbdir, 'dbfile.db') + self.assertEqual([], utils.get_db_files(path_1)) + os.mkdir(dbdir) + self.assertEqual([], utils.get_db_files(path_1)) + with open(path_1, 'wb'): + pass + self.assertEqual([path_1], utils.get_db_files(path_1)) + + path_2 = os.path.join(dbdir, 'dbfile_2.db') + self.assertEqual([path_1], utils.get_db_files(path_2)) + + with open(path_2, 'wb'): + pass + + self.assertEqual([path_1, path_2], utils.get_db_files(path_1)) + self.assertEqual([path_1, path_2], utils.get_db_files(path_2)) + + path_3 = os.path.join(dbdir, 'dbfile_3.db') + self.assertEqual([path_1, path_2], utils.get_db_files(path_3)) + + with open(path_3, 'wb'): + pass + + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_1)) + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_2)) + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_3)) + + other_hash = os.path.join(dbdir, 'other.db') + self.assertEqual([], utils.get_db_files(other_hash)) + other_hash = os.path.join(dbdir, 'other_1.db') + self.assertEqual([], utils.get_db_files(other_hash)) + + pending = os.path.join(dbdir, 'dbfile.pending') + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(pending)) + + with open(pending, 'wb'): + pass + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(pending)) + + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_1)) + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_2)) + self.assertEqual([path_1, path_2, path_3], utils.get_db_files(path_3)) + self.assertEqual([], utils.get_db_files(dbdir)) + + os.unlink(path_1) + self.assertEqual([path_2, path_3], utils.get_db_files(path_1)) + self.assertEqual([path_2, path_3], utils.get_db_files(path_2)) + self.assertEqual([path_2, path_3], utils.get_db_files(path_3)) + + os.unlink(path_2) + self.assertEqual([path_3], utils.get_db_files(path_1)) + self.assertEqual([path_3], utils.get_db_files(path_2)) + 
self.assertEqual([path_3], utils.get_db_files(path_3)) + + os.unlink(path_3) + self.assertEqual([], utils.get_db_files(path_1)) + self.assertEqual([], utils.get_db_files(path_2)) + self.assertEqual([], utils.get_db_files(path_3)) + self.assertEqual([], utils.get_db_files('/path/to/nowhere')) + class ResellerConfReader(unittest.TestCase): diff --git a/test/unit/container/test_backend.py b/test/unit/container/test_backend.py index 9a3d86d4d4..0069f812e1 100644 --- a/test/unit/container/test_backend.py +++ b/test/unit/container/test_backend.py @@ -14,9 +14,10 @@ # limitations under the License. """ Tests for swift.container.backend """ - +import errno import os import hashlib +import inspect import unittest from time import sleep, time from uuid import uuid4 @@ -27,20 +28,34 @@ import sqlite3 import pickle import json +from swift.common.exceptions import LockTimeout from swift.container.backend import ContainerBroker, \ - update_new_item_from_existing -from swift.common.utils import Timestamp, encode_timestamps, hash_path + update_new_item_from_existing, UNSHARDED, SHARDING, SHARDED, \ + COLLAPSED, SHARD_LISTING_STATES, SHARD_UPDATE_STATES +from swift.common.db import DatabaseAlreadyExists, GreenDBConnection +from swift.common.utils import Timestamp, encode_timestamps, hash_path, \ + ShardRange, make_db_file_path from swift.common.storage_policy import POLICIES import mock +from test import annotate_failure from test.unit import (patch_policies, with_tempdir, make_timestamp_iter, - EMPTY_ETAG) + EMPTY_ETAG, FakeLogger, mock_timestamp_now) from test.unit.common import test_db class TestContainerBroker(unittest.TestCase): """Tests for ContainerBroker""" + expected_db_tables = {'outgoing_sync', 'incoming_sync', 'object', + 'sqlite_sequence', 'policy_stat', + 'container_info', 'shard_range'} + + def _assert_shard_ranges(self, broker, expected, include_own=False): + actual = broker.get_shard_ranges(include_deleted=True, + include_own=include_own) + 
self.assertEqual([dict(sr) for sr in expected], + [dict(sr) for sr in actual]) def test_creation(self): # Test ContainerBroker.__init__ @@ -51,6 +66,23 @@ class TestContainerBroker(unittest.TestCase): curs = conn.cursor() curs.execute('SELECT 1') self.assertEqual(curs.fetchall()[0][0], 1) + curs.execute("SELECT name FROM sqlite_master WHERE type='table';") + self.assertEqual(self.expected_db_tables, + {row[0] for row in curs.fetchall()}) + # check the update trigger + broker.put_object('blah', Timestamp.now().internal, 0, 'text/plain', + 'etag', 0, 0) + with broker.get() as conn: + with self.assertRaises(sqlite3.DatabaseError) as cm: + conn.execute('UPDATE object SET name="blah";') + self.assertIn('UPDATE not allowed', str(cm.exception)) + if 'shard_range' in self.expected_db_tables: + # check the update trigger + broker.merge_shard_ranges(broker.get_own_shard_range()) + with broker.get() as conn: + with self.assertRaises(sqlite3.DatabaseError) as cm: + conn.execute('UPDATE shard_range SET name="blah";') + self.assertIn('UPDATE not allowed', str(cm.exception)) @patch_policies def test_storage_policy_property(self): @@ -91,16 +123,296 @@ class TestContainerBroker(unittest.TestCase): pass self.assertTrue(broker.conn is None) - def test_empty(self): + @with_tempdir + def test_is_deleted(self, tempdir): + # Test ContainerBroker.is_deleted() and get_info_is_deleted() + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + + self.assertFalse(broker.is_deleted()) + broker.delete_db(next(ts_iter).internal) + self.assertTrue(broker.is_deleted()) + + def check_object_counted(broker_to_test, broker_with_object): + obj = {'name': 'o', 'created_at': next(ts_iter).internal, + 'size': 0, 'content_type': 'text/plain', 'etag': EMPTY_ETAG, + 'deleted': 0} + broker_with_object.merge_items([dict(obj)]) + 
self.assertFalse(broker_to_test.is_deleted()) + info, deleted = broker_to_test.get_info_is_deleted() + self.assertFalse(deleted) + self.assertEqual(1, info['object_count']) + obj.update({'created_at': next(ts_iter).internal, 'deleted': 1}) + broker_with_object.merge_items([dict(obj)]) + self.assertTrue(broker_to_test.is_deleted()) + info, deleted = broker_to_test.get_info_is_deleted() + self.assertTrue(deleted) + self.assertEqual(0, info['object_count']) + + def check_object_not_counted(broker): + obj = {'name': 'o', 'created_at': next(ts_iter).internal, + 'size': 0, 'content_type': 'text/plain', 'etag': EMPTY_ETAG, + 'deleted': 0} + broker.merge_items([dict(obj)]) + self.assertTrue(broker.is_deleted()) + info, deleted = broker.get_info_is_deleted() + self.assertTrue(deleted) + self.assertEqual(0, info['object_count']) + obj.update({'created_at': next(ts_iter).internal, 'deleted': 1}) + broker.merge_items([dict(obj)]) + self.assertTrue(broker.is_deleted()) + info, deleted = broker.get_info_is_deleted() + self.assertTrue(deleted) + self.assertEqual(0, info['object_count']) + + def check_shard_ranges_not_counted(): + sr = ShardRange('.shards_a/shard_c', next(ts_iter), object_count=0) + sr.update_meta(13, 99, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.is_deleted()) + info, deleted = broker.get_info_is_deleted() + self.assertTrue(deleted) + self.assertEqual(0, info['object_count']) + + def check_shard_ranges_counted(): + sr = ShardRange('.shards_a/shard_c', next(ts_iter), object_count=0) + sr.update_meta(13, 99, meta_timestamp=next(ts_iter)) + counted_states = (ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + expected = state not in counted_states + self.assertEqual(expected, broker.is_deleted()) 
+ info, deleted = broker.get_info_is_deleted() + self.assertEqual(expected, deleted) + self.assertEqual(0 if expected else 13, info['object_count']) + + sr.update_meta(0, 0, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.is_deleted()) + info, deleted = broker.get_info_is_deleted() + self.assertTrue(deleted) + self.assertEqual(0, info['object_count']) + + # unsharded + check_object_counted(broker, broker) + check_shard_ranges_not_counted() + + # move to sharding state + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + broker.delete_db(next(ts_iter).internal) + self.assertTrue(broker.is_deleted()) + + # check object in retiring db is considered + check_object_counted(broker, broker.get_brokers()[0]) + self.assertTrue(broker.is_deleted()) + check_shard_ranges_not_counted() + # misplaced object in fresh db is not considered + check_object_not_counted(broker) + + # move to sharded state + self.assertTrue(broker.set_sharded_state()) + check_object_not_counted(broker) + check_shard_ranges_counted() + + # own shard range has no influence + own_sr = broker.get_own_shard_range() + own_sr.update_meta(3, 4, meta_timestamp=next(ts_iter)) + broker.merge_shard_ranges([own_sr]) + self.assertTrue(broker.is_deleted()) + + @with_tempdir + def test_empty(self, tempdir): # Test ContainerBroker.empty - broker = ContainerBroker(':memory:', account='a', container='c') - broker.initialize(Timestamp('1').internal, 0) + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + self.assertTrue(broker.is_root_container()) + + def check_object_counted(broker_to_test, broker_with_object): + obj = {'name': 'o', 'created_at': next(ts_iter).internal, + 'size': 0, 
'content_type': 'text/plain', 'etag': EMPTY_ETAG, + 'deleted': 0} + broker_with_object.merge_items([dict(obj)]) + self.assertFalse(broker_to_test.empty()) + # and delete it + obj.update({'created_at': next(ts_iter).internal, 'deleted': 1}) + broker_with_object.merge_items([dict(obj)]) + self.assertTrue(broker_to_test.empty()) + + def check_shard_ranges_not_counted(): + sr = ShardRange('.shards_a/shard_c', next(ts_iter), object_count=0) + sr.update_meta(13, 99, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.empty()) + + # empty other shard ranges do not influence result + sr.update_meta(0, 0, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.empty()) + self.assertTrue(broker.empty()) - broker.put_object('o', Timestamp.now().internal, 0, 'text/plain', - 'd41d8cd98f00b204e9800998ecf8427e') - self.assertTrue(not broker.empty()) - sleep(.00001) - broker.delete_object('o', Timestamp.now().internal) + check_object_counted(broker, broker) + check_shard_ranges_not_counted() + + # own shard range is not considered for object count + own_sr = broker.get_own_shard_range() + self.assertEqual(0, own_sr.object_count) + broker.merge_shard_ranges([own_sr]) + self.assertTrue(broker.empty()) + + broker.put_object('o', next(ts_iter).internal, 0, 'text/plain', + EMPTY_ETAG) + own_sr = broker.get_own_shard_range() + self.assertEqual(1, own_sr.object_count) + broker.merge_shard_ranges([own_sr]) + self.assertFalse(broker.empty()) + broker.delete_object('o', next(ts_iter).internal) + self.assertTrue(broker.empty()) + + # have own shard range but in state ACTIVE + self.assertEqual(ShardRange.ACTIVE, own_sr.state) + check_object_counted(broker, broker) + check_shard_ranges_not_counted() + + def check_shard_ranges_counted(): + # other 
shard range is considered + sr = ShardRange('.shards_a/shard_c', next(ts_iter), object_count=0) + sr.update_meta(13, 99, meta_timestamp=next(ts_iter)) + counted_states = (ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertEqual(state not in counted_states, broker.empty()) + + # empty other shard ranges do not influence result + sr.update_meta(0, 0, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.empty()) + + # enable sharding + broker.enable_sharding(next(ts_iter)) + check_object_counted(broker, broker) + check_shard_ranges_counted() + + # move to sharding state + self.assertTrue(broker.set_sharding_state()) + # check object in retiring db is considered + check_object_counted(broker, broker.get_brokers()[0]) + self.assertTrue(broker.empty()) + # as well as misplaced objects in fresh db + check_object_counted(broker, broker) + check_shard_ranges_counted() + + # move to sharded state + self.assertTrue(broker.set_sharded_state()) + self.assertTrue(broker.empty()) + check_object_counted(broker, broker) + check_shard_ranges_counted() + + # own shard range still has no influence + own_sr = broker.get_own_shard_range() + own_sr.update_meta(3, 4, meta_timestamp=next(ts_iter)) + broker.merge_shard_ranges([own_sr]) + self.assertTrue(broker.empty()) + + @with_tempdir + def test_empty_shard_container(self, tempdir): + # Test ContainerBroker.empty for a shard container where shard range + # usage should not be considered + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='.shards_a', container='cc') + broker.initialize(next(ts_iter).internal, 0) + broker.set_sharding_sysmeta('Root', 'a/c') + 
self.assertFalse(broker.is_root_container()) + + def check_object_counted(broker_to_test, broker_with_object): + obj = {'name': 'o', 'created_at': next(ts_iter).internal, + 'size': 0, 'content_type': 'text/plain', 'etag': EMPTY_ETAG, + 'deleted': 0} + broker_with_object.merge_items([dict(obj)]) + self.assertFalse(broker_to_test.empty()) + # and delete it + obj.update({'created_at': next(ts_iter).internal, 'deleted': 1}) + broker_with_object.merge_items([dict(obj)]) + self.assertTrue(broker_to_test.empty()) + + self.assertTrue(broker.empty()) + check_object_counted(broker, broker) + + # own shard range is not considered for object count + own_sr = broker.get_own_shard_range() + self.assertEqual(0, own_sr.object_count) + broker.merge_shard_ranges([own_sr]) + self.assertTrue(broker.empty()) + + broker.put_object('o', next(ts_iter).internal, 0, 'text/plain', + EMPTY_ETAG) + own_sr = broker.get_own_shard_range() + self.assertEqual(1, own_sr.object_count) + broker.merge_shard_ranges([own_sr]) + self.assertFalse(broker.empty()) + broker.delete_object('o', next(ts_iter).internal) + self.assertTrue(broker.empty()) + + def check_shard_ranges_not_counted(): + sr = ShardRange('.shards_a/shard_c', next(ts_iter), object_count=0) + sr.update_meta(13, 99, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.empty()) + + # empty other shard ranges do not influence result + sr.update_meta(0, 0, meta_timestamp=next(ts_iter)) + for state in ShardRange.STATES: + sr.update_state(state, state_timestamp=next(ts_iter)) + broker.merge_shard_ranges([sr]) + self.assertTrue(broker.empty()) + + check_shard_ranges_not_counted() + + # move to sharding state + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + + # check object in retiring db is considered + check_object_counted(broker, broker.get_brokers()[0]) + 
self.assertTrue(broker.empty()) + # as well as misplaced objects in fresh db + check_object_counted(broker, broker) + check_shard_ranges_not_counted() + + # move to sharded state + self.assertTrue(broker.set_sharded_state()) + self.assertTrue(broker.empty()) + check_object_counted(broker, broker) + check_shard_ranges_not_counted() + + # own shard range still has no influence + own_sr = broker.get_own_shard_range() + own_sr.update_meta(3, 4, meta_timestamp=next(ts_iter)) + broker.merge_shard_ranges([own_sr]) self.assertTrue(broker.empty()) def test_reclaim(self): @@ -163,6 +475,77 @@ class TestContainerBroker(unittest.TestCase): broker.reclaim(Timestamp.now().internal, time()) broker.delete_db(Timestamp.now().internal) + @with_tempdir + def test_reclaim_deadlock(self, tempdir): + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(Timestamp(100).internal, 0) + # there's some magic count here that causes the failure, something + # about the size of object records and sqlite page size maybe? 
+ count = 23000 + for i in range(count): + obj_name = 'o%d' % i + ts = Timestamp(200).internal + broker.delete_object(obj_name, ts) + broker._commit_puts() + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT count(*) FROM object").fetchone()[0], count) + # make a broker whose container attribute is not yet set so that + # reclaim will need to query info to set it + broker = ContainerBroker(db_path, timeout=1) + # verify that reclaim doesn't get deadlocked and timeout + broker.reclaim(300, 300) + # check all objects were reclaimed + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT count(*) FROM object" + ).fetchone()[0], 0) + + @with_tempdir + def test_reclaim_shard_ranges(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + older = next(ts_iter) + same = next(ts_iter) + newer = next(ts_iter) + shard_ranges = [ + ShardRange('.shards_a/older_deleted', older.internal, '', 'a', + deleted=True), + ShardRange('.shards_a/same_deleted', same.internal, 'a', 'b', + deleted=True), + ShardRange('.shards_a/newer_deleted', newer.internal, 'b', 'c', + deleted=True), + ShardRange('.shards_a/older', older.internal, 'c', 'd'), + ShardRange('.shards_a/same', same.internal, 'd', 'e'), + ShardRange('.shards_a/newer', newer.internal, 'e', 'f'), + # own shard range is never reclaimed, even if deleted + ShardRange('a/c', older.internal, '', '', deleted=True)] + broker.merge_shard_ranges( + random.sample(shard_ranges, len(shard_ranges))) + + def assert_row_count(expected): + with broker.get() as conn: + res = conn.execute("SELECT count(*) FROM shard_range") + self.assertEqual(expected, res.fetchone()[0]) + + broker.reclaim(older.internal, older.internal) + assert_row_count(7) + self._assert_shard_ranges(broker, shard_ranges, include_own=True) + 
broker.reclaim(older.internal, same.internal) + assert_row_count(6) + self._assert_shard_ranges(broker, shard_ranges[1:], include_own=True) + broker.reclaim(older.internal, newer.internal) + assert_row_count(5) + self._assert_shard_ranges(broker, shard_ranges[2:], include_own=True) + broker.reclaim(older.internal, next(ts_iter).internal) + assert_row_count(4) + self._assert_shard_ranges(broker, shard_ranges[3:], include_own=True) + def test_get_info_is_deleted(self): ts = make_timestamp_iter() start = next(ts) @@ -179,7 +562,8 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['delete_timestamp'], '0') if self.__class__ in (TestContainerBrokerBeforeMetadata, TestContainerBrokerBeforeXSync, - TestContainerBrokerBeforeSPI): + TestContainerBrokerBeforeSPI, + TestContainerBrokerBeforeShardRanges): self.assertEqual(info['status_changed_at'], '0') else: self.assertEqual(info['status_changed_at'], @@ -431,6 +815,273 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(conn.execute( "SELECT deleted FROM object").fetchone()[0], 0) + def test_merge_shard_range_single_record(self): + # Test ContainerBroker.merge_shard_range + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + + ts_iter = make_timestamp_iter() + # Stash these for later + old_put_timestamp = next(ts_iter).internal + old_delete_timestamp = next(ts_iter).internal + + # Create initial object + timestamp = next(ts_iter).internal + meta_timestamp = next(ts_iter).internal + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'low', 'up', meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + 
self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'low') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'up') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 0) + + # Reput same event + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'low', 'up', meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'low') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'up') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 0) + + # Put new event + timestamp = next(ts_iter).internal + meta_timestamp = next(ts_iter).internal + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'lower', 'upper', 1, 2, meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM 
shard_range").fetchone()[0], 'lower') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'upper') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 1) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 2) + + # Put old event + broker.merge_shard_ranges( + ShardRange('"a/{}"', old_put_timestamp, + 'lower', 'upper', 1, 2, meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) # Not old_put_timestamp! + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'lower') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'upper') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 1) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 2) + + # Put old delete event + broker.merge_shard_ranges( + ShardRange('"a/{}"', old_delete_timestamp, + 'lower', 'upper', meta_timestamp=meta_timestamp, + deleted=1)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) # Not old_delete_timestamp! 
+ self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'lower') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'upper') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 1) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 2) + + # Put new delete event + timestamp = next(ts_iter).internal + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'lower', 'upper', meta_timestamp=meta_timestamp, + deleted=1)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 1) + + # Put new event + timestamp = next(ts_iter).internal + meta_timestamp = next(ts_iter).internal + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'lowerer', 'upperer', 3, 4, + meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'lowerer') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'upperer') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 3) + 
self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 4) + + # We'll use this later + in_between_timestamp = next(ts_iter).internal + + # New update event, meta_timestamp increases + meta_timestamp = next(ts_iter).internal + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'lowerer', 'upperer', 3, 4, + meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'lowerer') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'upperer') + self.assertEqual(conn.execute( + "SELECT deleted FROM shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 3) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 4) + + # Put event from after last put but before last post + timestamp = in_between_timestamp + broker.merge_shard_ranges( + ShardRange('"a/{}"', timestamp, + 'lowererer', 'uppererer', 5, 6, + meta_timestamp=meta_timestamp)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT name FROM shard_range").fetchone()[0], + '"a/{}"') + self.assertEqual(conn.execute( + "SELECT timestamp FROM shard_range").fetchone()[0], + timestamp) + self.assertEqual(conn.execute( + "SELECT meta_timestamp FROM shard_range").fetchone()[0], + meta_timestamp) + self.assertEqual(conn.execute( + "SELECT lower FROM shard_range").fetchone()[0], 'lowererer') + self.assertEqual(conn.execute( + "SELECT upper FROM shard_range").fetchone()[0], 'uppererer') + self.assertEqual(conn.execute( + "SELECT deleted FROM 
shard_range").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT object_count FROM shard_range").fetchone()[0], 5) + self.assertEqual(conn.execute( + "SELECT bytes_used FROM shard_range").fetchone()[0], 6) + + def test_merge_shard_ranges_deleted(self): + # Test ContainerBroker.merge_shard_ranges sets deleted attribute + ts_iter = make_timestamp_iter() + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + # put shard range + broker.merge_shard_ranges(ShardRange('a/o', next(ts_iter).internal)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT count(*) FROM shard_range " + "WHERE deleted = 0").fetchone()[0], 1) + self.assertEqual(conn.execute( + "SELECT count(*) FROM shard_range " + "WHERE deleted = 1").fetchone()[0], 0) + + # delete shard range + broker.merge_shard_ranges(ShardRange('a/o', next(ts_iter).internal, + deleted=1)) + with broker.get() as conn: + self.assertEqual(conn.execute( + "SELECT count(*) FROM shard_range " + "WHERE deleted = 0").fetchone()[0], 0) + self.assertEqual(conn.execute( + "SELECT count(*) FROM shard_range " + "WHERE deleted = 1").fetchone()[0], 1) + def test_make_tuple_for_pickle(self): record = {'name': 'obj', 'created_at': '1234567890.12345', @@ -618,6 +1269,194 @@ class TestContainerBroker(unittest.TestCase): broker = ContainerBroker(':memory:', account='a', container='c') self._test_put_object_multiple_encoded_timestamps(broker) + @with_tempdir + def test_get_db_state(self, tempdir): + acct = 'account' + cont = 'container' + hsh = hash_path(acct, cont) + db_file = "%s.db" % hsh + epoch = Timestamp.now() + fresh_db_file = "%s_%s.db" % (hsh, epoch.normal) + db_path = os.path.join(tempdir, db_file) + fresh_db_path = os.path.join(tempdir, fresh_db_file) + ts = Timestamp.now() + + # First test NOTFOUND state + broker = ContainerBroker(db_path, account=acct, container=cont) + self.assertEqual(broker.get_db_state(), 'not_found') + + # Test UNSHARDED 
state, that is when db_file exists and fresh_db_file + # doesn't + broker.initialize(ts.internal, 0) + self.assertEqual(broker.get_db_state(), 'unsharded') + + # Test the SHARDING state, this is the period when both the db_file and + # the fresh_db_file exist + fresh_broker = ContainerBroker(fresh_db_path, account=acct, + container=cont, force_db_file=True) + fresh_broker.initialize(ts.internal, 0) + own_shard_range = fresh_broker.get_own_shard_range() + own_shard_range.update_state(ShardRange.SHARDING) + own_shard_range.epoch = epoch + shard_range = ShardRange( + '.shards_%s/%s' % (acct, cont), Timestamp.now()) + fresh_broker.merge_shard_ranges([own_shard_range, shard_range]) + + self.assertEqual(fresh_broker.get_db_state(), 'sharding') + # old broker will also change state if we reload its db files + broker.reload_db_files() + self.assertEqual(broker.get_db_state(), 'sharding') + + # Test the SHARDED state, this is when only fresh_db_file exists. + os.unlink(db_path) + fresh_broker.reload_db_files() + self.assertEqual(fresh_broker.get_db_state(), 'sharded') + + # Test the COLLAPSED state, this is when only fresh_db_file exists. 
+ shard_range.deleted = 1 + shard_range.timestamp = Timestamp.now() + fresh_broker.merge_shard_ranges([shard_range]) + self.assertEqual(fresh_broker.get_db_state(), 'collapsed') + + # back to UNSHARDED if the desired epoch changes + own_shard_range.update_state(ShardRange.SHRINKING, + state_timestamp=Timestamp.now()) + own_shard_range.epoch = Timestamp.now() + fresh_broker.merge_shard_ranges([own_shard_range]) + self.assertEqual(fresh_broker.get_db_state(), 'unsharded') + + @with_tempdir + def test_db_file(self, tempdir): + acct = 'account' + cont = 'continer' + hsh = hash_path(acct, cont) + db_file = "%s.db" % hsh + ts_epoch = Timestamp.now() + fresh_db_file = "%s_%s.db" % (hsh, ts_epoch.normal) + db_path = os.path.join(tempdir, db_file) + fresh_db_path = os.path.join(tempdir, fresh_db_file) + ts = Timestamp.now() + + # First test NOTFOUND state, this will return the db_file passed + # in the constructor + def check_unfound_db_files(broker, init_db_file): + self.assertEqual(init_db_file, broker.db_file) + self.assertEqual(broker._db_file, db_path) + self.assertFalse(os.path.exists(db_path)) + self.assertFalse(os.path.exists(fresh_db_path)) + self.assertEqual([], broker.db_files) + + broker = ContainerBroker(db_path, account=acct, container=cont) + check_unfound_db_files(broker, db_path) + broker = ContainerBroker(fresh_db_path, account=acct, container=cont) + check_unfound_db_files(broker, fresh_db_path) + + # Test UNSHARDED state, that is when db_file exists and fresh_db_file + # doesn't, so it should return the db_path + def check_unsharded_db_files(broker): + self.assertEqual(broker.db_file, db_path) + self.assertEqual(broker._db_file, db_path) + self.assertTrue(os.path.exists(db_path)) + self.assertFalse(os.path.exists(fresh_db_path)) + self.assertEqual([db_path], broker.db_files) + + broker = ContainerBroker(db_path, account=acct, container=cont) + broker.initialize(ts.internal, 0) + check_unsharded_db_files(broker) + broker = ContainerBroker(fresh_db_path, 
account=acct, container=cont) + check_unsharded_db_files(broker) + # while UNSHARDED db_path is still used despite giving fresh_db_path + # to init, so we cannot initialize this broker + with self.assertRaises(DatabaseAlreadyExists): + broker.initialize(ts.internal, 0) + + # Test the SHARDING state, this is the period when both the db_file and + # the fresh_db_file exist, in this case it should return the + # fresh_db_path. + def check_sharding_db_files(broker): + self.assertEqual(broker.db_file, fresh_db_path) + self.assertEqual(broker._db_file, db_path) + self.assertTrue(os.path.exists(db_path)) + self.assertTrue(os.path.exists(fresh_db_path)) + self.assertEqual([db_path, fresh_db_path], broker.db_files) + + # Use force_db_file to have db_shard_path created when initializing + broker = ContainerBroker(fresh_db_path, account=acct, + container=cont, force_db_file=True) + self.assertEqual([db_path], broker.db_files) + broker.initialize(ts.internal, 0) + check_sharding_db_files(broker) + broker = ContainerBroker(db_path, account=acct, container=cont) + check_sharding_db_files(broker) + broker = ContainerBroker(fresh_db_path, account=acct, container=cont) + check_sharding_db_files(broker) + + # force_db_file can be used to open db_path specifically + forced_broker = ContainerBroker(db_path, account=acct, + container=cont, force_db_file=True) + self.assertEqual(forced_broker.db_file, db_path) + self.assertEqual(forced_broker._db_file, db_path) + + def check_sharded_db_files(broker): + self.assertEqual(broker.db_file, fresh_db_path) + self.assertEqual(broker._db_file, db_path) + self.assertFalse(os.path.exists(db_path)) + self.assertTrue(os.path.exists(fresh_db_path)) + self.assertEqual([fresh_db_path], broker.db_files) + + # Test the SHARDED state, this is when only fresh_db_file exists, so + # obviously this should return the fresh_db_path + os.unlink(db_path) + broker.reload_db_files() + check_sharded_db_files(broker) + broker = ContainerBroker(db_path, account=acct, 
container=cont) + check_sharded_db_files(broker) + + @with_tempdir + def test_sharding_initiated_and_required(self, tempdir): + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(Timestamp.now().internal, 0) + # no shard ranges + self.assertIs(False, broker.sharding_initiated()) + self.assertIs(False, broker.sharding_required()) + # only own shard range + own_sr = broker.get_own_shard_range() + for state in ShardRange.STATES: + own_sr.update_state(state, state_timestamp=Timestamp.now()) + broker.merge_shard_ranges(own_sr) + self.assertIs(False, broker.sharding_initiated()) + self.assertIs(False, broker.sharding_required()) + + # shard ranges, still ACTIVE + own_sr.update_state(ShardRange.ACTIVE, + state_timestamp=Timestamp.now()) + broker.merge_shard_ranges(own_sr) + broker.merge_shard_ranges(ShardRange('.shards_a/cc', Timestamp.now())) + self.assertIs(False, broker.sharding_initiated()) + self.assertIs(False, broker.sharding_required()) + + # shard ranges and SHARDING, SHRINKING or SHARDED + broker.enable_sharding(Timestamp.now()) + self.assertTrue(broker.set_sharding_state()) + self.assertIs(True, broker.sharding_initiated()) + self.assertIs(True, broker.sharding_required()) + + epoch = broker.db_epoch + own_sr.update_state(ShardRange.SHRINKING, + state_timestamp=Timestamp.now()) + own_sr.epoch = epoch + broker.merge_shard_ranges(own_sr) + self.assertIs(True, broker.sharding_initiated()) + self.assertIs(True, broker.sharding_required()) + + own_sr.update_state(ShardRange.SHARDED) + broker.merge_shard_ranges(own_sr) + self.assertTrue(broker.set_sharded_state()) + self.assertIs(True, broker.sharding_initiated()) + self.assertIs(False, broker.sharding_required()) + @with_tempdir def test_put_object_multiple_encoded_timestamps_using_file(self, tempdir): # Test ContainerBroker.put_object with differing data, content-type @@ -968,7 +1807,8 @@ class 
TestContainerBroker(unittest.TestCase): self.assertEqual(info['delete_timestamp'], '0') if self.__class__ in (TestContainerBrokerBeforeMetadata, TestContainerBrokerBeforeXSync, - TestContainerBrokerBeforeSPI): + TestContainerBrokerBeforeSPI, + TestContainerBrokerBeforeShardRanges): self.assertEqual(info['status_changed_at'], '0') else: self.assertEqual(info['status_changed_at'], @@ -1014,6 +1854,84 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['x_container_sync_point1'], -1) self.assertEqual(info['x_container_sync_point2'], -1) + @with_tempdir + def test_get_info_sharding_states(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'part', 'suffix', 'hash', 'hash.db') + broker = ContainerBroker( + db_path, account='myaccount', container='mycontainer') + broker.initialize(next(ts_iter).internal, 0) + broker.put_object('o1', next(ts_iter).internal, 123, 'text/plain', + 'fake etag') + sr = ShardRange('.shards_a/c', next(ts_iter)) + broker.merge_shard_ranges(sr) + + def check_info(expected): + errors = [] + for k, v in expected.items(): + if info.get(k) != v: + errors.append((k, v, info.get(k))) + if errors: + self.fail('Mismatches: %s' % ', '.join( + ['%s should be %s but got %s' % error + for error in errors])) + + # unsharded + with mock.patch.object( + broker, 'get_shard_usage') as mock_get_shard_usage: + info = broker.get_info() + mock_get_shard_usage.assert_not_called() + check_info({'account': 'myaccount', + 'container': 'mycontainer', + 'object_count': 1, + 'bytes_used': 123, + 'db_state': 'unsharded'}) + + # sharding + epoch = next(ts_iter) + broker.enable_sharding(epoch) + self.assertTrue(broker.set_sharding_state()) + broker.put_object('o2', next(ts_iter).internal, 1, 'text/plain', + 'fake etag') + broker.put_object('o3', next(ts_iter).internal, 320, 'text/plain', + 'fake etag') + with mock.patch.object( + broker, 'get_shard_usage') as mock_get_shard_usage: + info = broker.get_info() + 
mock_get_shard_usage.assert_not_called() + check_info({'account': 'myaccount', + 'container': 'mycontainer', + 'object_count': 1, + 'bytes_used': 123, + 'db_state': 'sharding'}) + + # sharded + self.assertTrue(broker.set_sharded_state()) + shard_stats = {'object_count': 1001, 'bytes_used': 3003} + with mock.patch.object( + broker, 'get_shard_usage') as mock_get_shard_usage: + mock_get_shard_usage.return_value = shard_stats + info = broker.get_info() + mock_get_shard_usage.assert_called_once_with() + check_info({'account': 'myaccount', + 'container': 'mycontainer', + 'object_count': 1001, + 'bytes_used': 3003, + 'db_state': 'sharded'}) + + # collapsed + sr.set_deleted(next(ts_iter)) + broker.merge_shard_ranges(sr) + with mock.patch.object( + broker, 'get_shard_usage') as mock_get_shard_usage: + info = broker.get_info() + mock_get_shard_usage.assert_not_called() + check_info({'account': 'myaccount', + 'container': 'mycontainer', + 'object_count': 2, + 'bytes_used': 321, + 'db_state': 'collapsed'}) + def test_set_x_syncs(self): broker = ContainerBroker(':memory:', account='test1', container='test2') @@ -1095,6 +2013,105 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['reported_object_count'], 2) self.assertEqual(info['reported_bytes_used'], 1123) + def test_get_objects(self): + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + ts_iter = make_timestamp_iter() + objects_0 = [{'name': 'obj_0_%d' % i, + 'created_at': next(ts_iter).normal, + 'content_type': 'text/plain', + 'etag': 'etag_%d' % i, + 'size': 1024 * i, + 'deleted': i % 2, + 'storage_policy_index': 0 + } for i in range(1, 8)] + objects_1 = [{'name': 'obj_1_%d' % i, + 'created_at': next(ts_iter).normal, + 'content_type': 'text/plain', + 'etag': 'etag_%d' % i, + 'size': 1024 * i, + 'deleted': i % 2, + 'storage_policy_index': 1 + } for i in range(1, 8)] + # merge_objects mutates items + broker.merge_items([dict(obj) for obj in 
objects_0 + objects_1]) + + actual = broker.get_objects() + self.assertEqual(objects_0 + objects_1, actual) + + with mock.patch('swift.container.backend.CONTAINER_LISTING_LIMIT', 2): + actual = broker.get_objects() + self.assertEqual(objects_0[:2], actual) + + with mock.patch('swift.container.backend.CONTAINER_LISTING_LIMIT', 2): + actual = broker.get_objects(limit=9) + self.assertEqual(objects_0 + objects_1[:2], actual) + + actual = broker.get_objects(marker=objects_0[2]['name']) + self.assertEqual(objects_0[3:] + objects_1, actual) + + actual = broker.get_objects(end_marker=objects_0[2]['name']) + self.assertEqual(objects_0[:2], actual) + + actual = broker.get_objects(include_deleted=True) + self.assertEqual(objects_0[::2] + objects_1[::2], actual) + + actual = broker.get_objects(include_deleted=False) + self.assertEqual(objects_0[1::2] + objects_1[1::2], actual) + + actual = broker.get_objects(include_deleted=None) + self.assertEqual(objects_0 + objects_1, actual) + + def test_get_objects_since_row(self): + ts_iter = make_timestamp_iter() + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + obj_names = ['obj%03d' % i for i in range(20)] + timestamps = [next(ts_iter) for o in obj_names] + for name, timestamp in zip(obj_names, timestamps): + broker.put_object(name, timestamp.internal, + 0, 'text/plain', EMPTY_ETAG) + broker._commit_puts() # ensure predictable row order + timestamps = [next(ts_iter) for o in obj_names[10:]] + for name, timestamp in zip(obj_names[10:], timestamps): + broker.put_object(name, timestamp.internal, + 0, 'text/plain', EMPTY_ETAG, deleted=1) + broker._commit_puts() # ensure predictable row order + + # sanity check + self.assertEqual(30, broker.get_max_row()) + actual = broker.get_objects() + self.assertEqual(obj_names, [o['name'] for o in actual]) + + # all rows included + actual = broker.get_objects(since_row=None) + self.assertEqual(obj_names, [o['name'] for o in actual]) + 
+ actual = broker.get_objects(since_row=-1) + self.assertEqual(obj_names, [o['name'] for o in actual]) + + # selected rows + for since_row in range(10): + actual = broker.get_objects(since_row=since_row) + with annotate_failure(since_row): + self.assertEqual(obj_names[since_row:], + [o['name'] for o in actual]) + + for since_row in range(10, 20): + actual = broker.get_objects(since_row=since_row) + with annotate_failure(since_row): + self.assertEqual(obj_names[10:], + [o['name'] for o in actual]) + + for since_row in range(20, len(obj_names) + 1): + actual = broker.get_objects(since_row=since_row) + with annotate_failure(since_row): + self.assertEqual(obj_names[since_row - 10:], + [o['name'] for o in actual]) + + self.assertFalse(broker.get_objects(end_marker=obj_names[5], + since_row=5)) + def test_list_objects_iter(self): # Test ContainerBroker.list_objects_iter broker = ContainerBroker(':memory:', account='a', container='c') @@ -1827,6 +2844,21 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(['a', 'b', 'c'], sorted([rec['name'] for rec in items])) + @with_tempdir + def test_merge_items_is_green(self, tempdir): + ts = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts).internal, 1) + + broker.put_object('b', next(ts).internal, 0, 'text/plain', + EMPTY_ETAG) + + with mock.patch('swift.container.backend.tpool') as mock_tpool: + broker.get_info() + mock_tpool.execute.assert_called_once() + def test_merge_items_overwrite_unicode(self): # test DatabaseBroker.merge_items snowman = u'\N{SNOWMAN}'.encode('utf-8') @@ -1937,7 +2969,8 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(0, info['bytes_used']) if self.__class__ in (TestContainerBrokerBeforeMetadata, TestContainerBrokerBeforeXSync, - TestContainerBrokerBeforeSPI): + TestContainerBrokerBeforeSPI, + TestContainerBrokerBeforeShardRanges): 
self.assertEqual(info['status_changed_at'], '0') else: self.assertEqual(timestamp.internal, info['status_changed_at']) @@ -2116,6 +3149,1237 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(ts.internal, broker.get_info()['put_timestamp']) self.assertEqual(0, broker.get_info()['storage_policy_index']) + epoch = Timestamp.now() + broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c3', + epoch=epoch) + hsh = hash_path('a', 'c3') + expected_path = os.path.join( + tempdir, 'containers', '0', hsh[-3:], + hsh, '%s_%s.db' % (hsh, epoch.internal)) + self.assertEqual(expected_path, broker.db_file) + + @with_tempdir + def test_pending_file_name(self, tempdir): + # pending file should have same name for sharded or unsharded db + expected_pending_path = os.path.join(tempdir, 'container.db.pending') + + db_path = os.path.join(tempdir, 'container.db') + fresh_db_path = os.path.join(tempdir, 'container_epoch.db') + + def do_test(given_db_file, expected_db_file): + broker = ContainerBroker(given_db_file, account='a', container='c') + self.assertEqual(expected_pending_path, broker.pending_file) + self.assertEqual(expected_db_file, broker.db_file) + + # no files exist + do_test(db_path, db_path) + do_test(fresh_db_path, fresh_db_path) + + # only container.db exists - unsharded + with open(db_path, 'wb'): + pass + do_test(db_path, db_path) + do_test(fresh_db_path, db_path) + + # container.db and container_shard.db exist - sharding + with open(fresh_db_path, 'wb'): + pass + do_test(db_path, fresh_db_path) + do_test(fresh_db_path, fresh_db_path) + + # only container_shard.db exists - sharded + os.unlink(db_path) + do_test(db_path, fresh_db_path) + do_test(fresh_db_path, fresh_db_path) + + @with_tempdir + def test_sharding_sysmeta(self, tempdir): + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker( + db_path, account='myaccount', container='mycontainer') + broker.initialize(Timestamp.now().internal) + + expected = 'aaa/ccc' + with 
mock_timestamp_now() as now: + broker.set_sharding_sysmeta('Root', expected) + actual = broker.metadata + self.assertEqual([expected, now.internal], + actual.get('X-Container-Sysmeta-Shard-Root')) + self.assertEqual(expected, broker.get_sharding_sysmeta('Root')) + + expected = {'key': 'value'} + with mock_timestamp_now() as now: + broker.set_sharding_sysmeta('test', expected) + actual = broker.metadata + self.assertEqual([expected, now.internal], + actual.get('X-Container-Sysmeta-Shard-test')) + self.assertEqual(expected, broker.get_sharding_sysmeta('test')) + + @with_tempdir + def test_path(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker( + db_path, account='myaccount', container='mycontainer') + broker.initialize(next(ts_iter).internal, 1) + # make sure we can cope with uninitialized account and container + broker.account = broker.container = None + self.assertEqual('myaccount/mycontainer', broker.path) + + @with_tempdir + def test_root_account_container_path(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker( + db_path, account='root_a', container='root_c') + broker.initialize(next(ts_iter).internal, 1) + # make sure we can cope with uninitialized account and container + broker.account = broker.container = None + + self.assertEqual('root_a', broker.root_account) + self.assertEqual('root_c', broker.root_container) + self.assertEqual('root_a/root_c', broker.root_path) + self.assertTrue(broker.is_root_container()) + self.assertEqual('root_a', broker.account) # sanity check + self.assertEqual('root_c', broker.container) # sanity check + + # we don't expect root containers to have this sysmeta set but if it is + # the broker should still behave like a root container + metadata = { + 'X-Container-Sysmeta-Shard-Root': + ('root_a/root_c', next(ts_iter).internal)} + broker = ContainerBroker( + db_path, account='root_a', 
container='root_c') + broker.update_metadata(metadata) + broker.account = broker.container = None + self.assertEqual('root_a', broker.root_account) + self.assertEqual('root_c', broker.root_container) + self.assertEqual('root_a/root_c', broker.root_path) + self.assertTrue(broker.is_root_container()) + + # if root is marked deleted, it still considers itself to be a root + broker.delete_db(next(ts_iter).internal) + self.assertEqual('root_a', broker.root_account) + self.assertEqual('root_c', broker.root_container) + self.assertEqual('root_a/root_c', broker.root_path) + self.assertTrue(broker.is_root_container()) + # check the values are not just being cached + broker = ContainerBroker(db_path) + self.assertEqual('root_a', broker.root_account) + self.assertEqual('root_c', broker.root_container) + self.assertEqual('root_a/root_c', broker.root_path) + self.assertTrue(broker.is_root_container()) + + # check a shard container + db_path = os.path.join(tempdir, 'shard_container.db') + broker = ContainerBroker( + db_path, account='.shards_root_a', container='c_shard') + broker.initialize(next(ts_iter).internal, 1) + # now the metadata is significant... 
+ metadata = { + 'X-Container-Sysmeta-Shard-Root': + ('root_a/root_c', next(ts_iter).internal)} + broker.update_metadata(metadata) + broker.account = broker.container = None + broker._root_account = broker._root_container = None + + self.assertEqual('root_a', broker.root_account) + self.assertEqual('root_c', broker.root_container) + self.assertEqual('root_a/root_c', broker.root_path) + self.assertFalse(broker.is_root_container()) + + # check validation + def check_validation(root_value): + metadata = { + 'X-Container-Sysmeta-Shard-Root': + (root_value, next(ts_iter).internal)} + broker.update_metadata(metadata) + broker.account = broker.container = None + broker._root_account = broker._root_container = None + with self.assertRaises(ValueError) as cm: + broker.root_account + self.assertIn('Expected X-Container-Sysmeta-Shard-Root', + str(cm.exception)) + with self.assertRaises(ValueError): + broker.root_container + + check_validation('root_a') + check_validation('/root_a') + check_validation('/root_a/root_c') + check_validation('/root_a/root_c/blah') + check_validation('/') + + def test_resolve_shard_range_states(self): + self.assertIsNone(ContainerBroker.resolve_shard_range_states(None)) + self.assertIsNone(ContainerBroker.resolve_shard_range_states([])) + + for state_num, state_name in ShardRange.STATES.items(): + self.assertEqual({state_num}, + ContainerBroker.resolve_shard_range_states( + [state_name])) + self.assertEqual({state_num}, + ContainerBroker.resolve_shard_range_states( + [state_num])) + + self.assertEqual(set(ShardRange.STATES), + ContainerBroker.resolve_shard_range_states( + ShardRange.STATES_BY_NAME)) + + self.assertEqual( + set(ShardRange.STATES), + ContainerBroker.resolve_shard_range_states(ShardRange.STATES)) + + # check aliases + self.assertEqual( + {ShardRange.CLEAVED, ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING}, + ContainerBroker.resolve_shard_range_states(['listing'])) + + self.assertEqual( + {ShardRange.CLEAVED, 
ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING}, + ContainerBroker.resolve_shard_range_states(['listing', 'active'])) + + self.assertEqual( + {ShardRange.CLEAVED, ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING, ShardRange.CREATED}, + ContainerBroker.resolve_shard_range_states(['listing', 'created'])) + + self.assertEqual( + {ShardRange.CREATED, ShardRange.CLEAVED, ShardRange.ACTIVE, + ShardRange.SHARDING}, + ContainerBroker.resolve_shard_range_states(['updating'])) + + self.assertEqual( + {ShardRange.CREATED, ShardRange.CLEAVED, ShardRange.ACTIVE, + ShardRange.SHARDING, ShardRange.SHRINKING}, + ContainerBroker.resolve_shard_range_states( + ['updating', 'listing'])) + + def check_bad_value(value): + with self.assertRaises(ValueError) as cm: + ContainerBroker.resolve_shard_range_states(value) + self.assertIn('Invalid state', str(cm.exception)) + + check_bad_value(['bad_state', 'active']) + check_bad_value(['']) + check_bad_value('active') + + @with_tempdir + def test_get_shard_ranges(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + + # no rows + self.assertFalse(broker.get_shard_ranges()) + # check that a default own shard range is not generated + self.assertFalse(broker.get_shard_ranges(include_own=True)) + + # merge row for own shard range + own_shard_range = ShardRange(broker.path, next(ts_iter), 'l', 'u', + state=ShardRange.SHARDING) + broker.merge_shard_ranges([own_shard_range]) + self.assertFalse(broker.get_shard_ranges()) + self.assertFalse(broker.get_shard_ranges(include_own=False)) + + actual = broker.get_shard_ranges(include_own=True) + self.assertEqual([dict(sr) for sr in [own_shard_range]], + [dict(sr) for sr in actual]) + + # merge rows for other shard ranges + shard_ranges = [ + ShardRange('.a/c0', next(ts_iter), 'a', 'c'), + ShardRange('.a/c1', next(ts_iter), 
'c', 'd'), + ShardRange('.a/c2', next(ts_iter), 'd', 'f', + state=ShardRange.ACTIVE), + ShardRange('.a/c3', next(ts_iter), 'e', 'f', deleted=1, + state=ShardRange.SHARDED,), + ShardRange('.a/c4', next(ts_iter), 'f', 'h', + state=ShardRange.CREATED), + ShardRange('.a/c5', next(ts_iter), 'h', 'j', deleted=1) + ] + broker.merge_shard_ranges(shard_ranges) + actual = broker.get_shard_ranges() + undeleted = shard_ranges[:3] + shard_ranges[4:5] + self.assertEqual([dict(sr) for sr in undeleted], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(include_deleted=True) + self.assertEqual([dict(sr) for sr in shard_ranges], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(reverse=True) + self.assertEqual([dict(sr) for sr in reversed(undeleted)], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(marker='c', end_marker='e') + self.assertEqual([dict(sr) for sr in shard_ranges[1:3]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(marker='c', end_marker='e', + states=ShardRange.ACTIVE) + self.assertEqual([dict(sr) for sr in shard_ranges[2:3]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(marker='e', end_marker='e') + self.assertFalse([dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(includes='f') + self.assertEqual([dict(sr) for sr in shard_ranges[2:3]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(includes='i') + self.assertFalse(actual) + + actual = broker.get_shard_ranges( + states=[ShardRange.CREATED, ShardRange.ACTIVE]) + self.assertEqual( + [dict(sr) for sr in [shard_ranges[2], shard_ranges[4]]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(exclude_states=ShardRange.CREATED) + self.assertEqual([dict(sr) for sr in shard_ranges[:3]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges( + exclude_states=[ShardRange.CREATED, ShardRange.ACTIVE]) + self.assertEqual([dict(sr) for sr in shard_ranges[:2]], + 
[dict(sr) for sr in actual]) + + # exclude_states takes precedence + actual = broker.get_shard_ranges( + states=ShardRange.CREATED, exclude_states=ShardRange.CREATED) + self.assertEqual([dict(sr) for sr in shard_ranges[:3]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(states=[ShardRange.CREATED], + exclude_states=[ShardRange.ACTIVE]) + self.assertEqual([dict(sr) for sr in shard_ranges[4:5]], + [dict(sr) for sr in actual]) + + # get everything + actual = broker.get_shard_ranges(include_own=True) + self.assertEqual([dict(sr) for sr in undeleted + [own_shard_range]], + [dict(sr) for sr in actual]) + + # get just own range + actual = broker.get_shard_ranges(include_own=True, exclude_others=True) + self.assertEqual([dict(sr) for sr in [own_shard_range]], + [dict(sr) for sr in actual]) + + # exclude_states overrides include_own + actual = broker.get_shard_ranges(include_own=True, + exclude_states=ShardRange.SHARDING, + exclude_others=True) + self.assertFalse(actual) + + # if you ask for nothing you'll get nothing + actual = broker.get_shard_ranges( + include_own=False, exclude_others=True) + self.assertFalse(actual) + + @with_tempdir + def test_get_shard_ranges_with_sharding_overlaps(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + shard_ranges = [ + ShardRange('.shards_a/c0', next(ts_iter), 'a', 'd', + state=ShardRange.ACTIVE), + ShardRange('.shards_a/c1_0', next(ts_iter), 'd', 'g', + state=ShardRange.CLEAVED), + ShardRange('.shards_a/c1_1', next(ts_iter), 'g', 'j', + state=ShardRange.CLEAVED), + ShardRange('.shards_a/c1_2', next(ts_iter), 'j', 'm', + state=ShardRange.CREATED), + ShardRange('.shards_a/c1', next(ts_iter), 'd', 'm', + state=ShardRange.SHARDING), + ShardRange('.shards_a/c2', next(ts_iter), 'm', '', + state=ShardRange.ACTIVE), + ] + broker.merge_shard_ranges( + 
random.sample(shard_ranges, len(shard_ranges))) + actual = broker.get_shard_ranges() + self.assertEqual([dict(sr) for sr in shard_ranges], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(states=SHARD_LISTING_STATES) + self.assertEqual( + [dict(sr) for sr in shard_ranges[:3] + shard_ranges[4:]], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(states=SHARD_UPDATE_STATES, + includes='e') + self.assertEqual([shard_ranges[1]], actual) + actual = broker.get_shard_ranges(states=SHARD_UPDATE_STATES, + includes='j') + self.assertEqual([shard_ranges[2]], actual) + actual = broker.get_shard_ranges(states=SHARD_UPDATE_STATES, + includes='k') + self.assertEqual([shard_ranges[3]], actual) + + @with_tempdir + def test_get_shard_ranges_with_shrinking_overlaps(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + shard_ranges = [ + ShardRange('.shards_a/c0', next(ts_iter), 'a', 'k', + state=ShardRange.ACTIVE), + ShardRange('.shards_a/c1', next(ts_iter), 'k', 'm', + state=ShardRange.SHRINKING), + ShardRange('.shards_a/c2', next(ts_iter), 'k', 't', + state=ShardRange.ACTIVE), + ShardRange('.shards_a/c3', next(ts_iter), 't', '', + state=ShardRange.ACTIVE), + ] + broker.merge_shard_ranges( + random.sample(shard_ranges, len(shard_ranges))) + actual = broker.get_shard_ranges() + self.assertEqual([dict(sr) for sr in shard_ranges], + [dict(sr) for sr in actual]) + + actual = broker.get_shard_ranges(states=SHARD_UPDATE_STATES, + includes='l') + self.assertEqual([shard_ranges[2]], actual) + + @with_tempdir + def test_get_own_shard_range(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker( + db_path, account='.shards_a', container='shard_c') + broker.initialize(next(ts_iter).internal, 0) + + # no row for own shard range - 
expect entire namespace default + now = Timestamp.now() + expected = ShardRange(broker.path, now, '', '', 0, 0, now, + state=ShardRange.ACTIVE) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + self.assertEqual(dict(expected), dict(actual)) + + actual = broker.get_own_shard_range(no_default=True) + self.assertIsNone(actual) + + # row for own shard range and others + ts_1 = next(ts_iter) + own_sr = ShardRange(broker.path, ts_1, 'l', 'u') + broker.merge_shard_ranges( + [own_sr, + ShardRange('.a/c1', next(ts_iter), 'b', 'c'), + ShardRange('.a/c2', next(ts_iter), 'c', 'd')]) + expected = ShardRange(broker.path, ts_1, 'l', 'u', 0, 0, now) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + self.assertEqual(dict(expected), dict(actual)) + + # check stats get updated + broker.put_object( + 'o1', next(ts_iter).internal, 100, 'text/plain', 'etag1') + broker.put_object( + 'o2', next(ts_iter).internal, 99, 'text/plain', 'etag2') + expected = ShardRange( + broker.path, ts_1, 'l', 'u', 2, 199, now) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + self.assertEqual(dict(expected), dict(actual)) + + # still returned when deleted + delete_ts = next(ts_iter) + own_sr.set_deleted(timestamp=delete_ts) + broker.merge_shard_ranges(own_sr) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + expected = ShardRange( + broker.path, delete_ts, 'l', 'u', 2, 199, now, deleted=True) + self.assertEqual(dict(expected), dict(actual)) + + # still in table after reclaim_age + broker.reclaim(next(ts_iter).internal, next(ts_iter).internal) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + self.assertEqual(dict(expected), dict(actual)) + + # entire 
namespace + ts_2 = next(ts_iter) + broker.merge_shard_ranges( + [ShardRange(broker.path, ts_2, '', '')]) + expected = ShardRange( + broker.path, ts_2, '', '', 2, 199, now) + with mock.patch('swift.container.backend.Timestamp.now', + return_value=now): + actual = broker.get_own_shard_range() + self.assertEqual(dict(expected), dict(actual)) + + @with_tempdir + def test_enable_sharding(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'container.db') + broker = ContainerBroker( + db_path, account='.shards_a', container='shard_c') + broker.initialize(next(ts_iter).internal, 0) + epoch = next(ts_iter) + broker.enable_sharding(epoch) + own_sr = broker.get_own_shard_range(no_default=True) + self.assertEqual(epoch, own_sr.epoch) + self.assertEqual(epoch, own_sr.state_timestamp) + self.assertEqual(ShardRange.SHARDING, own_sr.state) + + @with_tempdir + def test_get_shard_usage(self, tempdir): + ts_iter = make_timestamp_iter() + shard_range_by_state = dict( + (state, ShardRange('.shards_a/c_%s' % state, next(ts_iter), + str(state), str(state + 1), + 2 * state, 2 * state + 1, 2, + state=state)) + for state in ShardRange.STATES) + + def make_broker(a, c): + db_path = os.path.join(tempdir, '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account=a, container=c) + broker.initialize(next(ts_iter).internal, 0) + broker.set_sharding_sysmeta('Root', 'a/c') + broker.merge_shard_ranges(shard_range_by_state.values()) + return broker + + # make broker appear to be a root container + broker = make_broker('a', 'c') + self.assertTrue(broker.is_root_container()) + included_states = (ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING) + included = [shard_range_by_state[state] for state in included_states] + expected = { + 'object_count': sum([sr.object_count for sr in included]), + 'bytes_used': sum([sr.bytes_used for sr in included]) + } + self.assertEqual(expected, broker.get_shard_usage()) + + @with_tempdir + def 
_check_find_shard_ranges(self, c_lower, c_upper, tempdir): + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() + container_name = 'test_container' + + def do_test(expected_bounds, expected_last_found, shard_size, limit, + start_index=0, existing=None): + # expected_bounds is a list of tuples (lower, upper, object_count) + # build expected shard ranges + expected_shard_ranges = [ + dict(lower=lower, upper=upper, index=index, + object_count=object_count) + for index, (lower, upper, object_count) + in enumerate(expected_bounds, start_index)] + + with mock.patch('swift.common.utils.time.time', + return_value=float(ts_now.normal)): + ranges, last_found = broker.find_shard_ranges( + shard_size, limit=limit, existing_ranges=existing) + self.assertEqual(expected_shard_ranges, ranges) + self.assertEqual(expected_last_found, last_found) + + db_path = os.path.join(tempdir, 'test_container.db') + broker = ContainerBroker( + db_path, account='a', container=container_name) + # shard size > object count, no objects + broker.initialize(next(ts_iter).internal, 0) + + ts = next(ts_iter) + if c_lower or c_upper: + # testing a shard, so set its own shard range + own_shard_range = ShardRange(broker.path, ts, c_lower, c_upper) + broker.merge_shard_ranges([own_shard_range]) + + self.assertEqual(([], False), broker.find_shard_ranges(10)) + + for i in range(10): + broker.put_object( + 'obj%02d' % i, next(ts_iter).internal, 0, 'text/plain', 'etag') + + expected_bounds = [(c_lower, 'obj04', 5), ('obj04', c_upper, 5)] + do_test(expected_bounds, True, shard_size=5, limit=None) + + expected = [(c_lower, 'obj06', 7), ('obj06', c_upper, 3)] + do_test(expected, True, shard_size=7, limit=None) + expected = [(c_lower, 'obj08', 9), ('obj08', c_upper, 1)] + do_test(expected, True, shard_size=9, limit=None) + # shard size >= object count + do_test([], False, shard_size=10, limit=None) + do_test([], False, shard_size=11, limit=None) + + # check use of limit + do_test([], False, shard_size=4, 
limit=0) + expected = [(c_lower, 'obj03', 4)] + do_test(expected, False, shard_size=4, limit=1) + expected = [(c_lower, 'obj03', 4), ('obj03', 'obj07', 4)] + do_test(expected, False, shard_size=4, limit=2) + expected = [(c_lower, 'obj03', 4), ('obj03', 'obj07', 4), + ('obj07', c_upper, 2)] + do_test(expected, True, shard_size=4, limit=3) + do_test(expected, True, shard_size=4, limit=4) + do_test(expected, True, shard_size=4, limit=-1) + + # increase object count to 11 + broker.put_object( + 'obj10', next(ts_iter).internal, 0, 'text/plain', 'etag') + expected = [(c_lower, 'obj03', 4), ('obj03', 'obj07', 4), + ('obj07', c_upper, 3)] + do_test(expected, True, shard_size=4, limit=None) + + expected = [(c_lower, 'obj09', 10), ('obj09', c_upper, 1)] + do_test(expected, True, shard_size=10, limit=None) + do_test([], False, shard_size=11, limit=None) + + # now pass in a pre-existing shard range + existing = [ShardRange( + '.shards_a/srange-0', Timestamp.now(), '', 'obj03', + object_count=4, state=ShardRange.FOUND)] + + expected = [('obj03', 'obj07', 4), ('obj07', c_upper, 3)] + do_test(expected, True, shard_size=4, limit=None, start_index=1, + existing=existing) + expected = [('obj03', 'obj07', 4)] + do_test(expected, False, shard_size=4, limit=1, start_index=1, + existing=existing) + # using increased shard size should not distort estimation of progress + expected = [('obj03', 'obj09', 6), ('obj09', c_upper, 1)] + do_test(expected, True, shard_size=6, limit=None, start_index=1, + existing=existing) + + # add another existing... 
+ existing.append(ShardRange( + '.shards_a/srange-1', Timestamp.now(), '', 'obj07', + object_count=4, state=ShardRange.FOUND)) + expected = [('obj07', c_upper, 3)] + do_test(expected, True, shard_size=10, limit=None, start_index=2, + existing=existing) + # an existing shard range not in FOUND state should not distort + # estimation of progress, but may cause final range object count to + # default to shard_size + existing[-1].state = ShardRange.CREATED + existing[-1].object_count = 10 + # there's only 3 objects left to scan but progress cannot be reliably + # calculated, so final shard range has object count of 2 + expected = [('obj07', 'obj09', 2), ('obj09', c_upper, 2)] + do_test(expected, True, shard_size=2, limit=None, start_index=2, + existing=existing) + + # add last shard range so there's none left to find + existing.append(ShardRange( + '.shards_a/srange-2', Timestamp.now(), 'obj07', c_upper, + object_count=4, state=ShardRange.FOUND)) + do_test([], True, shard_size=4, limit=None, existing=existing) + + def test_find_shard_ranges(self): + self._check_find_shard_ranges('', '') + self._check_find_shard_ranges('', 'upper') + self._check_find_shard_ranges('lower', '') + self._check_find_shard_ranges('lower', 'upper') + + @with_tempdir + def test_find_shard_ranges_with_misplaced_objects(self, tempdir): + # verify that misplaced objects outside of a shard's range do not + # influence choice of shard ranges (but do distort the object counts) + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() + container_name = 'test_container' + + db_path = os.path.join(tempdir, 'test_container.db') + broker = ContainerBroker( + db_path, account='a', container=container_name) + # shard size > object count, no objects + broker.initialize(next(ts_iter).internal, 0) + + ts = next(ts_iter) + own_shard_range = ShardRange(broker.path, ts, 'l', 'u') + broker.merge_shard_ranges([own_shard_range]) + + self.assertEqual(([], False), broker.find_shard_ranges(10)) + + for name in 
('a-misplaced', 'm', 'n', 'p', 'q', 'r', 'z-misplaced'): + broker.put_object( + name, next(ts_iter).internal, 0, 'text/plain', 'etag') + + expected_bounds = ( + ('l', 'n', 2), # contains m, n + ('n', 'q', 2), # contains p, q + ('q', 'u', 3) # contains r; object count distorted by 2 misplaced + ) + expected_shard_ranges = [ + dict(lower=lower, upper=upper, index=index, + object_count=object_count) + for index, (lower, upper, object_count) + in enumerate(expected_bounds)] + + with mock.patch('swift.common.utils.time.time', + return_value=float(ts_now.normal)): + actual_shard_ranges, last_found = broker.find_shard_ranges(2, -1) + self.assertEqual(expected_shard_ranges, actual_shard_ranges) + + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() + container_name = 'test_container' + + @with_tempdir + def test_find_shard_ranges_errors(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join(tempdir, 'test_container.db') + broker = ContainerBroker(db_path, account='a', container='c', + logger=FakeLogger()) + broker.initialize(next(ts_iter).internal, 0) + for i in range(2): + broker.put_object( + 'obj%d' % i, next(ts_iter).internal, 0, 'text/plain', 'etag') + + klass = 'swift.container.backend.ContainerBroker' + with mock.patch(klass + '._get_next_shard_range_upper', + side_effect=LockTimeout()): + ranges, last_found = broker.find_shard_ranges(1) + self.assertFalse(ranges) + self.assertFalse(last_found) + lines = broker.logger.get_lines_for_level('error') + self.assertIn('Problem finding shard upper', lines[0]) + self.assertFalse(lines[1:]) + + broker.logger.clear() + with mock.patch(klass + '._get_next_shard_range_upper', + side_effect=sqlite3.OperationalError()): + ranges, last_found = broker.find_shard_ranges(1) + self.assertFalse(last_found) + self.assertFalse(ranges) + lines = broker.logger.get_lines_for_level('error') + self.assertIn('Problem finding shard upper', lines[0]) + self.assertFalse(lines[1:]) + + @with_tempdir + def 
test_set_db_states(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + + # load up the broker with some objects + objects = [{'name': 'obj_%d' % i, + 'created_at': next(ts_iter).normal, + 'content_type': 'text/plain', + 'etag': 'etag_%d' % i, + 'size': 1024 * i, + 'deleted': 0, + 'storage_policy_index': 0, + } for i in range(1, 6)] + # merge_items mutates items + broker.merge_items([dict(obj) for obj in objects]) + original_info = broker.get_info() + + # Add some metadata + meta = { + 'X-Container-Meta-Color': ['Blue', next(ts_iter).normal], + 'X-Container-Meta-Cleared': ['', next(ts_iter).normal], + 'X-Container-Sysmeta-Shape': ['Circle', next(ts_iter).normal], + } + broker.update_metadata(meta) + + # Add some syncs + incoming_sync = {'remote_id': 'incoming_123', 'sync_point': 1} + outgoing_sync = {'remote_id': 'outgoing_123', 'sync_point': 2} + broker.merge_syncs([outgoing_sync], incoming=False) + broker.merge_syncs([incoming_sync], incoming=True) + + # Add some ShardRanges + shard_ranges = [ShardRange( + name='.shards_a/shard_range_%s' % i, + timestamp=next(ts_iter), lower='obj_%d' % i, + upper='obj_%d' % (i + 2), + object_count=len(objects[i:i + 2]), + bytes_used=sum(obj['size'] for obj in objects[i:i + 2]), + meta_timestamp=next(ts_iter)) for i in range(0, 6, 2)] + deleted_range = ShardRange('.shards_a/shard_range_z', next(ts_iter), + 'z', '', state=ShardRange.SHARDED, + deleted=1) + own_sr = ShardRange(name='a/c', timestamp=next(ts_iter), + state=ShardRange.ACTIVE) + broker.merge_shard_ranges([own_sr] + shard_ranges + [deleted_range]) + ts_epoch = next(ts_iter) + new_db_path = os.path.join(tempdir, 'part', 'suffix', 'hash', + 'container_%s.db' % ts_epoch.normal) + + def check_broker_properties(broker): + # these broker properties should remain unchanged as state changes + 
self.assertEqual(broker.get_max_row(), 5) + all_metadata = broker.metadata + original_meta = dict((k, all_metadata[k]) for k in meta) + self.assertEqual(original_meta, meta) + self.assertEqual(broker.get_syncs(True)[0], incoming_sync) + self.assertEqual(broker.get_syncs(False)[0], outgoing_sync) + self.assertEqual(shard_ranges + [own_sr, deleted_range], + broker.get_shard_ranges(include_own=True, + include_deleted=True)) + + def check_broker_info(actual_info): + for key in ('db_state', 'id', 'hash'): + actual_info.pop(key, None) + original_info.pop(key, None) + self.assertEqual(original_info, actual_info) + + def check_unsharded_state(broker): + # these are expected properties in unsharded state + self.assertEqual(len(broker.get_brokers()), 1) + self.assertEqual(broker.get_db_state(), UNSHARDED) + self.assertTrue(os.path.exists(db_path)) + self.assertFalse(os.path.exists(new_db_path)) + self.assertEqual(objects, broker.get_objects()) + + # Sanity checks + check_broker_properties(broker) + check_unsharded_state(broker) + check_broker_info(broker.get_info()) + + # first test that moving from UNSHARDED to SHARDED doesn't work + self.assertFalse(broker.set_sharded_state()) + # check nothing changed + check_broker_properties(broker) + check_broker_info(broker.get_info()) + check_unsharded_state(broker) + + # cannot go to SHARDING without an epoch set + self.assertFalse(broker.set_sharding_state()) + + # now set sharding epoch and make sure everything moves. 
+ broker.enable_sharding(ts_epoch) + self.assertTrue(broker.set_sharding_state()) + check_broker_properties(broker) + check_broker_info(broker.get_info()) + + def check_sharding_state(broker): + self.assertEqual(len(broker.get_brokers()), 2) + self.assertEqual(broker.get_db_state(), SHARDING) + self.assertTrue(os.path.exists(db_path)) + self.assertTrue(os.path.exists(new_db_path)) + self.assertEqual([], broker.get_objects()) + self.assertEqual(objects, broker.get_brokers()[0].get_objects()) + check_sharding_state(broker) + + # to confirm we're definitely looking at the shard db + broker2 = ContainerBroker(new_db_path) + check_broker_properties(broker2) + check_broker_info(broker2.get_info()) + self.assertEqual([], broker2.get_objects()) + + # Try to set sharding state again + self.assertFalse(broker.set_sharding_state()) + # check nothing changed + check_broker_properties(broker) + check_broker_info(broker.get_info()) + check_sharding_state(broker) + + # Now move to the final state - update shard ranges' state + broker.merge_shard_ranges( + [dict(sr, state=ShardRange.ACTIVE, + state_timestamp=next(ts_iter).internal) + for sr in shard_ranges]) + # pretend all ranges have been cleaved + self.assertTrue(broker.set_sharded_state()) + check_broker_properties(broker) + check_broker_info(broker.get_info()) + + def check_sharded_state(broker): + self.assertEqual(broker.get_db_state(), SHARDED) + self.assertEqual(len(broker.get_brokers()), 1) + self.assertFalse(os.path.exists(db_path)) + self.assertTrue(os.path.exists(new_db_path)) + self.assertEqual([], broker.get_objects()) + check_sharded_state(broker) + + # Try to set sharded state again + self.assertFalse(broker.set_sharded_state()) + # check nothing changed + check_broker_properties(broker) + check_broker_info(broker.get_info()) + check_sharded_state(broker) + + # delete the container - sharding sysmeta gets erased + broker.delete_db(next(ts_iter).internal) + # but it is not considered deleted while shards have 
content + self.assertFalse(broker.is_deleted()) + check_sharded_state(broker) + # empty the shard ranges + empty_shard_ranges = [sr.copy(object_count=0, bytes_used=0, + meta_timestamp=next(ts_iter)) + for sr in shard_ranges] + broker.merge_shard_ranges(empty_shard_ranges) + # and no it is deleted + self.assertTrue(broker.is_deleted()) + check_sharded_state(broker) + + def do_revive_shard_delete(shard_ranges): + # delete all shard ranges + deleted_shard_ranges = [sr.copy(timestamp=next(ts_iter), deleted=1) + for sr in shard_ranges] + broker.merge_shard_ranges(deleted_shard_ranges) + self.assertEqual(COLLAPSED, broker.get_db_state()) + + # add new shard ranges and go to sharding state - need to force + # broker time to be after the delete time in order to write new + # sysmeta + broker.enable_sharding(next(ts_iter)) + shard_ranges = [sr.copy(timestamp=next(ts_iter)) + for sr in shard_ranges] + broker.merge_shard_ranges(shard_ranges) + with mock.patch('swift.common.db.time.time', + lambda: float(next(ts_iter))): + self.assertTrue(broker.set_sharding_state()) + self.assertEqual(SHARDING, broker.get_db_state()) + + # go to sharded + self.assertTrue( + broker.set_sharded_state()) + self.assertEqual(SHARDED, broker.get_db_state()) + + # delete again + broker.delete_db(next(ts_iter).internal) + self.assertTrue(broker.is_deleted()) + self.assertEqual(SHARDED, broker.get_db_state()) + + do_revive_shard_delete(shard_ranges) + do_revive_shard_delete(shard_ranges) + + @with_tempdir + def test_set_sharding_state_errors(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='a', container='c', + logger=FakeLogger()) + broker.initialize(next(ts_iter).internal, 0) + broker.enable_sharding(next(ts_iter)) + + orig_execute = GreenDBConnection.execute + trigger = 'INSERT into object' + + def mock_execute(conn, *args, **kwargs): + if trigger in args[0]: + raise 
sqlite3.OperationalError() + return orig_execute(conn, *args, **kwargs) + + with mock.patch('swift.common.db.GreenDBConnection.execute', + mock_execute): + res = broker.set_sharding_state() + self.assertFalse(res) + lines = broker.logger.get_lines_for_level('error') + self.assertIn('Failed to set the ROWID', lines[0]) + self.assertFalse(lines[1:]) + + broker.logger.clear() + trigger = 'UPDATE container_stat SET created_at' + with mock.patch('swift.common.db.GreenDBConnection.execute', + mock_execute): + res = broker.set_sharding_state() + self.assertFalse(res) + lines = broker.logger.get_lines_for_level('error') + self.assertIn('Failed to set matching', lines[0]) + self.assertFalse(lines[1:]) + + @with_tempdir + def test_set_sharded_state_errors(self, tempdir): + ts_iter = make_timestamp_iter() + retiring_db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(retiring_db_path, account='a', container='c', + logger=FakeLogger()) + broker.initialize(next(ts_iter).internal, 0) + pre_epoch = next(ts_iter) + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + # unlink fails + with mock.patch('os.unlink', side_effect=OSError(errno.EPERM)): + self.assertFalse(broker.set_sharded_state()) + lines = broker.logger.get_lines_for_level('error') + self.assertIn('Failed to unlink', lines[0]) + self.assertFalse(lines[1:]) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertTrue(os.path.exists(retiring_db_path)) + self.assertTrue(os.path.exists(broker.db_file)) + + # extra files + extra_filename = make_db_file_path(broker.db_file, pre_epoch) + self.assertNotEqual(extra_filename, broker.db_file) # sanity check + with open(extra_filename, 'wb'): + pass + broker.logger.clear() + self.assertFalse(broker.set_sharded_state()) + lines = broker.logger.get_lines_for_level('warning') + self.assertIn('Still have multiple db files', lines[0]) + self.assertFalse(lines[1:]) + 
self.assertFalse(broker.logger.get_lines_for_level('error')) + self.assertTrue(os.path.exists(retiring_db_path)) + self.assertTrue(os.path.exists(broker.db_file)) + + # retiring file missing + broker.logger.clear() + os.unlink(retiring_db_path) + self.assertFalse(broker.set_sharded_state()) + lines = broker.logger.get_lines_for_level('warning') + self.assertIn('Refusing to delete', lines[0]) + self.assertFalse(lines[1:]) + self.assertFalse(broker.logger.get_lines_for_level('error')) + self.assertTrue(os.path.exists(broker.db_file)) + + @with_tempdir + def test_get_brokers(self, tempdir): + ts_iter = make_timestamp_iter() + retiring_db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(retiring_db_path, account='a', container='c', + logger=FakeLogger()) + broker.initialize(next(ts_iter).internal, 0) + brokers = broker.get_brokers() + self.assertEqual(retiring_db_path, brokers[0].db_file) + self.assertFalse(brokers[0].skip_commits) + self.assertFalse(brokers[1:]) + + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + brokers = broker.get_brokers() + self.assertEqual(retiring_db_path, brokers[0].db_file) + self.assertTrue(brokers[0].skip_commits) + self.assertEqual(broker.db_file, brokers[1].db_file) + self.assertFalse(brokers[1].skip_commits) + self.assertFalse(brokers[2:]) + + # same outcome when called on retiring db broker + brokers = brokers[0].get_brokers() + self.assertEqual(retiring_db_path, brokers[0].db_file) + self.assertTrue(brokers[0].skip_commits) + self.assertEqual(broker.db_file, brokers[1].db_file) + self.assertFalse(brokers[1].skip_commits) + self.assertFalse(brokers[2:]) + + self.assertTrue(broker.set_sharded_state()) + brokers = broker.get_brokers() + self.assertEqual(broker.db_file, brokers[0].db_file) + self.assertFalse(brokers[0].skip_commits) + self.assertFalse(brokers[1:]) + + # unexpected extra file should be ignored + with open(retiring_db_path, 'wb'): + 
pass + retiring_db_path = broker.db_file + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + broker.reload_db_files() + self.assertEqual(3, len(broker.db_files)) # sanity check + brokers = broker.get_brokers() + self.assertEqual(retiring_db_path, brokers[0].db_file) + self.assertTrue(brokers[0].skip_commits) + self.assertEqual(broker.db_file, brokers[1].db_file) + self.assertFalse(brokers[1].skip_commits) + self.assertFalse(brokers[2:]) + lines = broker.logger.get_lines_for_level('warning') + self.assertIn('Unexpected db files', lines[0]) + self.assertFalse(lines[1:]) + + @with_tempdir + def test_merge_shard_ranges(self, tempdir): + ts_iter = make_timestamp_iter() + ts = [next(ts_iter) for _ in range(13)] + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker( + db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + + # sanity check + self.assertFalse(broker.get_shard_ranges(include_deleted=True)) + + broker.merge_shard_ranges(None) + self.assertFalse(broker.get_shard_ranges(include_deleted=True)) + + # merge item at ts1 + # sr___ + sr_b_1_1 = ShardRange('a/c_b', ts[1], lower='a', upper='b', + object_count=2) + broker.merge_shard_ranges([sr_b_1_1]) + self._assert_shard_ranges(broker, [sr_b_1_1]) + + # merge older item - ignored + sr_b_0_0 = ShardRange('a/c_b', ts[0], lower='a', upper='b', + object_count=1) + broker.merge_shard_ranges([sr_b_0_0]) + self._assert_shard_ranges(broker, [sr_b_1_1]) + + # merge same timestamp - ignored + broker.merge_shard_ranges([dict(sr_b_1_1, lower='', upper='c')]) + self._assert_shard_ranges(broker, [sr_b_1_1]) + broker.merge_shard_ranges([dict(sr_b_1_1, object_count=99)]) + self._assert_shard_ranges(broker, [sr_b_1_1]) + + # merge list with older item *after* newer item + sr_c_2_2 = ShardRange('a/c_c', ts[2], lower='b', upper='c', + object_count=3) + sr_c_3_3 = ShardRange('a/c_c', ts[3], lower='b', upper='c', + 
object_count=4) + broker.merge_shard_ranges([sr_c_3_3, sr_c_2_2]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_3_3]) + + # merge newer item - updated + sr_c_5_5 = ShardRange('a/c_c', ts[5], lower='b', upper='c', + object_count=5) + broker.merge_shard_ranges([sr_c_5_5]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_5]) + + # merge older metadata item - ignored + sr_c_5_4 = ShardRange('a/c_c', ts[5], lower='b', upper='c', + object_count=6, meta_timestamp=ts[4]) + broker.merge_shard_ranges([sr_c_5_4]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_5]) + + # merge newer metadata item - only metadata is updated + sr_c_5_6 = ShardRange('a/c_c', ts[5], lower='b', upper='c', + object_count=7, meta_timestamp=ts[6]) + broker.merge_shard_ranges([dict(sr_c_5_6, lower='', upper='d')]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_6]) + + # merge older created_at, newer metadata item - ignored + sr_c_4_7 = ShardRange('a/c_c', ts[4], lower='b', upper='c', + object_count=8, meta_timestamp=ts[7]) + broker.merge_shard_ranges([sr_c_4_7]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_6]) + + # merge list with older metadata item *after* newer metadata item + sr_c_5_11 = ShardRange('a/c_c', ts[5], lower='b', upper='c', + object_count=9, meta_timestamp=ts[11]) + broker.merge_shard_ranges([sr_c_5_11, sr_c_5_6]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_11]) + + # deleted item at *same timestamp* as existing - deleted ignored + broker.merge_shard_ranges([dict(sr_b_1_1, deleted=1, object_count=0)]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_11]) + sr_b_1_1.meta_timestamp = ts[11] + broker.merge_shard_ranges([dict(sr_b_1_1, deleted=1)]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_11]) + sr_b_1_1.state_timestamp = ts[11] + broker.merge_shard_ranges([dict(sr_b_1_1, deleted=1)]) + self._assert_shard_ranges(broker, [sr_b_1_1, sr_c_5_11]) + + # delete item at *newer timestamp* - updated + sr_b_2_2_deleted = 
ShardRange('a/c_b', ts[2], lower='a', upper='b', + object_count=0, deleted=1) + broker.merge_shard_ranges([sr_b_2_2_deleted]) + self._assert_shard_ranges(broker, [sr_b_2_2_deleted, sr_c_5_11]) + + # merge list with older undeleted item *after* newer deleted item + # NB deleted timestamp trumps newer meta timestamp + sr_c_9_12 = ShardRange('a/c_c', ts[9], lower='b', upper='c', + object_count=10, meta_timestamp=ts[12]) + sr_c_10_10_deleted = ShardRange('a/c_c', ts[10], lower='b', upper='c', + object_count=0, deleted=1) + broker.merge_shard_ranges([sr_c_10_10_deleted, sr_c_9_12]) + self._assert_shard_ranges( + broker, [sr_b_2_2_deleted, sr_c_10_10_deleted]) + + @with_tempdir + def test_merge_shard_ranges_state(self, tempdir): + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(next(ts_iter).internal, 0) + expected_shard_ranges = [] + + def do_test(orig_state, orig_timestamp, test_state, test_timestamp, + expected_state, expected_timestamp): + index = len(expected_shard_ranges) + sr = ShardRange('a/%s' % index, orig_timestamp, '%03d' % index, + '%03d' % (index + 1), state=orig_state) + broker.merge_shard_ranges([sr]) + sr.state = test_state + sr.state_timestamp = test_timestamp + broker.merge_shard_ranges([sr]) + sr.state = expected_state + sr.state_timestamp = expected_timestamp + expected_shard_ranges.append(sr) + self._assert_shard_ranges(broker, expected_shard_ranges) + + # state at older state_timestamp is not merged + for orig_state in ShardRange.STATES: + for test_state in ShardRange.STATES: + ts_older = next(ts_iter) + ts = next(ts_iter) + do_test(orig_state, ts, test_state, ts_older, orig_state, ts) + + # more advanced state at same timestamp is merged + for orig_state in ShardRange.STATES: + for test_state in ShardRange.STATES: + ts = next(ts_iter) + do_test(orig_state, ts, test_state, ts, + test_state if test_state 
> orig_state else orig_state, + ts) + + # any state at newer timestamp is merged + for orig_state in ShardRange.STATES: + for test_state in ShardRange.STATES: + ts = next(ts_iter) + ts_newer = next(ts_iter) + do_test(orig_state, ts, test_state, ts_newer, test_state, + ts_newer) + + def _check_object_stats_when_sharded(self, a, c, root_a, root_c, tempdir): + # common setup and assertions for root and shard containers + ts_iter = make_timestamp_iter() + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', 'container.db') + broker = ContainerBroker( + db_path, account=a, container=c) + broker.initialize(next(ts_iter).internal, 0) + broker.set_sharding_sysmeta('Root', '%s/%s' % (root_a, root_c)) + broker.merge_items([{'name': 'obj', 'size': 14, 'etag': 'blah', + 'content_type': 'text/plain', 'deleted': 0, + 'created_at': Timestamp.now().internal}]) + self.assertEqual(1, broker.get_info()['object_count']) + self.assertEqual(14, broker.get_info()['bytes_used']) + + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + sr_1 = ShardRange( + '%s/%s1' % (root_a, root_c), Timestamp.now(), lower='', upper='m', + object_count=99, bytes_used=999, state=ShardRange.ACTIVE) + sr_2 = ShardRange( + '%s/%s2' % (root_a, root_c), Timestamp.now(), lower='m', upper='', + object_count=21, bytes_used=1000, state=ShardRange.ACTIVE) + broker.merge_shard_ranges([sr_1, sr_2]) + self.assertEqual(1, broker.get_info()['object_count']) + self.assertEqual(14, broker.get_info()['bytes_used']) + return broker + + @with_tempdir + def test_object_stats_root_container(self, tempdir): + broker = self._check_object_stats_when_sharded( + 'a', 'c', 'a', 'c', tempdir) + self.assertTrue(broker.is_root_container()) # sanity + self.assertTrue(broker.set_sharded_state()) + self.assertEqual(120, broker.get_info()['object_count']) + self.assertEqual(1999, broker.get_info()['bytes_used']) + + @with_tempdir + def test_object_stats_shard_container(self, tempdir): + broker = 
self._check_object_stats_when_sharded( + '.shard_a', 'c-blah', 'a', 'c', tempdir) + self.assertFalse(broker.is_root_container()) # sanity + self.assertTrue(broker.set_sharded_state()) + self.assertEqual(0, broker.get_info()['object_count']) + self.assertEqual(0, broker.get_info()['bytes_used']) + class TestCommonContainerBroker(test_db.TestExampleBroker): @@ -2144,6 +4408,8 @@ class ContainerBrokerMigrationMixin(object): ContainerBroker.create_object_table ContainerBroker.create_object_table = \ prespi_create_object_table + self._imported_create_shard_ranges_table = \ + ContainerBroker.create_shard_range_table self._imported_create_container_info_table = \ ContainerBroker.create_container_info_table ContainerBroker.create_container_info_table = \ @@ -2168,6 +4434,8 @@ class ContainerBrokerMigrationMixin(object): self._imported_create_container_info_table ContainerBroker.create_object_table = \ self._imported_create_object_table + ContainerBroker.create_shard_range_table = \ + self._imported_create_shard_ranges_table ContainerBroker.create_policy_stat_table = \ self._imported_create_policy_stat_table @@ -2221,6 +4489,8 @@ class TestContainerBrokerBeforeMetadata(ContainerBrokerMigrationMixin, Tests for ContainerBroker against databases created before the metadata column was added. """ + expected_db_tables = {'outgoing_sync', 'incoming_sync', 'object', + 'sqlite_sequence', 'container_stat', 'shard_range'} def setUp(self): super(TestContainerBrokerBeforeMetadata, self).setUp() @@ -2293,6 +4563,8 @@ class TestContainerBrokerBeforeXSync(ContainerBrokerMigrationMixin, Tests for ContainerBroker against databases created before the x_container_sync_point[12] columns were added. 
""" + expected_db_tables = {'outgoing_sync', 'incoming_sync', 'object', + 'sqlite_sequence', 'container_stat', 'shard_range'} def setUp(self): super(TestContainerBrokerBeforeXSync, self).setUp() @@ -2407,6 +4679,8 @@ class TestContainerBrokerBeforeSPI(ContainerBrokerMigrationMixin, Tests for ContainerBroker against databases created before the storage_policy_index column was added. """ + expected_db_tables = {'outgoing_sync', 'incoming_sync', 'object', + 'sqlite_sequence', 'container_stat', 'shard_range'} def setUp(self): super(TestContainerBrokerBeforeSPI, self).setUp() @@ -2611,6 +4885,48 @@ class TestContainerBrokerBeforeSPI(ContainerBrokerMigrationMixin, self.assertEqual(info['bytes_used'], 456) +class TestContainerBrokerBeforeShardRanges(ContainerBrokerMigrationMixin, + TestContainerBroker): + """ + Tests for ContainerBroker against databases created + before the shard_ranges table was added. + """ + expected_db_tables = {'outgoing_sync', 'incoming_sync', 'object', + 'sqlite_sequence', 'container_stat'} + + class Override(object): + def __init__(self, func): + self.func = func + + def __get__(self, obj, obj_type): + if inspect.stack()[1][3] == '_initialize': + return lambda *a, **kw: None + return self.func.__get__(obj, obj_type) + + def setUp(self): + super(TestContainerBrokerBeforeShardRanges, self).setUp() + ContainerBroker.create_shard_range_table = self.Override( + ContainerBroker.create_shard_range_table) + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + exc = None + with broker.get() as conn: + try: + conn.execute('''SELECT * + FROM shard_range''') + except BaseException as err: + exc = err + self.assertTrue('no such table: shard_range' in str(exc)) + + def tearDown(self): + super(TestContainerBrokerBeforeShardRanges, self).tearDown() + broker = ContainerBroker(':memory:', account='a', container='c') + broker.initialize(Timestamp('1').internal, 0) + with broker.get() as conn: + 
conn.execute('''SELECT * + FROM shard_range''') + + class TestUpdateNewItemFromExisting(unittest.TestCase): # TODO: add test scenarios that have swift_bytes in content_type t0 = '1234567890.00000' diff --git a/test/unit/container/test_replicator.py b/test/unit/container/test_replicator.py index ff63a2992c..23f06ddc97 100644 --- a/test/unit/container/test_replicator.py +++ b/test/unit/container/test_replicator.py @@ -26,13 +26,17 @@ from swift.common import db_replicator from swift.container import replicator, backend, server, sync_store from swift.container.reconciler import ( MISPLACED_OBJECTS_ACCOUNT, get_reconciler_container_name) -from swift.common.utils import Timestamp, encode_timestamps +from swift.common.utils import Timestamp, encode_timestamps, ShardRange, \ + get_db_files, make_db_file_path from swift.common.storage_policy import POLICIES from test.unit.common import test_db_replicator -from test.unit import patch_policies, make_timestamp_iter, mock_check_drive +from test.unit import patch_policies, make_timestamp_iter, mock_check_drive, \ + debug_logger from contextlib import contextmanager +from test.unit.common.test_db_replicator import attach_fake_replication_rpc + @patch_policies class TestReplicatorSync(test_db_replicator.TestReplicatorSync): @@ -42,6 +46,16 @@ class TestReplicatorSync(test_db_replicator.TestReplicatorSync): replicator_daemon = replicator.ContainerReplicator replicator_rpc = replicator.ContainerReplicatorRpc + def assertShardRangesEqual(self, x, y): + # ShardRange.__eq__ only compares lower and upper; here we generate + # dict representations to compare all attributes + self.assertEqual([dict(sr) for sr in x], [dict(sr) for sr in y]) + + def assertShardRangesNotEqual(self, x, y): + # ShardRange.__eq__ only compares lower and upper; here we generate + # dict representations to compare all attributes + self.assertNotEqual([dict(sr) for sr in x], [dict(sr) for sr in y]) + def test_report_up_to_date(self): broker = 
self._get_broker('a', 'c', node_index=0) broker.initialize(Timestamp(1).internal, int(POLICIES.default)) @@ -1148,6 +1162,1037 @@ class TestReplicatorSync(test_db_replicator.TestReplicatorSync): self.assertEqual(1, mock_remove.call_count) self.assertEqual(broker_2.db_file, mock_remove.call_args[0][0].db_file) + def test_cleanup_post_replicate(self): + broker = self._get_broker('a', 'c', node_index=0) + put_timestamp = Timestamp.now() + broker.initialize(put_timestamp.internal, POLICIES.default.idx) + orig_info = broker.get_replication_info() + daemon = replicator.ContainerReplicator({}, logger=self.logger) + + # db should not be here, replication ok, deleted + res = daemon.cleanup_post_replicate(broker, orig_info, [True] * 3) + self.assertTrue(res) + self.assertFalse(os.path.exists(broker.db_file)) + self.assertEqual(['Successfully deleted db %s' % broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + # failed replication, not deleted + broker.initialize(put_timestamp.internal, POLICIES.default.idx) + orig_info = broker.get_replication_info() + res = daemon.cleanup_post_replicate(broker, orig_info, + [False, True, True]) + self.assertTrue(res) + self.assertTrue(os.path.exists(broker.db_file)) + self.assertEqual(['Not deleting db %s (2/3 success)' % broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + # db has shard ranges, not deleted + broker.enable_sharding(Timestamp.now()) + broker.merge_shard_ranges( + [ShardRange('.shards_a/c', Timestamp.now(), '', 'm')]) + self.assertTrue(broker.sharding_required()) # sanity check + res = daemon.cleanup_post_replicate(broker, orig_info, [True] * 3) + self.assertTrue(res) + self.assertTrue(os.path.exists(broker.db_file)) + self.assertEqual( + ['Not deleting db %s (requires sharding, state unsharded)' % + broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + # db sharding, not deleted + 
self._goto_sharding_state(broker, Timestamp.now()) + self.assertTrue(broker.sharding_required()) # sanity check + orig_info = broker.get_replication_info() + res = daemon.cleanup_post_replicate(broker, orig_info, [True] * 3) + self.assertTrue(res) + self.assertTrue(os.path.exists(broker.db_file)) + self.assertEqual( + ['Not deleting db %s (requires sharding, state sharding)' % + broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + # db sharded, should not be here, failed replication, not deleted + self._goto_sharded_state(broker) + self.assertFalse(broker.sharding_required()) # sanity check + res = daemon.cleanup_post_replicate(broker, orig_info, + [True, False, True]) + self.assertTrue(res) + self.assertTrue(os.path.exists(broker.db_file)) + self.assertEqual(['Not deleting db %s (2/3 success)' % + broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + # db sharded, should not be here, new shard ranges (e.g. from reverse + # replication), deleted + broker.merge_shard_ranges( + [ShardRange('.shards_a/c', Timestamp.now(), '', 'm')]) + res = daemon.cleanup_post_replicate(broker, orig_info, [True] * 3) + self.assertTrue(res) + self.assertFalse(os.path.exists(broker.db_file)) + daemon.logger.clear() + + # db sharded, should not be here, replication ok, deleted + broker.initialize(put_timestamp.internal, POLICIES.default.idx) + self.assertTrue(os.path.exists(broker.db_file)) + orig_info = broker.get_replication_info() + res = daemon.cleanup_post_replicate(broker, orig_info, [True] * 3) + self.assertTrue(res) + self.assertFalse(os.path.exists(broker.db_file)) + self.assertEqual(['Successfully deleted db %s' % broker.db_file], + daemon.logger.get_lines_for_level('debug')) + daemon.logger.clear() + + def test_sync_shard_ranges(self): + put_timestamp = Timestamp.now().internal + # create "local" broker + broker = self._get_broker('a', 'c', node_index=0) + broker.initialize(put_timestamp, 
POLICIES.default.idx) + # create "remote" broker + remote_broker = self._get_broker('a', 'c', node_index=1) + remote_broker.initialize(put_timestamp, POLICIES.default.idx) + + def check_replicate(expected_shard_ranges, from_broker, to_broker): + daemon = replicator.ContainerReplicator({}) + part, node = self._get_broker_part_node(to_broker) + info = broker.get_replication_info() + success = daemon._repl_to_node(node, from_broker, part, info) + self.assertTrue(success) + self.assertEqual( + expected_shard_ranges, + to_broker.get_all_shard_range_data() + ) + self.assertEqual(1, daemon.stats['deferred']) + self.assertEqual(0, daemon.stats['rsync']) + self.assertEqual(0, daemon.stats['diff']) + local_info = self._get_broker( + 'a', 'c', node_index=0).get_info() + remote_info = self._get_broker( + 'a', 'c', node_index=1).get_info() + for k, v in local_info.items(): + if k == 'id': + continue + self.assertEqual(remote_info[k], v, + "mismatch remote %s %r != %r" % ( + k, remote_info[k], v)) + + bounds = (('', 'g'), ('g', 'r'), ('r', '')) + shard_ranges = [ + ShardRange('.shards_a/sr-%s' % upper, Timestamp.now(), lower, + upper, i + 1, 10 * (i + 1)) + for i, (lower, upper) in enumerate(bounds) + ] + # add first two shard_ranges to both brokers + for shard_range in shard_ranges[:2]: + for db in (broker, remote_broker): + db.merge_shard_ranges(shard_range) + # now add a shard range to the "local" broker only + own_sr = broker.enable_sharding(Timestamp.now()) + broker.merge_shard_ranges(shard_ranges[2]) + broker_ranges = broker.get_all_shard_range_data() + self.assertShardRangesEqual(shard_ranges + [own_sr], broker_ranges) + check_replicate(broker_ranges, broker, remote_broker) + + # update one shard range + shard_ranges[1].update_meta(99, 0) + broker.merge_shard_ranges(shard_ranges[1]) + # sanity check + broker_ranges = broker.get_all_shard_range_data() + self.assertShardRangesEqual(shard_ranges + [own_sr], broker_ranges) + check_replicate(broker_ranges, broker, 
remote_broker) + + # delete one shard range + shard_ranges[0].deleted = 1 + shard_ranges[0].timestamp = Timestamp.now() + broker.merge_shard_ranges(shard_ranges[0]) + # sanity check + broker_ranges = broker.get_all_shard_range_data() + self.assertShardRangesEqual(shard_ranges + [own_sr], broker_ranges) + check_replicate(broker_ranges, broker, remote_broker) + + # put a shard range again + shard_ranges[2].timestamp = Timestamp.now() + shard_ranges[2].object_count = 0 + broker.merge_shard_ranges(shard_ranges[2]) + # sanity check + broker_ranges = broker.get_all_shard_range_data() + self.assertShardRangesEqual(shard_ranges + [own_sr], broker_ranges) + check_replicate(broker_ranges, broker, remote_broker) + + # update same shard range on local and remote, remote later + shard_ranges[-1].meta_timestamp = Timestamp.now() + shard_ranges[-1].bytes_used += 1000 + broker.merge_shard_ranges(shard_ranges[-1]) + remote_shard_ranges = remote_broker.get_shard_ranges( + include_deleted=True) + remote_shard_ranges[-1].meta_timestamp = Timestamp.now() + remote_shard_ranges[-1].bytes_used += 2000 + remote_broker.merge_shard_ranges(remote_shard_ranges[-1]) + # sanity check + remote_broker_ranges = remote_broker.get_all_shard_range_data() + self.assertShardRangesEqual(remote_shard_ranges + [own_sr], + remote_broker_ranges) + self.assertShardRangesNotEqual(shard_ranges, remote_shard_ranges) + check_replicate(remote_broker_ranges, broker, remote_broker) + + # undelete shard range *on the remote* + deleted_ranges = [sr for sr in remote_shard_ranges if sr.deleted] + self.assertEqual([shard_ranges[0]], deleted_ranges) + deleted_ranges[0].deleted = 0 + deleted_ranges[0].timestamp = Timestamp.now() + remote_broker.merge_shard_ranges(deleted_ranges[0]) + # sanity check + remote_broker_ranges = remote_broker.get_all_shard_range_data() + self.assertShardRangesEqual(remote_shard_ranges + [own_sr], + remote_broker_ranges) + self.assertShardRangesNotEqual(shard_ranges, remote_shard_ranges) + 
check_replicate(remote_broker_ranges, broker, remote_broker) + + # reverse replication direction and expect syncs to propagate + check_replicate(remote_broker_ranges, remote_broker, broker) + + def test_sync_shard_ranges_with_rsync(self): + broker = self._get_broker('a', 'c', node_index=0) + put_timestamp = time.time() + broker.initialize(put_timestamp, POLICIES.default.idx) + + bounds = (('', 'g'), ('g', 'r'), ('r', '')) + shard_ranges = [ + ShardRange('.shards_a/sr-%s' % upper, Timestamp.now(), lower, + upper, i + 1, 10 * (i + 1)) + for i, (lower, upper) in enumerate(bounds) + ] + # add first shard range + own_sr = broker.enable_sharding(Timestamp.now()) + broker.merge_shard_ranges(shard_ranges[:1]) + + # "replicate" + part, node = self._get_broker_part_node(broker) + daemon = self._run_once(node) + self.assertEqual(2, daemon.stats['rsync']) + + # complete rsync to all other nodes + def check_replicate(expected_ranges): + for i in range(1, 3): + remote_broker = self._get_broker('a', 'c', node_index=i) + self.assertTrue(os.path.exists(remote_broker.db_file)) + self.assertShardRangesEqual( + expected_ranges, + remote_broker.get_shard_ranges(include_deleted=True, + include_own=True) + ) + remote_info = remote_broker.get_info() + local_info = self._get_broker( + 'a', 'c', node_index=0).get_info() + for k, v in local_info.items(): + if k == 'id': + continue + if k == 'hash': + self.assertEqual(remote_info[k], '0' * 32) + continue + if k == 'object_count': + self.assertEqual(remote_info[k], 0) + continue + self.assertEqual(remote_info[k], v, + "mismatch remote %s %r != %r" % ( + k, remote_info[k], v)) + + check_replicate([shard_ranges[0], own_sr]) + + # delete and add some more shard ranges + shard_ranges[0].deleted = 1 + shard_ranges[0].timestamp = Timestamp.now() + for shard_range in shard_ranges: + broker.merge_shard_ranges(shard_range) + daemon = self._run_once(node) + self.assertEqual(2, daemon.stats['deferred']) + check_replicate(shard_ranges + [own_sr]) + + def 
check_replicate(self, from_broker, remote_node_index, repl_conf=None, + expect_success=True, errors=None): + repl_conf = repl_conf or {} + repl_calls = [] + rsync_calls = [] + + def repl_hook(op, *sync_args): + repl_calls.append((op, sync_args)) + + fake_repl_connection = attach_fake_replication_rpc( + self.rpc, replicate_hook=repl_hook, errors=errors) + db_replicator.ReplConnection = fake_repl_connection + daemon = replicator.ContainerReplicator( + repl_conf, logger=debug_logger()) + self._install_fake_rsync_file(daemon, rsync_calls) + part, nodes = self._ring.get_nodes(from_broker.account, + from_broker.container) + + def find_node(node_index): + for node in nodes: + if node['index'] == node_index: + return node + else: + self.fail('Failed to find node index %s' % remote_node_index) + + remote_node = find_node(remote_node_index) + info = from_broker.get_replication_info() + success = daemon._repl_to_node(remote_node, from_broker, part, info) + self.assertEqual(expect_success, success) + return daemon, repl_calls, rsync_calls + + def assert_synced_shard_ranges(self, expected, synced_items): + expected.sort(key=lambda sr: (sr.lower, sr.upper)) + for item in synced_items: + item.pop('record_type', None) + self.assertEqual([dict(ex) for ex in expected], synced_items) + + def assert_info_synced(self, local, remote_node_index, mismatches=None): + mismatches = mismatches or [] + mismatches.append('id') + remote = self._get_broker(local.account, local.container, + node_index=remote_node_index) + local_info = local.get_info() + remote_info = remote.get_info() + errors = [] + for k, v in local_info.items(): + if remote_info.get(k) == v: + if k in mismatches: + errors.append( + "unexpected match remote %s %r == %r" % ( + k, remote_info[k], v)) + continue + else: + if k not in mismatches: + errors.append( + "unexpected mismatch remote %s %r != %r" % ( + k, remote_info[k], v)) + if errors: + self.fail('Found sync errors:\n' + '\n'.join(errors)) + + def 
assert_shard_ranges_synced(self, local_broker, remote_broker): + self.assertShardRangesEqual( + local_broker.get_shard_ranges(include_deleted=True, + include_own=True), + remote_broker.get_shard_ranges(include_deleted=True, + include_own=True) + ) + + def _setup_replication_test(self, node_index): + ts_iter = make_timestamp_iter() + policy_idx = POLICIES.default.idx + put_timestamp = Timestamp.now().internal + # create "local" broker + broker = self._get_broker('a', 'c', node_index=node_index) + broker.initialize(put_timestamp, policy_idx) + + objs = [{'name': 'blah%03d' % i, 'created_at': next(ts_iter).internal, + 'size': i, 'content_type': 'text/plain', 'etag': 'etag%s' % i, + 'deleted': 0, 'storage_policy_index': policy_idx} + for i in range(20)] + bounds = (('', 'a'), ('a', 'b'), ('b', 'c'), ('c', '')) + shard_ranges = [ + ShardRange( + '.sharded_a/sr-%s' % upper, Timestamp.now(), lower, upper) + for i, (lower, upper) in enumerate(bounds) + ] + return {'broker': broker, + 'objects': objs, + 'shard_ranges': shard_ranges} + + def _merge_object(self, broker, objects, index, **kwargs): + if not isinstance(index, slice): + index = slice(index, index + 1) + objs = [dict(obj) for obj in objects[index]] + broker.merge_items(objs) + + def _merge_shard_range(self, broker, shard_ranges, index, **kwargs): + broker.merge_shard_ranges(shard_ranges[index:index + 1]) + + def _goto_sharding_state(self, broker, epoch): + broker.enable_sharding(epoch) + self.assertTrue(broker.set_sharding_state()) + self.assertEqual(backend.SHARDING, broker.get_db_state()) + + def _goto_sharded_state(self, broker): + self.assertTrue(broker.set_sharded_state()) + self.assertEqual(backend.SHARDED, broker.get_db_state()) + + def _assert_local_sharded_in_sync(self, local_broker, local_id): + daemon, repl_calls, rsync_calls = self.check_replicate(local_broker, 1) + self.assertEqual(['sync', 'get_shard_ranges', 'merge_shard_ranges'], + [call[0] for call in repl_calls]) + self.assertEqual(1, 
daemon.stats['deferred']) + self.assertEqual(0, daemon.stats['rsync']) + self.assertEqual(0, daemon.stats['diff']) + self.assertFalse(rsync_calls) + # new db sync + self.assertEqual(local_id, repl_calls[0][1][2]) + # ...but we still get a merge_shard_ranges for shard ranges + self.assert_synced_shard_ranges( + local_broker.get_shard_ranges(include_own=True), + repl_calls[2][1][0]) + self.assertEqual(local_id, repl_calls[2][1][1]) + + def _check_only_shard_ranges_replicated(self, local_broker, + remote_node_index, + repl_conf, + expected_shard_ranges, + expect_success=True): + # expected_shard_ranges is expected final list of sync'd ranges + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, remote_node_index, repl_conf, + expect_success=expect_success) + + # we always expect only shard ranges to end in abort + self.assertEqual(1, daemon.stats['deferred']) + self.assertEqual(0, daemon.stats['diff']) + self.assertEqual(0, daemon.stats['rsync']) + self.assertEqual(['sync', 'get_shard_ranges', 'merge_shard_ranges'], + [call[0] for call in repl_calls]) + self.assertFalse(rsync_calls) + # sync + local_id = local_broker.get_info()['id'] + self.assertEqual(local_id, repl_calls[0][1][2]) + # get_shard_ranges + self.assertEqual((), repl_calls[1][1]) + # merge_shard_ranges for sending local shard ranges + self.assertShardRangesEqual(expected_shard_ranges, repl_calls[2][1][0]) + self.assertEqual(local_id, repl_calls[2][1][1]) + remote_broker = self._get_broker( + local_broker.account, local_broker.container, node_index=1) + self.assertNotEqual(local_id, remote_broker.get_info()['id']) + self.assert_shard_ranges_synced(remote_broker, local_broker) + + def test_replication_local_unsharded_remote_missing(self): + context = self._setup_replication_test(0) + local_broker = context['broker'] + local_id = local_broker.get_info()['id'] + objs = context['objects'] + self._merge_object(index=0, **context) + + daemon, repl_calls, rsync_calls = 
self.check_replicate(local_broker, 1) + + self.assert_info_synced(local_broker, 1) + self.assertEqual(1, daemon.stats['rsync']) + self.assertEqual(['sync', 'complete_rsync'], + [call[0] for call in repl_calls]) + self.assertEqual(local_id, repl_calls[1][1][0]) + self.assertEqual(os.path.basename(local_broker.db_file), + repl_calls[1][1][1]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(local_id, os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + remote_broker = self._get_broker('a', 'c', node_index=1) + self.assert_shard_ranges_synced(local_broker, remote_broker) + self.assertTrue(os.path.exists(remote_broker._db_file)) + self.assertNotEqual(local_id, remote_broker.get_info()['id']) + self.assertEqual(objs[:1], remote_broker.get_objects()) + + def _check_replication_local_unsharded_remote_sharded(self, repl_conf): + context = self._setup_replication_test(0) + local_broker = context['broker'] + local_id = local_broker.get_info()['id'] + self._merge_object(index=slice(0, 6), **context) + + remote_context = self._setup_replication_test(1) + self._merge_object(index=4, **remote_context) + remote_broker = remote_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(remote_broker, epoch=epoch) + remote_context['shard_ranges'][0].object_count = 101 + remote_context['shard_ranges'][0].bytes_used = 1010 + remote_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **remote_context) + self._merge_object(index=5, **remote_context) + self._goto_sharded_state(remote_broker) + self.assertEqual(backend.SHARDED, remote_broker.get_db_state()) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + remote_broker.get_shard_ranges(include_own=True)) + + remote_broker = self._get_broker( + local_broker.account, local_broker.container, node_index=1) + self.assertEqual(backend.SHARDED, remote_broker.get_db_state()) + 
self.assertFalse(os.path.exists(remote_broker._db_file)) + self.assertNotEqual(local_id, remote_broker.get_info()['id']) + self.assertEqual(remote_context['objects'][5:6], + remote_broker.get_objects()) + + # Now that we have shard ranges, we're never considered in-sync :-/ + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + remote_broker.get_shard_ranges(include_own=True)) + + def test_replication_local_unsharded_remote_sharded(self): + self._check_replication_local_unsharded_remote_sharded({}) + + def test_replication_local_unsharded_remote_sharded_large_diff(self): + self._check_replication_local_unsharded_remote_sharded({'per_diff': 1}) + + def _check_replication_local_sharding_remote_missing(self, repl_conf): + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + self._merge_object(index=0, **local_context) + self._merge_object(index=1, **local_context) + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(2, 8), **local_context) + objs = local_context['objects'] + + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, 1, repl_conf=repl_conf) + + self.assertEqual(['sync', 'complete_rsync'], + [call[0] for call in repl_calls]) + self.assertEqual(1, daemon.stats['rsync']) + self.assertEqual(0, daemon.stats['deferred']) + self.assertEqual(0, daemon.stats['diff']) + + # fresh db is sync'd first... 
+ fresh_id = local_broker.get_info()['id'] + self.assertEqual(fresh_id, repl_calls[0][1][2]) + self.assertEqual(fresh_id, repl_calls[1][1][0]) + # retired db is not sync'd at all + old_broker = self.backend( + local_broker._db_file, account=local_broker.account, + container=local_broker.container, force_db_file=True) + old_id = old_broker.get_info()['id'] + bad_calls = [] + for call in repl_calls: + if old_id in call[1]: + bad_calls.append( + 'old db id %r in %r call args %r' % ( + old_id, call[0], call[1])) + if bad_calls: + self.fail('Found some bad calls:\n' + '\n'.join(bad_calls)) + # complete_rsync + self.assertEqual(os.path.basename(local_broker.db_file), + repl_calls[1][1][1]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(fresh_id, os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + + # TODO: make these stats better; in sharding state local broker pulls + # stats for 2 objects from old db, whereas remote thinks it's sharded + # and has an empty shard range table + self.assert_info_synced(local_broker, 1, mismatches=[ + 'object_count', 'bytes_used', 'db_state']) + + remote_broker = self._get_broker('a', 'c', node_index=1) + remote_id = remote_broker.get_info()['id'] + self.assertNotEqual(old_id, remote_id) + self.assertNotEqual(fresh_id, remote_id) + self.assertEqual( + [remote_broker.db_file], get_db_files(remote_broker.db_file)) + self.assertEqual(os.path.basename(remote_broker.db_file), + os.path.basename(local_broker.db_file)) + self.assertEqual(epoch, remote_broker.db_epoch) + # remote db has only the misplaced objects + self.assertEqual(objs[2:8], remote_broker.get_objects()) + self.assert_shard_ranges_synced(local_broker, remote_broker) + + # replicate again, check asserts abort + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True)) + + # sanity + remote_broker = self._get_broker('a', 'c', node_index=1) + self.assertEqual( + 
[remote_broker.db_file], get_db_files(remote_broker.db_file)) + self.assertEqual(os.path.basename(remote_broker.db_file), + os.path.basename(local_broker.db_file)) + self.assertEqual(objs[2:8], remote_broker.get_objects()) + self.assertEqual(epoch, remote_broker.db_epoch) + + def test_replication_local_sharding_remote_missing(self): + self._check_replication_local_sharding_remote_missing({}) + + def test_replication_local_sharding_remote_missing_large_diff(self): + # the local shard db has large diff with respect to the old db + self._check_replication_local_sharding_remote_missing({'per_diff': 1}) + + def _check_replication_local_sharding_remote_unsharded(self, repl_conf): + local_context = self._setup_replication_test(0) + self._merge_object(index=slice(0, 3), **local_context) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(3, 11), **local_context) + + remote_context = self._setup_replication_test(1) + self._merge_object(index=11, **remote_context) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True)) + + remote_broker = self._get_broker('a', 'c', node_index=1) + self.assertEqual( + [remote_broker._db_file], get_db_files(remote_broker.db_file)) + self.assertEqual(remote_context['objects'][11:12], + remote_broker.get_objects()) + + self.assert_info_synced( + local_broker, 1, + mismatches=['db_state', 'object_count', 'bytes_used', + 'status_changed_at', 'hash']) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True)) + + def test_replication_local_sharding_remote_unsharded(self): + self._check_replication_local_sharding_remote_unsharded({}) + + def test_replication_local_sharding_remote_unsharded_large_diff(self): + self._check_replication_local_sharding_remote_unsharded( + 
{'per_diff': 1}) + + def _check_replication_local_sharding_remote_sharding(self, repl_conf): + local_context = self._setup_replication_test(0) + self._merge_object(index=slice(0, 5), **local_context) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(5, 10), **local_context) + + remote_context = self._setup_replication_test(1) + self._merge_object(index=12, **remote_context) + # take snapshot of info now before transition to sharding... + orig_remote_info = remote_context['broker'].get_info() + remote_broker = remote_context['broker'] + self._goto_sharding_state(remote_broker, epoch) + self._merge_shard_range(index=0, **remote_context) + self._merge_object(index=13, **remote_context) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + remote_broker.get_shard_ranges(include_own=True)) + + # in sharding state brokers only reports object stats from old db, and + # they are different + self.assert_info_synced( + local_broker, 1, mismatches=['object_count', 'bytes_used', + 'status_changed_at', 'hash']) + + remote_broker = self._get_broker('a', 'c', node_index=1) + shard_db = make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([remote_broker._db_file, shard_db], + get_db_files(remote_broker.db_file)) + shard_db = make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([remote_broker._db_file, shard_db], + get_db_files(remote_broker.db_file)) + # no local objects have been sync'd to remote shard db + self.assertEqual(remote_context['objects'][13:14], + remote_broker.get_objects()) + # remote *old db* is unchanged + remote_old_broker = self.backend( + remote_broker._db_file, account=remote_broker.account, + container=remote_broker.container, force_db_file=True) + self.assertEqual(remote_context['objects'][12:13], + remote_old_broker.get_objects()) + 
self.assertFalse(remote_old_broker.get_shard_ranges()) + remote_old_info = remote_old_broker.get_info() + orig_remote_info.pop('db_state') + remote_old_info.pop('db_state') + self.assertEqual(orig_remote_info, remote_old_info) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True)) + + def test_replication_local_sharding_remote_sharding(self): + self._check_replication_local_sharding_remote_sharding({}) + + def test_replication_local_sharding_remote_sharding_large_diff(self): + self._check_replication_local_sharding_remote_sharding({'per_diff': 1}) + + def test_replication_local_sharded_remote_missing(self): + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + local_context['shard_ranges'][0].object_count = 99 + local_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(0, 3), **local_context) + self._goto_sharded_state(local_broker) + objs = local_context['objects'] + + daemon, repl_calls, rsync_calls = self.check_replicate(local_broker, 1) + + self.assertEqual(['sync', 'complete_rsync'], + [call[0] for call in repl_calls]) + self.assertEqual(1, daemon.stats['rsync']) + + # sync + local_id = local_broker.get_info()['id'] + self.assertEqual(local_id, repl_calls[0][1][2]) + # complete_rsync + self.assertEqual(local_id, repl_calls[1][1][0]) + self.assertEqual( + os.path.basename(local_broker.db_file), repl_calls[1][1][1]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(local_id, os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + + self.assert_info_synced(local_broker, 1) + + remote_broker = self._get_broker('a', 'c', node_index=1) + remote_id = remote_broker.get_info()['id'] + self.assertNotEqual(local_id, remote_id) + shard_db = 
make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([shard_db], + get_db_files(remote_broker.db_file)) + self.assertEqual(objs[:3], remote_broker.get_objects()) + self.assertEqual(local_broker.get_shard_ranges(), + remote_broker.get_shard_ranges()) + + # sanity check - in sync + self._assert_local_sharded_in_sync(local_broker, local_id) + + remote_broker = self._get_broker('a', 'c', node_index=1) + shard_db = make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([shard_db], + get_db_files(remote_broker.db_file)) + # the remote broker object_count comes from replicated shard range... + self.assertEqual(99, remote_broker.get_info()['object_count']) + # these are replicated misplaced objects... + self.assertEqual(objs[:3], remote_broker.get_objects()) + self.assertEqual(local_broker.get_shard_ranges(), + remote_broker.get_shard_ranges()) + + def _check_replication_local_sharded_remote_unsharded(self, repl_conf): + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + local_context['shard_ranges'][0].object_count = 99 + local_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(0, 3), **local_context) + self._goto_sharded_state(local_broker) + + remote_context = self._setup_replication_test(1) + self._merge_object(index=4, **remote_context) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True), + expect_success=True) + + # sharded broker takes object count from shard range whereas remote + # unsharded broker takes it from object table + self.assert_info_synced( + local_broker, 1, + mismatches=['db_state', 'object_count', 'bytes_used', + 'status_changed_at', 'hash']) + + remote_broker = self._get_broker('a', 'c', node_index=1) + self.assertEqual([remote_broker._db_file], + 
get_db_files(remote_broker.db_file)) + self.assertEqual(remote_context['objects'][4:5], + remote_broker.get_objects()) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + local_broker.get_shard_ranges(include_own=True), + expect_success=True) + + remote_broker = self._get_broker('a', 'c', node_index=1) + self.assertEqual([remote_broker._db_file], + get_db_files(remote_broker.db_file)) + self.assertEqual(remote_context['objects'][4:5], + remote_broker.get_objects()) + + def test_replication_local_sharded_remote_unsharded(self): + self._check_replication_local_sharded_remote_unsharded({}) + + def test_replication_local_sharded_remote_unsharded_large_diff(self): + self._check_replication_local_sharded_remote_unsharded({'per_diff': 1}) + + def _check_replication_local_sharded_remote_sharding(self, repl_conf): + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch=epoch) + local_context['shard_ranges'][0].object_count = 99 + local_context['shard_ranges'][0].bytes_used = 999 + local_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(0, 5), **local_context) + self._goto_sharded_state(local_broker) + + remote_context = self._setup_replication_test(1) + self._merge_object(index=6, **remote_context) + remote_broker = remote_context['broker'] + remote_info_orig = remote_broker.get_info() + self._goto_sharding_state(remote_broker, epoch=epoch) + self._merge_shard_range(index=0, **remote_context) + self._merge_object(index=7, **remote_context) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + # remote has newer timestamp for shard range + remote_broker.get_shard_ranges(include_own=True), + expect_success=True) + + # sharded broker takes object count from shard range whereas remote + # sharding broker takes it from object table + 
self.assert_info_synced( + local_broker, 1, + mismatches=['db_state', 'object_count', 'bytes_used', + 'status_changed_at', 'hash']) + + remote_broker = self._get_broker('a', 'c', node_index=1) + shard_db = make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([remote_broker._db_file, shard_db], + get_db_files(remote_broker.db_file)) + # remote fresh db objects are unchanged + self.assertEqual(remote_context['objects'][7:8], + remote_broker.get_objects()) + # remote old hash.db objects are unchanged + remote_old_broker = self.backend( + remote_broker._db_file, account=remote_broker.account, + container=remote_broker.container, force_db_file=True) + self.assertEqual( + remote_context['objects'][6:7], + remote_old_broker.get_objects()) + remote_info = remote_old_broker.get_info() + remote_info_orig.pop('db_state') + remote_info.pop('db_state') + self.assertEqual(remote_info_orig, remote_info) + self.assertEqual(local_broker.get_shard_ranges(), + remote_broker.get_shard_ranges()) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + remote_broker.get_shard_ranges(include_own=True), + expect_success=True) + + def test_replication_local_sharded_remote_sharding(self): + self._check_replication_local_sharded_remote_sharding({}) + + def test_replication_local_sharded_remote_sharding_large_diff(self): + self._check_replication_local_sharded_remote_sharding({'per_diff': 1}) + + def _check_replication_local_sharded_remote_sharded(self, repl_conf): + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + epoch = Timestamp.now() + self._goto_sharding_state(local_broker, epoch) + local_context['shard_ranges'][0].object_count = 99 + local_context['shard_ranges'][0].bytes_used = 999 + local_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **local_context) + self._merge_object(index=slice(0, 6), **local_context) + self._goto_sharded_state(local_broker) + + remote_context 
= self._setup_replication_test(1) + self._merge_object(index=6, **remote_context) + remote_broker = remote_context['broker'] + self._goto_sharding_state(remote_broker, epoch) + remote_context['shard_ranges'][0].object_count = 101 + remote_context['shard_ranges'][0].bytes_used = 1010 + remote_context['shard_ranges'][0].state = ShardRange.ACTIVE + self._merge_shard_range(index=0, **remote_context) + self._merge_object(index=7, **remote_context) + self._goto_sharded_state(remote_broker) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + # remote has newer timestamp for shard range + remote_broker.get_shard_ranges(include_own=True), + expect_success=True) + + self.assert_info_synced( + local_broker, 1, + mismatches=['status_changed_at', 'hash']) + + remote_broker = self._get_broker('a', 'c', node_index=1) + shard_db = make_db_file_path(remote_broker._db_file, epoch) + self.assertEqual([shard_db], + get_db_files(remote_broker.db_file)) + self.assertEqual(remote_context['objects'][7:8], + remote_broker.get_objects()) + # remote shard range was newer than local so object count is not + # updated by sync'd shard range + self.assertEqual( + 101, remote_broker.get_shard_ranges()[0].object_count) + + self._check_only_shard_ranges_replicated( + local_broker, 1, repl_conf, + # remote has newer timestamp for shard range + remote_broker.get_shard_ranges(include_own=True), + expect_success=True) + + def test_replication_local_sharded_remote_sharded(self): + self._check_replication_local_sharded_remote_sharded({}) + + def test_replication_local_sharded_remote_sharded_large_diff(self): + self._check_replication_local_sharded_remote_sharded({'per_diff': 1}) + + def test_replication_rsync_then_merge_aborts_before_merge_sharding(self): + # verify that rsync_then_merge aborts if remote starts sharding during + # the rsync + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + self._merge_object(index=slice(0, 3), 
**local_context) + remote_context = self._setup_replication_test(1) + remote_broker = remote_context['broker'] + remote_broker.logger = debug_logger() + self._merge_object(index=5, **remote_context) + + orig_func = replicator.ContainerReplicatorRpc.rsync_then_merge + + def mock_rsync_then_merge(*args): + remote_broker.merge_shard_ranges( + ShardRange('.shards_a/cc', Timestamp.now())) + self._goto_sharding_state(remote_broker, Timestamp.now()) + return orig_func(*args) + + with mock.patch( + 'swift.container.replicator.ContainerReplicatorRpc.' + 'rsync_then_merge', + mock_rsync_then_merge): + with mock.patch( + 'swift.container.backend.ContainerBroker.' + 'get_items_since') as mock_get_items_since: + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, 1, expect_success=False, + repl_conf={'per_diff': 1}) + + mock_get_items_since.assert_not_called() + self.assertEqual(['sync', 'get_shard_ranges', 'rsync_then_merge'], + [call[0] for call in repl_calls]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(local_broker.get_info()['id'], + os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + + def test_replication_rsync_then_merge_aborts_before_merge_sharded(self): + # verify that rsync_then_merge aborts if remote completes sharding + # during the rsync + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + self._merge_object(index=slice(0, 3), **local_context) + remote_context = self._setup_replication_test(1) + remote_broker = remote_context['broker'] + remote_broker.logger = debug_logger() + self._merge_object(index=5, **remote_context) + + orig_func = replicator.ContainerReplicatorRpc.rsync_then_merge + + def mock_rsync_then_merge(*args): + remote_broker.merge_shard_ranges( + ShardRange('.shards_a/cc', Timestamp.now())) + self._goto_sharding_state(remote_broker, Timestamp.now()) + self._goto_sharded_state(remote_broker) + return orig_func(*args) + + with 
mock.patch( + 'swift.container.replicator.ContainerReplicatorRpc.' + 'rsync_then_merge', + mock_rsync_then_merge): + with mock.patch( + 'swift.container.backend.ContainerBroker.' + 'get_items_since') as mock_get_items_since: + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, 1, expect_success=False, + repl_conf={'per_diff': 1}) + + mock_get_items_since.assert_not_called() + self.assertEqual(['sync', 'get_shard_ranges', 'rsync_then_merge'], + [call[0] for call in repl_calls]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(local_broker.get_info()['id'], + os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + + def test_replication_rsync_then_merge_aborts_after_merge_sharding(self): + # verify that rsync_then_merge aborts if remote starts sharding during + # the merge + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + self._merge_object(index=slice(0, 3), **local_context) + remote_context = self._setup_replication_test(1) + remote_broker = remote_context['broker'] + remote_broker.logger = debug_logger() + self._merge_object(index=5, **remote_context) + + orig_get_items_since = backend.ContainerBroker.get_items_since + calls = [] + + def fake_get_items_since(broker, *args): + # remote starts sharding while rpc call is merging + if not calls: + remote_broker.merge_shard_ranges( + ShardRange('.shards_a/cc', Timestamp.now())) + self._goto_sharding_state(remote_broker, Timestamp.now()) + calls.append(args) + return orig_get_items_since(broker, *args) + + with mock.patch( + 'swift.container.backend.ContainerBroker.get_items_since', + fake_get_items_since): + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, 1, expect_success=False, + repl_conf={'per_diff': 1}) + + self.assertEqual(['sync', 'get_shard_ranges', 'rsync_then_merge'], + [call[0] for call in repl_calls]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + 
self.assertEqual(local_broker.get_info()['id'], + os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + + def test_replication_rsync_then_merge_aborts_after_merge_sharded(self): + # verify that rsync_then_merge aborts if remote completes sharding + # during the merge + local_context = self._setup_replication_test(0) + local_broker = local_context['broker'] + self._merge_object(index=slice(0, 3), **local_context) + remote_context = self._setup_replication_test(1) + remote_broker = remote_context['broker'] + remote_broker.logger = debug_logger() + self._merge_object(index=5, **remote_context) + + orig_get_items_since = backend.ContainerBroker.get_items_since + calls = [] + + def fake_get_items_since(broker, *args): + # remote starts sharding while rpc call is merging + result = orig_get_items_since(broker, *args) + if calls: + remote_broker.merge_shard_ranges( + ShardRange('.shards_a/cc', Timestamp.now())) + self._goto_sharding_state(remote_broker, Timestamp.now()) + self._goto_sharded_state(remote_broker) + calls.append(args) + return result + + with mock.patch( + 'swift.container.backend.ContainerBroker.get_items_since', + fake_get_items_since): + daemon, repl_calls, rsync_calls = self.check_replicate( + local_broker, 1, expect_success=False, + repl_conf={'per_diff': 1}) + + self.assertEqual(['sync', 'get_shard_ranges', 'rsync_then_merge'], + [call[0] for call in repl_calls]) + self.assertEqual(local_broker.db_file, rsync_calls[0][0]) + self.assertEqual(local_broker.get_info()['id'], + os.path.basename(rsync_calls[0][1])) + self.assertFalse(rsync_calls[1:]) + if __name__ == '__main__': unittest.main() diff --git a/test/unit/container/test_server.py b/test/unit/container/test_server.py index 8327e8754b..0e4201b08a 100644 --- a/test/unit/container/test_server.py +++ b/test/unit/container/test_server.py @@ -1460,7 +1460,7 @@ class TestContainerController(unittest.TestCase): self.assertEqual(True, db.is_deleted()) # now save a copy of this db (and 
remove it from the "current node") db = self.controller._get_container_broker('sda1', 'p', 'a', 'c') - db_path = db.db_file + db_path = db._db_file other_path = os.path.join(self.testdir, 'othernode.db') os.rename(db_path, other_path) # that should make it missing on this node @@ -1474,6 +1474,8 @@ class TestContainerController(unittest.TestCase): def mock_exists(db_path): rv = _real_exists(db_path) + if db_path != db._db_file: + return rv if not mock_called: # be as careful as we might hope backend replication can be... with lock_parent_directory(db_path, timeout=1): From 723eac907c53cc4082703acf63ef77683d698acc Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Tue, 1 May 2018 16:21:03 +0100 Subject: [PATCH 5/9] Add shard range support to container server Support PUTs to container server with json serialized ShardRanges in body. Shard range PUTs may autocreate containers. Support GET of shard ranges from container server. Shard range GETs support X-Backend-Include-Deleted to include deleted shard ranges in list and X-Backend-Override-Deleted to get shard ranges when container has been marked as deleted. The X-Backend-Record-Type = ['object'|'shard'|'auto'] is introduced to differentiate container server requests for object versus shard ranges. When 'auto' is used with a GET request the container server will return whichever record type is appropriate for fetching object listings, depending on whether the container is sharded or not.
Support container PUTs with body in direct_client.py Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Change-Id: I029782ae348f38c5fb76d2759609f67a06c883ef --- swift/common/direct_client.py | 150 ++-- swift/container/server.py | 102 ++- swift/proxy/controllers/container.py | 4 +- test/unit/common/test_direct_client.py | 70 ++ test/unit/container/test_server.py | 1065 +++++++++++++++++++++++- test/unit/proxy/test_server.py | 23 + 6 files changed, 1334 insertions(+), 80 deletions(-) diff --git a/swift/common/direct_client.py b/swift/common/direct_client.py index fad4440f64..9f112afa95 100644 --- a/swift/common/direct_client.py +++ b/swift/common/direct_client.py @@ -54,22 +54,72 @@ class DirectClientException(ClientException): http_reason=resp.reason, http_headers=headers) -def _make_req(node, part, method, path, _headers, stype, - conn_timeout=5, response_timeout=15): +def _make_req(node, part, method, path, headers, stype, + conn_timeout=5, response_timeout=15, send_timeout=15, + contents=None, content_length=None, chunk_size=65535): """ Make request to backend storage node. (i.e. 'Account', 'Container', 'Object') :param node: a node dict from a ring - :param part: an integer, the partion number + :param part: an integer, the partition number :param method: a string, the HTTP method (e.g.
'PUT', 'DELETE', etc) :param path: a string, the request path :param headers: a dict, header name => value :param stype: a string, describing the type of service + :param conn_timeout: timeout while waiting for connection; default is 5 + seconds + :param response_timeout: timeout while waiting for response; default is 15 + seconds + :param send_timeout: timeout for sending request body; default is 15 + seconds + :param contents: an iterable or string to read object data from + :param content_length: value to send as content-length header + :param chunk_size: if defined, chunk size of data to send :returns: an HTTPResponse object + :raises DirectClientException: if the response status is not 2xx + :raises eventlet.Timeout: if either conn_timeout or response_timeout is + exceeded """ + if contents is not None: + if content_length is not None: + headers['Content-Length'] = str(content_length) + else: + for n, v in headers.items(): + if n.lower() == 'content-length': + content_length = int(v) + if not contents: + headers['Content-Length'] = '0' + if isinstance(contents, six.string_types): + contents = [contents] + if content_length is None: + headers['Transfer-Encoding'] = 'chunked' + with Timeout(conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], part, - method, path, headers=_headers) + method, path, headers=headers) + + if contents is not None: + contents_f = FileLikeIter(contents) + + with Timeout(send_timeout): + if content_length is None: + chunk = contents_f.read(chunk_size) + while chunk: + conn.send('%x\r\n%s\r\n' % (len(chunk), chunk)) + chunk = contents_f.read(chunk_size) + conn.send('0\r\n\r\n') + else: + left = content_length + while left > 0: + size = chunk_size + if size > left: + size = left + chunk = contents_f.read(size) + if not chunk: + break + conn.send(chunk) + left -= len(chunk) + with Timeout(response_timeout): resp = conn.getresponse() resp.read() @@ -82,7 +132,7 @@ def _get_direct_account_container(path, stype, node, 
part, marker=None, limit=None, prefix=None, delimiter=None, conn_timeout=5, response_timeout=15, - end_marker=None, reverse=None): + end_marker=None, reverse=None, headers=None): """Base class for get direct account and container. Do not use directly use the get_direct_account or @@ -105,7 +155,7 @@ def _get_direct_account_container(path, stype, node, part, with Timeout(conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], part, 'GET', path, query_string=qs, - headers=gen_headers()) + headers=gen_headers(hdrs_in=headers)) with Timeout(response_timeout): resp = conn.getresponse() if not is_success(resp.status): @@ -121,11 +171,12 @@ def _get_direct_account_container(path, stype, node, part, return resp_headers, json.loads(resp.read()) -def gen_headers(hdrs_in=None, add_ts=False): +def gen_headers(hdrs_in=None, add_ts=False, add_user_agent=True): hdrs_out = HeaderKeyDict(hdrs_in) if hdrs_in else HeaderKeyDict() if add_ts: hdrs_out['X-Timestamp'] = Timestamp.now().internal - hdrs_out['User-Agent'] = 'direct-client %s' % os.getpid() + if add_user_agent: + hdrs_out['User-Agent'] = 'direct-client %s' % os.getpid() return hdrs_out @@ -197,7 +248,7 @@ def direct_head_container(node, part, account, container, conn_timeout=5, def direct_get_container(node, part, account, container, marker=None, limit=None, prefix=None, delimiter=None, conn_timeout=5, response_timeout=15, end_marker=None, - reverse=None): + reverse=None, headers=None): """ Get container listings directly from the container server. @@ -213,6 +264,7 @@ def direct_get_container(node, part, account, container, marker=None, :param response_timeout: timeout in seconds for getting the response :param end_marker: end_marker query :param reverse: reverse the returned listing + :param headers: headers to be included in the request :returns: a tuple of (response headers, a list of objects) The response headers will be a HeaderKeyDict. 
""" @@ -224,7 +276,8 @@ def direct_get_container(node, part, account, container, marker=None, end_marker=end_marker, reverse=reverse, conn_timeout=conn_timeout, - response_timeout=response_timeout) + response_timeout=response_timeout, + headers=headers) def direct_delete_container(node, part, account, container, conn_timeout=5, @@ -250,6 +303,37 @@ def direct_delete_container(node, part, account, container, conn_timeout=5, 'Container', conn_timeout, response_timeout) +def direct_put_container(node, part, account, container, conn_timeout=5, + response_timeout=15, headers=None, contents=None, + content_length=None, chunk_size=65535): + """ + Make a PUT request to a container server. + + :param node: node dictionary from the ring + :param part: partition the container is on + :param account: account name + :param container: container name + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: additional headers to include in the request + :param contents: an iterable or string to send in request body (optional) + :param content_length: value to send as content-length header (optional) + :param chunk_size: chunk size of data to send (optional) + :raises ClientException: HTTP PUT request failed + """ + if headers is None: + headers = {} + + lower_headers = set(k.lower() for k in headers) + headers_out = gen_headers(headers, + add_ts='x-timestamp' not in lower_headers, + add_user_agent='user-agent' not in lower_headers) + path = '/%s/%s' % (account, container) + _make_req(node, part, 'PUT', path, headers_out, 'Container', conn_timeout, + response_timeout, contents=contents, + content_length=content_length, chunk_size=chunk_size) + + def direct_put_container_object(node, part, account, container, obj, conn_timeout=5, response_timeout=15, headers=None): @@ -385,56 +469,18 @@ def direct_put_object(node, part, account, container, name, contents, headers = {} if etag: 
headers['ETag'] = etag.strip('"') - if content_length is not None: - headers['Content-Length'] = str(content_length) - else: - for n, v in headers.items(): - if n.lower() == 'content-length': - content_length = int(v) if content_type is not None: headers['Content-Type'] = content_type else: headers['Content-Type'] = 'application/octet-stream' - if not contents: - headers['Content-Length'] = '0' - if isinstance(contents, six.string_types): - contents = [contents] # Incase the caller want to insert an object with specific age add_ts = 'X-Timestamp' not in headers - if content_length is None: - headers['Transfer-Encoding'] = 'chunked' + resp = _make_req( + node, part, 'PUT', path, gen_headers(headers, add_ts=add_ts), + 'Object', conn_timeout, response_timeout, contents=contents, + content_length=content_length, chunk_size=chunk_size) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'PUT', path, headers=gen_headers(headers, add_ts)) - - contents_f = FileLikeIter(contents) - - if content_length is None: - chunk = contents_f.read(chunk_size) - while chunk: - conn.send('%x\r\n%s\r\n' % (len(chunk), chunk)) - chunk = contents_f.read(chunk_size) - conn.send('0\r\n\r\n') - else: - left = content_length - while left > 0: - size = chunk_size - if size > left: - size = left - chunk = contents_f.read(size) - if not chunk: - break - conn.send(chunk) - left -= len(chunk) - - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Object', 'PUT', - node, part, path, resp) return resp.getheader('etag').strip('"') diff --git a/swift/container/server.py b/swift/container/server.py index a3c233b664..f8e830cb59 100644 --- a/swift/container/server.py +++ b/swift/container/server.py @@ -24,7 +24,8 @@ from eventlet import Timeout import swift.common.db from swift.container.sync_store import ContainerSyncStore -from swift.container.backend import 
ContainerBroker, DATADIR +from swift.container.backend import ContainerBroker, DATADIR, \ + RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED from swift.container.replicator import ContainerReplicatorRpc from swift.common.db import DatabaseAlreadyExists from swift.common.container_sync_realms import ContainerSyncRealms @@ -33,7 +34,9 @@ from swift.common.request_helpers import get_param, \ from swift.common.utils import get_logger, hash_path, public, \ Timestamp, storage_directory, validate_sync_to, \ config_true_value, timing_stats, replication, \ - override_bytes_from_content_type, get_log_line + override_bytes_from_content_type, get_log_line, ShardRange, \ + list_from_csv + from swift.common.constraints import valid_timestamp, check_utf8, check_drive from swift.common import constraints from swift.common.bufferedhttp import http_connect @@ -72,6 +75,7 @@ def gen_resp_headers(info, is_deleted=False): 'X-Timestamp': Timestamp(info.get('created_at', 0)).normal, 'X-PUT-Timestamp': Timestamp( info.get('put_timestamp', 0)).normal, + 'X-Backend-Sharding-State': info.get('db_state', UNSHARDED), }) return headers @@ -408,6 +412,22 @@ class ContainerController(BaseStorageServer): req.headers.get('x-content-type-timestamp'), req.headers.get('x-meta-timestamp')) return HTTPCreated(request=req) + + record_type = req.headers.get('x-backend-record-type', '').lower() + if record_type == RECORD_TYPE_SHARD: + try: + # validate incoming data... 
+ shard_ranges = [ShardRange.from_dict(sr) + for sr in json.loads(req.body)] + except (ValueError, KeyError, TypeError) as err: + return HTTPBadRequest('Invalid body: %r' % err) + created = self._maybe_autocreate(broker, req_timestamp, account, + requested_policy_index) + self._update_metadata(req, broker, req_timestamp, 'PUT') + if shard_ranges: + # TODO: consider writing the shard ranges into the pending + # file, but if so ensure an all-or-none semantic for the write + broker.merge_shard_ranges(shard_ranges) else: # put container if requested_policy_index is None: # use the default index sent by the proxy if available @@ -423,14 +443,14 @@ class ContainerController(BaseStorageServer): resp = self.account_update(req, account, container, broker) if resp: return resp - if created: - return HTTPCreated(request=req, - headers={'x-backend-storage-policy-index': - broker.storage_policy_index}) - else: - return HTTPAccepted(request=req, - headers={'x-backend-storage-policy-index': - broker.storage_policy_index}) + if created: + return HTTPCreated(request=req, + headers={'x-backend-storage-policy-index': + broker.storage_policy_index}) + else: + return HTTPAccepted(request=req, + headers={'x-backend-storage-policy-index': + broker.storage_policy_index}) @public @timing_stats(sample_rate=0.1) @@ -469,13 +489,18 @@ class ContainerController(BaseStorageServer): :params record: object entry record :returns: modified record """ - (name, created, size, content_type, etag) = record[:5] - if content_type is None: - return {'subdir': name.decode('utf8')} - response = {'bytes': size, 'hash': etag, 'name': name.decode('utf8'), - 'content_type': content_type} + if isinstance(record, ShardRange): + created = record.timestamp + response = dict(record) + else: + (name, created, size, content_type, etag) = record[:5] + if content_type is None: + return {'subdir': name.decode('utf8')} + response = { + 'bytes': size, 'hash': etag, 'name': name.decode('utf8'), + 'content_type': 
content_type} + override_bytes_from_content_type(response, logger=self.logger) response['last_modified'] = Timestamp(created).isoformat - override_bytes_from_content_type(response, logger=self.logger) return response @public @@ -509,12 +534,45 @@ class ContainerController(BaseStorageServer): pending_timeout=0.1, stale_reads_ok=True) info, is_deleted = broker.get_info_is_deleted() - resp_headers = gen_resp_headers(info, is_deleted=is_deleted) - if is_deleted: - return HTTPNotFound(request=req, headers=resp_headers) - container_list = broker.list_objects_iter( - limit, marker, end_marker, prefix, delimiter, path, - storage_policy_index=info['storage_policy_index'], reverse=reverse) + record_type = req.headers.get('x-backend-record-type', '').lower() + if record_type == 'auto' and info.get('db_state') in (SHARDING, + SHARDED): + record_type = 'shard' + if record_type == 'shard': + override_deleted = info and config_true_value( + req.headers.get('x-backend-override-deleted', False)) + resp_headers = gen_resp_headers( + info, is_deleted=is_deleted and not override_deleted) + if is_deleted and not override_deleted: + return HTTPNotFound(request=req, headers=resp_headers) + resp_headers['X-Backend-Record-Type'] = 'shard' + includes = get_param(req, 'includes') + states = get_param(req, 'states') + fill_gaps = False + if states: + states = list_from_csv(states) + fill_gaps = any(('listing' in states, 'updating' in states)) + try: + states = broker.resolve_shard_range_states(states) + except ValueError: + return HTTPBadRequest(request=req, body='Bad state') + include_deleted = config_true_value( + req.headers.get('x-backend-include-deleted', False)) + container_list = broker.get_shard_ranges( + marker, end_marker, includes, reverse, states=states, + include_deleted=include_deleted, fill_gaps=fill_gaps) + else: + resp_headers = gen_resp_headers(info, is_deleted=is_deleted) + if is_deleted: + return HTTPNotFound(request=req, headers=resp_headers) + 
resp_headers['X-Backend-Record-Type'] = 'object' + # Use the retired db while container is in process of sharding, + # otherwise use current db + src_broker = broker.get_brokers()[0] + container_list = src_broker.list_objects_iter( + limit, marker, end_marker, prefix, delimiter, path, + storage_policy_index=info['storage_policy_index'], + reverse=reverse) return self.create_listing(req, out_content_type, info, resp_headers, broker.metadata, container_list, container) diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py index 15c67858ea..34c02a3f1a 100644 --- a/swift/proxy/controllers/container.py +++ b/swift/proxy/controllers/container.py @@ -84,7 +84,9 @@ class ContainerController(Controller): def GETorHEAD(self, req): """Handler for HTTP GET/HEAD requests.""" ai = self.account_info(self.account_name, req) - if not ai[1]: + auto_account = self.account_name.startswith( + self.app.auto_create_account_prefix) + if not (auto_account or ai[1]): if 'swift.authorize' in req.environ: aresp = req.environ['swift.authorize'](req) if aresp: diff --git a/test/unit/common/test_direct_client.py b/test/unit/common/test_direct_client.py index a832f31c6f..fc2dffc696 100644 --- a/test/unit/common/test_direct_client.py +++ b/test/unit/common/test_direct_client.py @@ -95,6 +95,11 @@ def mocked_http_conn(*args, **kwargs): yield fake_conn +@contextmanager +def noop_timeout(duration): + yield + + @patch_policies class TestDirectClient(unittest.TestCase): @@ -117,6 +122,10 @@ class TestDirectClient(unittest.TestCase): self.account, self.container, self.obj)) self.user_agent = 'direct-client %s' % os.getpid() + patcher = mock.patch.object(direct_client, 'Timeout', noop_timeout) + patcher.start() + self.addCleanup(patcher.stop) + def test_gen_headers(self): stub_user_agent = 'direct-client %s' % os.getpid() @@ -450,6 +459,67 @@ class TestDirectClient(unittest.TestCase): self.assertEqual(err.http_status, 500) self.assertTrue('DELETE' in str(err)) + def 
test_direct_put_container(self): + body = 'Let us begin with a quick introduction' + headers = {'x-foo': 'bar', 'Content-Length': str(len(body)), + 'Content-Type': 'application/json', + 'User-Agent': 'my UA'} + + with mocked_http_conn(204) as conn: + rv = direct_client.direct_put_container( + self.node, self.part, self.account, self.container, + contents=body, headers=headers) + self.assertEqual(conn.host, self.node['ip']) + self.assertEqual(conn.port, self.node['port']) + self.assertEqual(conn.method, 'PUT') + self.assertEqual(conn.path, self.container_path) + self.assertEqual(conn.req_headers['Content-Length'], + str(len(body))) + self.assertEqual(conn.req_headers['Content-Type'], + 'application/json') + self.assertEqual(conn.req_headers['User-Agent'], 'my UA') + self.assertTrue('x-timestamp' in conn.req_headers) + self.assertEqual('bar', conn.req_headers.get('x-foo')) + self.assertEqual(md5(body).hexdigest(), conn.etag.hexdigest()) + self.assertIsNone(rv) + + def test_direct_put_container_chunked(self): + body = 'Let us begin with a quick introduction' + headers = {'x-foo': 'bar', 'Content-Type': 'application/json'} + + with mocked_http_conn(204) as conn: + rv = direct_client.direct_put_container( + self.node, self.part, self.account, self.container, + contents=body, headers=headers) + self.assertEqual(conn.host, self.node['ip']) + self.assertEqual(conn.port, self.node['port']) + self.assertEqual(conn.method, 'PUT') + self.assertEqual(conn.path, self.container_path) + self.assertEqual(conn.req_headers['Transfer-Encoding'], 'chunked') + self.assertEqual(conn.req_headers['Content-Type'], + 'application/json') + self.assertTrue('x-timestamp' in conn.req_headers) + self.assertEqual('bar', conn.req_headers.get('x-foo')) + self.assertNotIn('Content-Length', conn.req_headers) + expected_sent = '%0x\r\n%s\r\n0\r\n\r\n' % (len(body), body) + self.assertEqual(md5(expected_sent).hexdigest(), + conn.etag.hexdigest()) + self.assertIsNone(rv) + + def 
test_direct_put_container_fail(self): + with mock.patch('swift.common.bufferedhttp.http_connect_raw', + side_effect=Exception('conn failed')): + with self.assertRaises(Exception) as cm: + direct_client.direct_put_container( + self.node, self.part, self.account, self.container) + self.assertEqual('conn failed', str(cm.exception)) + + with mocked_http_conn(Exception('resp failed')): + with self.assertRaises(Exception) as cm: + direct_client.direct_put_container( + self.node, self.part, self.account, self.container) + self.assertEqual('resp failed', str(cm.exception)) + def test_direct_put_container_object(self): headers = {'x-foo': 'bar'} diff --git a/test/unit/container/test_server.py b/test/unit/container/test_server.py index 0e4201b08a..e50f74901c 100644 --- a/test/unit/container/test_server.py +++ b/test/unit/container/test_server.py @@ -22,6 +22,7 @@ import itertools from contextlib import contextmanager from shutil import rmtree from tempfile import mkdtemp +from test.unit import make_timestamp_iter, mock_timestamp_now from time import gmtime from xml.dom import minidom import time @@ -40,7 +41,8 @@ import swift.container from swift.container import server as container_server from swift.common import constraints from swift.common.utils import (Timestamp, mkdirs, public, replication, - storage_directory, lock_parent_directory) + storage_directory, lock_parent_directory, + ShardRange) from test.unit import fake_http_connect, debug_logger, mock_check_drive from swift.common.storage_policy import (POLICIES, StoragePolicy) from swift.common.request_helpers import get_sys_meta_prefix @@ -86,6 +88,16 @@ class TestContainerController(unittest.TestCase): """ pass + def _put_shard_range(self, shard_range): + put_timestamp = shard_range.timestamp.internal + headers = {'X-Backend-Record-Type': 'shard', + 'X-Timestamp': put_timestamp} + body = json.dumps([dict(shard_range)]) + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=body) + resp = 
req.get_response(self.controller) + self.assertIn(resp.status_int, (201, 202)) + def _check_put_container_storage_policy(self, req, policy_index): resp = req.get_response(self.controller) self.assertEqual(201, resp.status_int) @@ -95,6 +107,11 @@ class TestContainerController(unittest.TestCase): self.assertEqual(str(policy_index), resp.headers['X-Backend-Storage-Policy-Index']) + def _assert_shard_ranges_equal(self, x, y): + # ShardRange.__eq__ only compares lower and upper; here we generate + # dict representations to compare all attributes + self.assertEqual([dict(sr) for sr in x], [dict(sr) for sr in y]) + def test_creation(self): # later config should be extended to assert more config options replicator = container_server.ContainerController( @@ -1372,21 +1389,100 @@ class TestContainerController(unittest.TestCase): self.assertEqual(resp.status_int, 500) def test_DELETE(self): + ts_iter = make_timestamp_iter() req = Request.blank( '/sda1/p/a/c', - environ={'REQUEST_METHOD': 'PUT'}, headers={'X-Timestamp': '1'}) + environ={'REQUEST_METHOD': 'PUT'}, + headers={'X-Timestamp': next(ts_iter).internal}) resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 201) + + # PUT an *empty* shard range + sr = ShardRange('.shards_a/c', next(ts_iter), 'l', 'u', 0, 0, + state=ShardRange.ACTIVE) req = Request.blank( '/sda1/p/a/c', - environ={'REQUEST_METHOD': 'DELETE'}, headers={'X-Timestamp': '2'}) + environ={'REQUEST_METHOD': 'PUT'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Record-Type': 'shard'}, + body=json.dumps([dict(sr)])) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 202) + + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'DELETE'}, + headers={'X-Timestamp': next(ts_iter).internal}) resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 204) + req = Request.blank( '/sda1/p/a/c', - environ={'REQUEST_METHOD': 'GET'}, headers={'X-Timestamp': '3'}) + 
environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal}) resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 404) + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Record-Type': 'shard'}, + params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 404) + + # the override-deleted header is ignored for object records + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Override-Deleted': 'true'}, + params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 404) + + # but override-deleted header makes shard ranges available after DELETE + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Record-Type': 'shard', + 'X-Backend-Override-Deleted': 'true'}, + params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual([dict(sr, last_modified=sr.timestamp.isoformat)], + json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual('shard', resp.headers['X-Backend-Record-Type']) + + # ... 
unless the override header equates to False + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Record-Type': 'shard', + 'X-Backend-Override-Deleted': 'no'}, + params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 404) + self.assertNotIn('X-Backend-Record-Type', resp.headers) + + # ...or the db file is unlinked + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + self.assertTrue(os.path.exists(broker.db_file)) + os.unlink(broker.db_file) + self.assertFalse(os.path.exists(broker.db_file)) + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'GET'}, + headers={'X-Timestamp': next(ts_iter).internal, + 'X-Backend-Record-Type': 'shard', + 'X-Backend-Override-Deleted': 'true'}, + params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 404) + self.assertNotIn('X-Backend-Record-Type', resp.headers) def test_DELETE_PUT_recreate(self): path = '/sda1/p/a/c' @@ -2042,6 +2138,947 @@ class TestContainerController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 412) + def test_PUT_shard_range_autocreates_shard_container(self): + ts_iter = make_timestamp_iter() + shard_range = ShardRange('.shards_a/shard_c', next(ts_iter)) + put_timestamp = next(ts_iter).internal + headers = {'X-Backend-Record-Type': 'shard', + 'X-Timestamp': put_timestamp, + 'X-Container-Sysmeta-Test': 'set', + 'X-Container-Meta-Test': 'persisted'} + + # PUT shard range to non-existent container with non-autocreate prefix + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(404, resp.status_int) + + # PUT shard range to non-existent container with autocreate prefix, + # missing storage policy + headers['X-Timestamp'] = 
next(ts_iter).internal + req = Request.blank( + '/sda1/p/.shards_a/shard_c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(400, resp.status_int) + self.assertIn('X-Backend-Storage-Policy-Index header is required', + resp.body) + + # PUT shard range to non-existent container with autocreate prefix + headers['X-Timestamp'] = next(ts_iter).internal + policy_index = random.choice(POLICIES).idx + headers['X-Backend-Storage-Policy-Index'] = str(policy_index) + req = Request.blank( + '/sda1/p/.shards_a/shard_c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + + # repeat PUT of shard range to autocreated container - 204 response + headers['X-Timestamp'] = next(ts_iter).internal + headers.pop('X-Backend-Storage-Policy-Index') # no longer required + req = Request.blank( + '/sda1/p/.shards_a/shard_c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + + # regular PUT to autocreated container - 204 response + headers['X-Timestamp'] = next(ts_iter).internal + req = Request.blank( + '/sda1/p/.shards_a/shard_c', method='PUT', + headers={'X-Timestamp': next(ts_iter).internal}, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + + def test_PUT_shard_range_to_deleted_container(self): + ts_iter = make_timestamp_iter() + put_time = next(ts_iter).internal + # create a container, get it to sharded state and then delete it + req = Request.blank('/sda1/p/a/c', method='PUT', + headers={'X-Timestamp': put_time}) + resp = req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + broker.enable_sharding(next(ts_iter)) + 
self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + + delete_time = next(ts_iter).internal + req = Request.blank('/sda1/p/a/c', method='DELETE', + headers={'X-Timestamp': delete_time}) + resp = req.get_response(self.controller) + self.assertEqual(204, resp.status_int) + self.assertTrue(broker.is_deleted()) + self.assertEqual(delete_time, broker.get_info()['delete_timestamp']) + self.assertEqual(put_time, broker.get_info()['put_timestamp']) + req = Request.blank('/sda1/p/a/c', method='GET') + resp = req.get_response(self.controller) + self.assertEqual(404, resp.status_int) + + # shard range PUT is accepted but container remains deleted + shard_range = ShardRange('.shards_a/shard_c', next(ts_iter), + state=ShardRange.ACTIVE) + headers = {'X-Backend-Record-Type': 'shard', + 'X-Timestamp': next(ts_iter).internal, + 'X-Container-Sysmeta-Test': 'set', + 'X-Container-Meta-Test': 'persisted'} + + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertTrue(broker.get_info_is_deleted()[1]) + self.assertEqual(delete_time, broker.get_info()['delete_timestamp']) + self.assertEqual(put_time, broker.get_info()['put_timestamp']) + req = Request.blank('/sda1/p/a/c', method='GET') + resp = req.get_response(self.controller) + self.assertEqual(404, resp.status_int) + + # unless shard range has non-zero stats, then container is revived + shard_range.update_meta(99, 1234, meta_timestamp=next(ts_iter)) + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=json.dumps([dict(shard_range)])) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertFalse(broker.get_info_is_deleted()[1]) + self.assertEqual(delete_time, broker.get_info()['delete_timestamp']) + self.assertEqual(put_time, broker.get_info()['put_timestamp']) + req = 
Request.blank('/sda1/p/a/c', method='GET') + resp = req.get_response(self.controller) + self.assertEqual(204, resp.status_int) + self.assertEqual('99', resp.headers['X-Container-Object-Count']) + + def test_PUT_shard_range_json_in_body(self): + ts_iter = make_timestamp_iter() + oldest_ts = next(ts_iter) # used for stale shard range PUT later + shard_bounds = [('', 'ham', ShardRange.ACTIVE), + ('ham', 'salami', ShardRange.ACTIVE), + ('salami', '', ShardRange.CREATED)] + shard_ranges = [ + ShardRange('.shards_a/_%s' % upper, next(ts_iter), + lower, upper, + i * 100, i * 1000, meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter)) + for i, (lower, upper, state) in enumerate(shard_bounds)] + + put_timestamp = next(ts_iter).internal + headers = {'X-Backend-Record-Type': 'shard', + 'X-Timestamp': put_timestamp, + 'X-Container-Sysmeta-Test': 'set', + 'X-Container-Meta-Test': 'persisted'} + body = json.dumps([dict(sr) for sr in shard_ranges[:2]]) + + # PUT some shard ranges to non-existent container + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=body) + resp = req.get_response(self.controller) + self.assertEqual(404, resp.status_int) + + # create the container with a regular PUT + req = Request.blank( + '/sda1/p/a/c', method='PUT', + headers={'X-Timestamp': put_timestamp}, body=body) + resp = req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + + # now we can PUT shard ranges + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers, + body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + + # check broker + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + # sysmeta and user meta is updated + exp_meta = {'X-Container-Sysmeta-Test': 'set', + 'X-Container-Meta-Test': 'persisted'} + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self.assertEqual(put_timestamp, 
broker.get_info()['put_timestamp']) + self._assert_shard_ranges_equal(shard_ranges[:2], + broker.get_shard_ranges()) + + # empty json dict + body = json.dumps({}) + headers['X-Timestamp'] = next(ts_iter).internal + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self._assert_shard_ranges_equal(shard_ranges[:2], + broker.get_shard_ranges()) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + older_ts = next(ts_iter) # used for stale shard range PUT later + # updated and new shard ranges + shard_ranges[1].bytes_used += 100 + shard_ranges[1].meta_timestamp = next(ts_iter) + body = json.dumps([dict(sr) for sr in shard_ranges[1:]]) + headers['X-Timestamp'] = next(ts_iter).internal + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + # stale shard range + stale_shard_range = shard_ranges[1].copy() + stale_shard_range.bytes_used = 0 + stale_shard_range.object_count = 0 + stale_shard_range.meta_timestamp = older_ts + stale_shard_range.state = ShardRange.CREATED + stale_shard_range.state_timestamp = oldest_ts + body = json.dumps([dict(stale_shard_range)]) + headers['X-Timestamp'] = next(ts_iter).internal + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self._assert_shard_ranges_equal(shard_ranges, + 
broker.get_shard_ranges()) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + # deleted shard range + shard_ranges[0].deleted = 1 + shard_ranges[0].timestamp = next(ts_iter) + body = json.dumps([dict(shard_ranges[0])]) + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self._assert_shard_ranges_equal( + shard_ranges, broker.get_shard_ranges(include_deleted=True)) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + def check_bad_body(body): + bad_put_timestamp = next(ts_iter).internal + headers['X-Timestamp'] = bad_put_timestamp + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(400, resp.status_int) + self.assertIn('Invalid body', resp.body) + self.assertEqual( + exp_meta, dict((k, v[0]) for k, v in broker.metadata.items())) + self._assert_shard_ranges_equal( + shard_ranges, broker.get_shard_ranges(include_deleted=True)) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + check_bad_body('not json') + check_bad_body('') + check_bad_body('["not a shard range"]') + check_bad_body('[[]]') + bad_shard_range = dict(ShardRange('a/c', next(ts_iter))) + bad_shard_range.pop('timestamp') + check_bad_body(json.dumps([bad_shard_range])) + + def check_not_shard_record_type(headers): + # body ignored + body = json.dumps([dict(sr) for sr in shard_ranges]) + # note, regular PUT so put timestamp is updated + put_timestamp = next(ts_iter).internal + headers['X-Timestamp'] = put_timestamp + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + resp = req.get_response(self.controller) + self.assertEqual(202, resp.status_int) + self._assert_shard_ranges_equal( + shard_ranges, 
broker.get_shard_ranges(include_deleted=True)) + self.assertEqual(put_timestamp, broker.get_info()['put_timestamp']) + + check_not_shard_record_type({'X-Backend-Record-Type': 'object', + 'X-Timestamp': next(ts_iter).internal}) + + check_not_shard_record_type({'X-Timestamp': next(ts_iter).internal}) + + def test_PUT_GET_shard_ranges(self): + # make a container + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() # used when mocking Timestamp.now() + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers) + self.assertEqual(201, req.get_response(self.controller).status_int) + # PUT some objects + objects = [{'name': 'obj_%d' % i, + 'x-timestamp': next(ts_iter).normal, + 'x-content-type': 'text/plain', + 'x-etag': 'etag_%d' % i, + 'x-size': 1024 * i + } for i in range(2)] + for obj in objects: + req = Request.blank('/sda1/p/a/c/%s' % obj['name'], method='PUT', + headers=obj) + self._update_object_put_headers(req) + resp = req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + # PUT some shard ranges + shard_bounds = [('', 'apple', ShardRange.SHRINKING), + ('apple', 'ham', ShardRange.CLEAVED), + ('ham', 'salami', ShardRange.ACTIVE), + ('salami', 'yoghurt', ShardRange.CREATED), + ('yoghurt', '', ShardRange.FOUND), + ] + shard_ranges = [ + ShardRange('.sharded_a/_%s' % upper, next(ts_iter), + lower, upper, + i * 100, i * 1000, meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter)) + for i, (lower, upper, state) in enumerate(shard_bounds)] + for shard_range in shard_ranges: + self._put_shard_range(shard_range) + + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + self.assertTrue(broker.is_root_container()) # sanity + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + + # sanity check - no shard ranges when GET is only for objects + def check_object_GET(path): + req = Request.blank(path, method='GET') + resp = 
req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + expected = [ + dict(hash=obj['x-etag'], bytes=obj['x-size'], + content_type=obj['x-content-type'], + last_modified=Timestamp(obj['x-timestamp']).isoformat, + name=obj['name']) for obj in objects] + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual('object', resp.headers['X-Backend-Record-Type']) + + check_object_GET('/sda1/p/a/c?format=json') + + # GET only shard ranges + def check_shard_GET(expected_shard_ranges, path, params=''): + req = Request.blank('/sda1/p/%s?format=json%s' % + (path, params), method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + with mock_timestamp_now(ts_now): + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + expected = [ + dict(sr, last_modified=Timestamp(sr.timestamp).isoformat) + for sr in expected_shard_ranges] + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual('shard', resp.headers['X-Backend-Record-Type']) + + # all shards + check_shard_GET(shard_ranges, 'a/c') + check_shard_GET(reversed(shard_ranges), 'a/c', params='&reverse=true') + # only created shards + check_shard_GET(shard_ranges[3:4], 'a/c', params='&states=created') + # only found shards + check_shard_GET(shard_ranges[4:5], 'a/c', params='&states=found') + # only cleaved shards + check_shard_GET(shard_ranges[1:2], 'a/c', + params='&states=cleaved') + # only active shards + check_shard_GET(shard_ranges[2:3], 'a/c', + params='&states=active&end_marker=pickle') + # only cleaved or active shards, reversed + check_shard_GET( + reversed(shard_ranges[1:3]), 'a/c', + params='&states=cleaved,active&reverse=true&marker=pickle') + # only shrinking shards + check_shard_GET(shard_ranges[:1], 'a/c', + 
params='&states=shrinking&end_marker=pickle') + check_shard_GET(shard_ranges[:1], 'a/c', + params='&states=shrinking&reverse=true&marker=pickle') + # only active or shrinking shards + check_shard_GET([shard_ranges[0], shard_ranges[2]], 'a/c', + params='&states=shrinking,active&end_marker=pickle') + check_shard_GET( + [shard_ranges[2], shard_ranges[0]], 'a/c', + params='&states=active,shrinking&reverse=true&marker=pickle') + # only active or shrinking shards using listing alias + check_shard_GET(shard_ranges[:3], 'a/c', + params='&states=listing&end_marker=pickle') + check_shard_GET( + reversed(shard_ranges[:3]), 'a/c', + params='&states=listing&reverse=true&marker=pickle') + # only created, cleaved, active, shrinking shards using updating alias + check_shard_GET(shard_ranges[1:4], 'a/c', + params='&states=updating&end_marker=treacle') + check_shard_GET( + reversed(shard_ranges[1:4]), 'a/c', + params='&states=updating&reverse=true&marker=treacle') + + # listing shards don't cover entire namespace so expect an extra filler + extra_shard_range = ShardRange( + 'a/c', ts_now, shard_ranges[2].upper, ShardRange.MAX, 2, 1024, + state=ShardRange.ACTIVE) + expected = shard_ranges[:3] + [extra_shard_range] + check_shard_GET(expected, 'a/c', params='&states=listing') + check_shard_GET(reversed(expected), 'a/c', + params='&states=listing&reverse=true') + expected = [shard_ranges[2], extra_shard_range] + check_shard_GET(expected, 'a/c', + params='&states=listing&marker=pickle') + check_shard_GET( + reversed(expected), 'a/c', + params='&states=listing&reverse=true&end_marker=pickle') + # updating shards don't cover entire namespace so expect a filler + extra_shard_range = ShardRange( + 'a/c', ts_now, shard_ranges[3].upper, ShardRange.MAX, 2, 1024, + state=ShardRange.ACTIVE) + expected = shard_ranges[1:4] + [extra_shard_range] + check_shard_GET(expected, 'a/c', params='&states=updating') + check_shard_GET(reversed(expected), 'a/c', + params='&states=updating&reverse=true') + # 
when no listing shard ranges cover the requested namespace range then + # filler is for entire requested namespace + extra_shard_range = ShardRange( + 'a/c', ts_now, 'treacle', ShardRange.MAX, 2, 1024, + state=ShardRange.ACTIVE) + check_shard_GET([extra_shard_range], 'a/c', + params='&states=listing&marker=treacle') + check_shard_GET( + [extra_shard_range], 'a/c', + params='&states=listing&reverse=true&end_marker=treacle') + extra_shard_range = ShardRange( + 'a/c', ts_now, 'treacle', 'walnut', 2, 1024, + state=ShardRange.ACTIVE) + params = '&states=listing&marker=treacle&end_marker=walnut' + check_shard_GET([extra_shard_range], 'a/c', params=params) + params = '&states=listing&reverse=true&marker=walnut' + \ + '&end_marker=treacle' + check_shard_GET([extra_shard_range], 'a/c', params=params) + # specific object + check_shard_GET(shard_ranges[1:2], 'a/c', params='&includes=cheese') + check_shard_GET(shard_ranges[1:2], 'a/c', params='&includes=ham') + check_shard_GET(shard_ranges[2:3], 'a/c', params='&includes=pickle') + check_shard_GET(shard_ranges[2:3], 'a/c', params='&includes=salami') + check_shard_GET(shard_ranges[3:4], 'a/c', params='&includes=walnut') + check_shard_GET(shard_ranges[3:4], 'a/c', + params='&includes=walnut&reverse=true') + # with marker + check_shard_GET(shard_ranges[1:], 'a/c', params='&marker=cheese') + check_shard_GET(reversed(shard_ranges[:2]), 'a/c', + params='&marker=cheese&reverse=true') + check_shard_GET(shard_ranges[2:], 'a/c', params='&marker=ham') + check_shard_GET(reversed(shard_ranges[:2]), 'a/c', + params='&marker=ham&reverse=true') + check_shard_GET(shard_ranges[2:], 'a/c', params='&marker=pickle') + check_shard_GET(reversed(shard_ranges[:3]), 'a/c', + params='&marker=pickle&reverse=true') + check_shard_GET(shard_ranges[3:], 'a/c', params='&marker=salami') + check_shard_GET(reversed(shard_ranges[:3]), 'a/c', + params='&marker=salami&reverse=true') + check_shard_GET(shard_ranges[3:], 'a/c', params='&marker=walnut') + 
check_shard_GET(reversed(shard_ranges[:4]), 'a/c', + params='&marker=walnut&reverse=true') + # with end marker + check_shard_GET(shard_ranges[:2], 'a/c', params='&end_marker=cheese') + check_shard_GET(reversed(shard_ranges[1:]), 'a/c', + params='&end_marker=cheese&reverse=true') + # everything in range 'apple' - 'ham' is <= end_marker of 'ham' so that + # range is not included because end_marker is non-inclusive + check_shard_GET(shard_ranges[:2], 'a/c', params='&end_marker=ham') + check_shard_GET(reversed(shard_ranges[2:]), 'a/c', + params='&end_marker=ham&reverse=true') + check_shard_GET(shard_ranges[:3], 'a/c', params='&end_marker=pickle') + check_shard_GET(reversed(shard_ranges[2:]), 'a/c', + params='&end_marker=pickle&reverse=true') + check_shard_GET(shard_ranges[:3], 'a/c', params='&end_marker=salami') + check_shard_GET(reversed(shard_ranges[3:]), 'a/c', + params='&end_marker=salami&reverse=true') + check_shard_GET(shard_ranges[:4], 'a/c', params='&end_marker=walnut') + check_shard_GET(reversed(shard_ranges[3:]), 'a/c', + params='&end_marker=walnut&reverse=true') + # with marker and end marker + check_shard_GET(shard_ranges[1:2], 'a/c', + params='&marker=cheese&end_marker=egg') + check_shard_GET(shard_ranges[1:2], 'a/c', + params='&end_marker=cheese&marker=egg&reverse=true') + check_shard_GET(shard_ranges[1:3], 'a/c', + params='&marker=egg&end_marker=jam') + check_shard_GET(reversed(shard_ranges[1:3]), 'a/c', + params='&end_marker=egg&marker=jam&reverse=true') + check_shard_GET(shard_ranges[1:4], 'a/c', + params='&marker=cheese&end_marker=walnut') + check_shard_GET(reversed(shard_ranges[1:4]), 'a/c', + params='&end_marker=cheese&marker=walnut&reverse=true') + check_shard_GET(shard_ranges[2:4], 'a/c', + params='&marker=jam&end_marker=walnut') + check_shard_GET(reversed(shard_ranges[2:4]), 'a/c', + params='&end_marker=jam&marker=walnut&reverse=true') + check_shard_GET(shard_ranges[3:4], 'a/c', + params='&marker=toast&end_marker=walnut') + 
check_shard_GET(shard_ranges[3:4], 'a/c', + params='&end_marker=toast&marker=walnut&reverse=true') + check_shard_GET([], 'a/c', + params='&marker=egg&end_marker=cheese') + check_shard_GET([], 'a/c', + params='&marker=cheese&end_marker=egg&reverse=true') + + # delete a shard range + shard_range = shard_ranges[1] + shard_range.set_deleted(timestamp=next(ts_iter)) + self._put_shard_range(shard_range) + + self._assert_shard_ranges_equal(shard_ranges[:1] + shard_ranges[2:], + broker.get_shard_ranges()) + + check_shard_GET(shard_ranges[:1] + shard_ranges[2:], 'a/c') + check_shard_GET(shard_ranges[2:3], 'a/c', params='&includes=jam') + # specify obj, marker or end_marker not in any shard range + check_shard_GET([], 'a/c', params='&includes=cheese') + check_shard_GET([], 'a/c', params='&includes=cheese&reverse=true') + check_shard_GET([], 'a/c', params='&includes=ham') + check_shard_GET(shard_ranges[2:], 'a/c/', params='&marker=cheese') + check_shard_GET(shard_ranges[:1], 'a/c/', + params='&marker=cheese&reverse=true') + check_shard_GET(shard_ranges[:1], 'a/c/', params='&end_marker=cheese') + check_shard_GET(reversed(shard_ranges[2:]), 'a/c/', + params='&end_marker=cheese&reverse=true') + + self.assertFalse(self.controller.logger.get_lines_for_level('warning')) + self.assertFalse(self.controller.logger.get_lines_for_level('error')) + + def test_GET_shard_ranges_using_state_aliases(self): + # make a shard container + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() # used when mocking Timestamp.now() + shard_ranges = [] + lower = '' + for state in sorted(ShardRange.STATES.keys()): + upper = str(state) + shard_ranges.append( + ShardRange('.shards_a/c_%s' % upper, next(ts_iter), + lower, upper, state * 100, state * 1000, + meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter))) + lower = upper + + def do_test(root_path, path, params, expected_states): + expected = [ + sr for sr in shard_ranges if sr.state in expected_states] + own_shard_range = 
ShardRange(path, next(ts_iter), '', '', + state=ShardRange.ACTIVE) + expected.append(own_shard_range.copy( + lower=expected[-1].upper, meta_timestamp=ts_now)) + expected = [dict(sr, last_modified=sr.timestamp.isoformat) + for sr in expected] + headers = {'X-Timestamp': next(ts_iter).normal} + + # create container + req = Request.blank( + '/sda1/p/%s' % path, method='PUT', headers=headers) + self.assertIn( + req.get_response(self.controller).status_int, (201, 202)) + # PUT some shard ranges + headers = {'X-Timestamp': next(ts_iter).normal, + 'X-Container-Sysmeta-Shard-Root': root_path, + 'X-Backend-Record-Type': 'shard'} + body = json.dumps( + [dict(sr) for sr in shard_ranges + [own_shard_range]]) + req = Request.blank( + '/sda1/p/%s' % path, method='PUT', headers=headers, body=body) + self.assertEqual(202, req.get_response(self.controller).status_int) + + req = Request.blank('/sda1/p/%s?format=json%s' % + (path, params), method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + with mock_timestamp_now(ts_now): + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual('shard', resp.headers['X-Backend-Record-Type']) + + # root's shard ranges for listing + root_path = container_path = 'a/c' + params = '&states=listing' + expected_states = [ + ShardRange.CLEAVED, ShardRange.ACTIVE, ShardRange.SHARDING, + ShardRange.SHRINKING] + do_test(root_path, container_path, params, expected_states) + + # shard's shard ranges for listing + container_path = '.shards_a/c' + params = '&states=listing' + do_test(root_path, container_path, params, expected_states) + + # root's shard ranges for updating + params = '&states=updating' + expected_states = [ + ShardRange.CREATED, ShardRange.CLEAVED, ShardRange.ACTIVE, + ShardRange.SHARDING] + container_path = root_path + 
do_test(root_path, container_path, params, expected_states) + + # shard's shard ranges for updating + container_path = '.shards_a/c' + do_test(root_path, container_path, params, expected_states) + + def test_GET_shard_ranges_include_deleted(self): + # make a shard container + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() # used when mocking Timestamp.now() + shard_ranges = [] + lower = '' + for state in sorted(ShardRange.STATES.keys()): + upper = str(state) + shard_ranges.append( + ShardRange('.shards_a/c_%s' % upper, next(ts_iter), + lower, upper, state * 100, state * 1000, + meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter))) + lower = upper + # create container + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers) + self.assertIn( + req.get_response(self.controller).status_int, (201, 202)) + # PUT some shard ranges + headers = {'X-Timestamp': next(ts_iter).normal, + 'X-Backend-Record-Type': 'shard'} + body = json.dumps([dict(sr) for sr in shard_ranges]) + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + self.assertEqual(202, req.get_response(self.controller).status_int) + + def do_test(include_deleted, expected): + expected = [dict(sr, last_modified=sr.timestamp.isoformat) + for sr in expected] + headers = {'X-Backend-Record-Type': 'shard', + 'X-Backend-Include-Deleted': str(include_deleted)} + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers=headers) + with mock_timestamp_now(ts_now): + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual('shard', resp.headers['X-Backend-Record-Type']) + + do_test(False, shard_ranges) + do_test(True, shard_ranges) + + headers = {'X-Timestamp': 
next(ts_iter).normal, + 'X-Backend-Record-Type': 'shard'} + for sr in shard_ranges[::2]: + sr.set_deleted(timestamp=next(ts_iter)) + body = json.dumps([dict(sr) for sr in shard_ranges]) + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + self.assertEqual(202, req.get_response(self.controller).status_int) + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + self._assert_shard_ranges_equal( + shard_ranges[1::2], broker.get_shard_ranges()) + do_test(False, shard_ranges[1::2]) + do_test(True, shard_ranges) + + headers = {'X-Timestamp': next(ts_iter).normal, + 'X-Backend-Record-Type': 'shard'} + for sr in shard_ranges[1::2]: + sr.set_deleted(timestamp=next(ts_iter)) + body = json.dumps([dict(sr) for sr in shard_ranges]) + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers, body=body) + self.assertEqual(202, req.get_response(self.controller).status_int) + self.assertFalse(broker.get_shard_ranges()) + do_test(False, []) + do_test(True, shard_ranges) + + def test_GET_shard_ranges_errors(self): + # verify that x-backend-record-type is not included in error responses + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() # used when mocking Timestamp.now() + shard_ranges = [] + lower = '' + for state in sorted(ShardRange.STATES.keys()): + upper = str(state) + shard_ranges.append( + ShardRange('.shards_a/c_%s' % upper, next(ts_iter), + lower, upper, state * 100, state * 1000, + meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter))) + lower = upper + # create container + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank( + '/sda1/p/a/c', method='PUT', headers=headers) + self.assertIn( + req.get_response(self.controller).status_int, (201, 202)) + # PUT some shard ranges + headers = {'X-Timestamp': next(ts_iter).normal, + 'X-Backend-Record-Type': 'shard'} + body = json.dumps([dict(sr) for sr in shard_ranges]) + req = Request.blank( + '/sda1/p/a/c', method='PUT', 
headers=headers, body=body) + self.assertEqual(202, req.get_response(self.controller).status_int) + + def do_test(params): + params['format'] = 'json' + headers = {'X-Backend-Record-Type': 'shard'} + req = Request.blank('/sda1/p/a/c', method='GET', + headers=headers, params=params) + with mock_timestamp_now(ts_now): + resp = req.get_response(self.controller) + self.assertEqual(resp.content_type, 'text/html') + self.assertNotIn('X-Backend-Record-Type', resp.headers) + self.assertNotIn('X-Backend-Sharding-State', resp.headers) + self.assertNotIn('X-Container-Object-Count', resp.headers) + self.assertNotIn('X-Container-Bytes-Used', resp.headers) + self.assertNotIn('X-Timestamp', resp.headers) + self.assertNotIn('X-PUT-Timestamp', resp.headers) + return resp + + resp = do_test({'states': 'bad'}) + self.assertEqual(resp.status_int, 400) + resp = do_test({'delimiter': 'bad'}) + self.assertEqual(resp.status_int, 412) + resp = do_test({'limit': str(constraints.CONTAINER_LISTING_LIMIT + 1)}) + self.assertEqual(resp.status_int, 412) + with mock.patch('swift.container.server.check_drive', + lambda *args: False): + resp = do_test({}) + self.assertEqual(resp.status_int, 507) + + # delete the container + req = Request.blank('/sda1/p/a/c', method='DELETE', + headers={'X-Timestamp': next(ts_iter).normal}) + self.assertEqual(204, req.get_response(self.controller).status_int) + + resp = do_test({'states': 'bad'}) + self.assertEqual(resp.status_int, 404) + + def test_GET_auto_record_type(self): + # make a container + ts_iter = make_timestamp_iter() + ts_now = Timestamp.now() # used when mocking Timestamp.now() + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers) + self.assertEqual(201, req.get_response(self.controller).status_int) + # PUT some objects + objects = [{'name': 'obj_%d' % i, + 'x-timestamp': next(ts_iter).normal, + 'x-content-type': 'text/plain', + 'x-etag': 'etag_%d' % i, + 'x-size': 1024 * i + } for i in 
range(2)] + for obj in objects: + req = Request.blank('/sda1/p/a/c/%s' % obj['name'], method='PUT', + headers=obj) + self._update_object_put_headers(req) + resp = req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + # PUT some shard ranges + shard_bounds = [('', 'm', ShardRange.CLEAVED), + ('m', '', ShardRange.CREATED)] + shard_ranges = [ + ShardRange('.sharded_a/_%s' % upper, next(ts_iter), + lower, upper, + i * 100, i * 1000, meta_timestamp=next(ts_iter), + state=state, state_timestamp=next(ts_iter)) + for i, (lower, upper, state) in enumerate(shard_bounds)] + for shard_range in shard_ranges: + self._put_shard_range(shard_range) + + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + + def assert_GET_objects(req, expected_objects): + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + expected = [ + dict(hash=obj['x-etag'], bytes=obj['x-size'], + content_type=obj['x-content-type'], + last_modified=Timestamp(obj['x-timestamp']).isoformat, + name=obj['name']) for obj in expected_objects] + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual( + 'object', resp.headers.pop('X-Backend-Record-Type')) + resp.headers.pop('Content-Length') + return resp + + def assert_GET_shard_ranges(req, expected_shard_ranges): + with mock_timestamp_now(ts_now): + resp = req.get_response(self.controller) + self.assertEqual(resp.status_int, 200) + self.assertEqual(resp.content_type, 'application/json') + expected = [ + dict(sr, last_modified=Timestamp(sr.timestamp).isoformat) + for sr in expected_shard_ranges] + self.assertEqual(expected, json.loads(resp.body)) + self.assertIn('X-Backend-Record-Type', resp.headers) + self.assertEqual( + 'shard', resp.headers.pop('X-Backend-Record-Type')) + resp.headers.pop('Content-Length') + return resp + + # unsharded + req = 
Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'auto'}) + resp = assert_GET_objects(req, objects) + headers = resp.headers + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'object'}) + resp = assert_GET_objects(req, objects) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET') + resp = assert_GET_objects(req, objects) + self.assertEqual(headers, resp.headers) + + # move to sharding state + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'auto'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + headers = resp.headers + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'object'}) + resp = assert_GET_objects(req, objects) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET') + resp = assert_GET_objects(req, objects) + self.assertEqual(headers, resp.headers) + + # limit is applied to objects but not shard ranges + req = Request.blank('/sda1/p/a/c?format=json&limit=1', method='GET', + headers={'X-Backend-Record-Type': 'auto'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + headers = resp.headers + req = Request.blank('/sda1/p/a/c?format=json&limit=1', method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + 
self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json&limit=1', method='GET', + headers={'X-Backend-Record-Type': 'object'}) + resp = assert_GET_objects(req, objects[:1]) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json&limit=1', method='GET') + resp = assert_GET_objects(req, objects[:1]) + self.assertEqual(headers, resp.headers) + + # move to sharded state + self.assertTrue(broker.set_sharded_state()) + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'auto'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + headers = resp.headers + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'shard'}) + resp = assert_GET_shard_ranges(req, shard_ranges) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET', + headers={'X-Backend-Record-Type': 'object'}) + resp = assert_GET_objects(req, []) + self.assertEqual(headers, resp.headers) + req = Request.blank('/sda1/p/a/c?format=json', method='GET') + resp = assert_GET_objects(req, []) + self.assertEqual(headers, resp.headers) + + def test_PUT_GET_to_sharding_container(self): + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + ts_iter = make_timestamp_iter() + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers) + self.assertEqual(201, req.get_response(self.controller).status_int) + + def do_update(name, timestamp=None, headers=None): + # Make a PUT request to container controller to update an object + timestamp = timestamp or next(ts_iter) + headers = headers or {} + headers.update({'X-Timestamp': timestamp.internal, + 'X-Size': 17, + 'X-Content-Type': 'text/plain', + 'X-Etag': 'fake etag'}) + req = Request.blank( + '/sda1/p/a/c/%s' % name, method='PUT', headers=headers) + self._update_object_put_headers(req) + resp = 
req.get_response(self.controller) + self.assertEqual(201, resp.status_int) + + def get_api_listing(): + req = Request.blank( + '/sda1/p/a/c', method='GET', params={'format': 'json'}) + resp = req.get_response(self.controller) + self.assertEqual(200, resp.status_int) + return [obj['name'] for obj in json.loads(resp.body)] + + def assert_broker_rows(broker, expected_names, expected_max_row): + self.assertEqual(expected_max_row, broker.get_max_row()) + with broker.get() as conn: + curs = conn.execute(''' + SELECT * FROM object WHERE ROWID > -1 ORDER BY ROWID ASC + ''') + actual = [r[1] for r in curs] + + self.assertEqual(expected_names, actual) + + do_update('unsharded') + self.assertEqual(['unsharded'], get_api_listing()) + assert_broker_rows(broker, ['unsharded'], 1) + + # move container to sharding state + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + assert_broker_rows(broker.get_brokers()[0], ['unsharded'], 1) + assert_broker_rows(broker.get_brokers()[1], [], 1) + + # add another update - should not merge into the older db and therefore + # not appear in api listing + do_update('sharding') + self.assertEqual(['unsharded'], get_api_listing()) + assert_broker_rows(broker.get_brokers()[0], ['unsharded'], 1) + assert_broker_rows(broker.get_brokers()[1], ['sharding'], 2) + + orig_lister = swift.container.backend.ContainerBroker.list_objects_iter + + def mock_list_objects_iter(*args, **kwargs): + # cause an update to land in the pending file after it has been + # flushed by get_info() calls in the container PUT method, but + # before it is flushed by the call to list_objects_iter + do_update('racing_update') + return orig_lister(*args, **kwargs) + + with mock.patch( + 'swift.container.backend.ContainerBroker.list_objects_iter', + mock_list_objects_iter): + listing = get_api_listing() + + self.assertEqual(['unsharded'], listing) + assert_broker_rows(broker.get_brokers()[0], ['unsharded'], 1) + 
assert_broker_rows(broker.get_brokers()[1], ['sharding'], 2) + + # next listing will flush pending file + listing = get_api_listing() + self.assertEqual(['unsharded'], listing) + assert_broker_rows(broker.get_brokers()[0], ['unsharded'], 1) + assert_broker_rows(broker.get_brokers()[1], + ['sharding', 'racing_update'], 3) + def test_GET_json(self): # make a container req = Request.blank( @@ -2391,7 +3428,7 @@ class TestContainerController(unittest.TestCase): req = Request.blank( '/sda1/p/a/c', environ={'REQUEST_METHOD': 'PUT', 'HTTP_X_TIMESTAMP': '0'}) - resp = req.get_response(self.controller) + req.get_response(self.controller) # fill the container for i in range(3): req = Request.blank( @@ -2409,6 +3446,24 @@ class TestContainerController(unittest.TestCase): resp = req.get_response(self.controller) result = resp.body.split() self.assertEqual(result, ['2', ]) + # test limit with end_marker + req = Request.blank('/sda1/p/a/c?limit=2&end_marker=1', + environ={'REQUEST_METHOD': 'GET'}) + resp = req.get_response(self.controller) + result = resp.body.split() + self.assertEqual(result, ['0', ]) + # test limit, reverse with end_marker + req = Request.blank('/sda1/p/a/c?limit=2&end_marker=1&reverse=True', + environ={'REQUEST_METHOD': 'GET'}) + resp = req.get_response(self.controller) + result = resp.body.split() + self.assertEqual(result, ['2', ]) + # test marker > end_marker + req = Request.blank('/sda1/p/a/c?marker=2&end_marker=1', + environ={'REQUEST_METHOD': 'GET'}) + resp = req.get_response(self.controller) + result = resp.body.split() + self.assertEqual(result, []) def test_weird_content_types(self): snowman = u'\u2603' diff --git a/test/unit/proxy/test_server.py b/test/unit/proxy/test_server.py index bee74c380a..c4223c656d 100644 --- a/test/unit/proxy/test_server.py +++ b/test/unit/proxy/test_server.py @@ -8356,6 +8356,29 @@ class TestContainerController(unittest.TestCase): self.assertEqual(res.content_length, 0) self.assertNotIn('transfer-encoding', res.headers) + 
def test_GET_account_non_existent(self): + with save_globals(): + set_http_connect(404, 404, 404) + controller = proxy_server.ContainerController(self.app, 'a', 'c') + req = Request.blank('/v1/a/c') + self.app.update_request(req) + res = controller.GET(req) + self.assertEqual(res.status_int, 404) + self.assertNotIn('container/a/c', res.environ['swift.infocache']) + + def test_GET_auto_create_prefix_account_non_existent(self): + with save_globals(): + set_http_connect(404, 404, 404, 204, 204, 204) + controller = proxy_server.ContainerController(self.app, '.a', 'c') + req = Request.blank('/v1/a/c') + self.app.update_request(req) + res = controller.GET(req) + self.assertEqual(res.status_int, 204) + ic = res.environ['swift.infocache'] + self.assertEqual(ic['container/.a/c']['status'], 204) + self.assertEqual(res.content_length, 0) + self.assertNotIn('transfer-encoding', res.headers) + def test_GET_calls_authorize(self): called = [False] From e940bc6cb1c065d92239f5f298adee3eb669aff3 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Wed, 2 May 2018 10:06:12 +0100 Subject: [PATCH 6/9] Enable proxy to build listings from shards When a container is sharding or sharded the proxy container controller now builds container listings by concatenating components from shard ranges. 
Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Co-Authored-By: Samuel Merritt Change-Id: Ia4cfebbe50338a761b8b6e9903b1869cb1f5b47e --- swift/proxy/controllers/base.py | 97 +- swift/proxy/controllers/container.py | 109 ++- test/unit/proxy/controllers/test_base.py | 154 +++- test/unit/proxy/controllers/test_container.py | 858 +++++++++++++++++- 4 files changed, 1209 insertions(+), 9 deletions(-) diff --git a/swift/proxy/controllers/base.py b/swift/proxy/controllers/base.py index df0ea71b89..cca8f6cc14 100644 --- a/swift/proxy/controllers/base.py +++ b/swift/proxy/controllers/base.py @@ -28,6 +28,7 @@ from six.moves.urllib.parse import quote import os import time +import json import functools import inspect import itertools @@ -40,11 +41,11 @@ from eventlet import sleep from eventlet.timeout import Timeout import six -from swift.common.wsgi import make_pre_authed_env +from swift.common.wsgi import make_pre_authed_env, make_pre_authed_request from swift.common.utils import Timestamp, config_true_value, \ public, split_path, list_from_csv, GreenthreadSafeIterator, \ GreenAsyncPile, quorum_size, parse_content_type, \ - document_iters_to_http_response_body + document_iters_to_http_response_body, ShardRange from swift.common.bufferedhttp import http_connect from swift.common import constraints from swift.common.exceptions import ChunkReadTimeout, ChunkWriteTimeout, \ @@ -188,6 +189,7 @@ def headers_to_container_info(headers, status_int=HTTP_OK): }, 'meta': meta, 'sysmeta': sysmeta, + 'sharding_state': headers.get('x-backend-sharding-state', 'unsharded'), } @@ -375,6 +377,9 @@ def get_container_info(env, app, swift_source=None): else: info[field] = int(info[field]) + if info.get('sharding_state') is None: + info['sharding_state'] = 'unsharded' + return info @@ -1994,3 +1999,91 @@ class Controller(object): else: raise ValueError( "server_type can only be 'account' or 'container'") + + def _get_container_listing(self, req, account, 
container, headers=None, + params=None): + """ + Fetch container listing from given `account/container`. + + :param req: original Request instance. + :param account: account in which `container` is stored. + :param container: container from which listing should be fetched. + :param headers: headers to be included with the request + :param params: query string parameters to be used. + :return: a tuple of (deserialized json data structure, swob Response) + """ + params = params or {} + version, _a, _c, _other = req.split_path(3, 4, True) + path = '/'.join(['', version, account, container]) + + subreq = make_pre_authed_request( + req.environ, method='GET', path=quote(path), headers=req.headers, + swift_source='SH') + if headers: + subreq.headers.update(headers) + subreq.params = params + self.app.logger.debug( + 'Get listing from %s %s' % (subreq.path_qs, headers)) + response = self.app.handle_request(subreq) + + if not is_success(response.status_int): + self.app.logger.warning( + 'Failed to get container listing from %s: %s', + subreq.path_qs, response.status_int) + return None, response + + try: + data = json.loads(response.body) + if not isinstance(data, list): + raise ValueError('not a list') + return data, response + except ValueError as err: + self.app.logger.error( + 'Problem with listing response from %s: %r', + subreq.path_qs, err) + return None, response + + def _get_shard_ranges(self, req, account, container, includes=None, + states=None): + """ + Fetch shard ranges from given `account/container`. If `includes` is + given then the shard range for that object name is requested, otherwise + all shard ranges are requested. + + :param req: original Request instance. + :param account: account from which shard ranges should be fetched. + :param container: container from which shard ranges should be fetched. + :param includes: (optional) restricts the list of fetched shard ranges + to those which include the given name. 
+ :param states: (optional) the states of shard ranges to be fetched. + :return: a list of instances of :class:`swift.common.utils.ShardRange`, + or None if there was a problem fetching the shard ranges + """ + params = req.params.copy() + params.pop('limit', None) + params['format'] = 'json' + if includes: + params['includes'] = includes + if states: + params['states'] = states + headers = {'X-Backend-Record-Type': 'shard'} + listing, response = self._get_container_listing( + req, account, container, headers=headers, params=params) + if listing is None: + return None + + record_type = response.headers.get('x-backend-record-type') + if record_type != 'shard': + err = 'unexpected record type %r' % record_type + self.app.logger.error("Failed to get shard ranges from %s: %s", + req.path_qs, err) + return None + + try: + return [ShardRange.from_dict(shard_range) + for shard_range in listing] + except (ValueError, TypeError, KeyError) as err: + self.app.logger.error( + "Failed to get shard ranges from %s: invalid data: %r", + req.path_qs, err) + return None diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py index 34c02a3f1a..f95a31f35a 100644 --- a/swift/proxy/controllers/container.py +++ b/swift/proxy/controllers/container.py @@ -14,10 +14,12 @@ # limitations under the License. 
from swift import gettext_ as _ +import json from six.moves.urllib.parse import unquote -from swift.common.utils import public, csv_append, Timestamp -from swift.common.constraints import check_metadata +from swift.common.utils import public, csv_append, Timestamp, \ + config_true_value, ShardRange +from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT from swift.common.http import HTTP_ACCEPTED, is_success from swift.proxy.controllers.base import Controller, delay_denial, \ cors_validation, set_info_cache, clear_info_cache @@ -103,10 +105,20 @@ class ContainerController(Controller): node_iter = self.app.iter_nodes(self.app.container_ring, part) params = req.params params['format'] = 'json' + record_type = req.headers.get('X-Backend-Record-Type', '').lower() + if not record_type: + record_type = 'auto' + req.headers['X-Backend-Record-Type'] = 'auto' + params['states'] = 'listing' req.params = params resp = self.GETorHEAD_base( req, _('Container'), node_iter, part, req.swift_entity_path, concurrency) + resp_record_type = resp.headers.get('X-Backend-Record-Type', '') + if all((req.method == "GET", record_type == 'auto', + resp_record_type.lower() == 'shard')): + resp = self._get_from_shards(req, resp) + # Cache this. We just made a request to a storage node and got # up-to-date information for the container. resp.headers['X-Backend-Recheck-Container-Existence'] = str( @@ -126,6 +138,99 @@ class ContainerController(Controller): del resp.headers[key] return resp + def _get_from_shards(self, req, resp): + # construct listing using shards described by the response body + shard_ranges = [ShardRange.from_dict(data) + for data in json.loads(resp.body)] + self.app.logger.debug('GET listing from %s shards for: %s', + len(shard_ranges), req.path_qs) + if not shard_ranges: + # can't find ranges or there was a problem getting the ranges. So + # return what we have. 
+ return resp + + objects = [] + req_limit = int(req.params.get('limit', CONTAINER_LISTING_LIMIT)) + params = req.params.copy() + params.pop('states', None) + req.headers.pop('X-Backend-Record-Type', None) + reverse = config_true_value(params.get('reverse')) + marker = params.get('marker') + end_marker = params.get('end_marker') + + limit = req_limit + for shard_range in shard_ranges: + params['limit'] = limit + # Always set marker to ensure that object names less than or equal + # to those already in the listing are not fetched + if objects: + last_name = objects[-1].get('name', + objects[-1].get('subdir', u'')) + params['marker'] = last_name.encode('utf-8') + elif reverse and marker and marker > shard_range.lower: + params['marker'] = marker + elif marker and marker <= shard_range.upper: + params['marker'] = marker + else: + params['marker'] = shard_range.upper_str if reverse \ + else shard_range.lower_str + if params['marker'] and reverse: + params['marker'] += '\x00' + + # Always set end_marker to ensure that misplaced objects beyond + # the expected shard range are not fetched + if end_marker and end_marker in shard_range: + params['end_marker'] = end_marker + else: + params['end_marker'] = shard_range.lower_str if reverse \ + else shard_range.upper_str + if params['end_marker'] and not reverse: + params['end_marker'] += '\x00' + + if (shard_range.account == self.account_name and + shard_range.container == self.container_name): + # directed back to same container - force GET of objects + headers = {'X-Backend-Record-Type': 'object'} + else: + headers = None + self.app.logger.debug('Getting from %s %s with %s', + shard_range, shard_range.name, headers) + objs, shard_resp = self._get_container_listing( + req, shard_range.account, shard_range.container, + headers=headers, params=params) + + if not objs: + # tolerate errors or empty shard containers + continue + + objects.extend(objs) + limit -= len(objs) + + if limit <= 0: + break + elif (end_marker and reverse 
and + end_marker >= objects[-1]['name'].encode('utf-8')): + break + elif (end_marker and not reverse and + end_marker <= objects[-1]['name'].encode('utf-8')): + break + + resp.body = json.dumps(objects) + constrained = any(req.params.get(constraint) for constraint in ( + 'marker', 'end_marker', 'path', 'prefix', 'delimiter')) + if not constrained and len(objects) < req_limit: + self.app.logger.debug('Setting object count to %s' % len(objects)) + # prefer the actual listing stats over the potentially outdated + # root stats. This condition is only likely when a sharded + # container is shrinking or in tests; typically a sharded container + # will have more than CONTAINER_LISTING_LIMIT objects so any + # unconstrained listing will be capped by the limit and total + # object stats cannot therefore be inferred from the listing. + resp.headers['X-Container-Object-Count'] = len(objects) + resp.headers['X-Container-Bytes-Used'] = sum( + [o['bytes'] for o in objects]) + return resp + @public @delay_denial @cors_validation diff --git a/test/unit/proxy/controllers/test_base.py b/test/unit/proxy/controllers/test_base.py index 60d17c9ec8..93d71f6288 100644 --- a/test/unit/proxy/controllers/test_base.py +++ b/test/unit/proxy/controllers/test_base.py @@ -14,6 +14,7 @@ # limitations under the License. 
import itertools +import json from collections import defaultdict import unittest import mock @@ -23,11 +24,14 @@ from swift.proxy.controllers.base import headers_to_container_info, \ Controller, GetOrHeadHandler, bytes_to_skip from swift.common.swob import Request, HTTPException, RESPONSE_REASONS from swift.common import exceptions -from swift.common.utils import split_path +from swift.common.utils import split_path, ShardRange, Timestamp from swift.common.header_key_dict import HeaderKeyDict from swift.common.http import is_success from swift.common.storage_policy import StoragePolicy, StoragePolicyCollection -from test.unit import fake_http_connect, FakeRing, FakeMemcache, PatchPolicies +from test.unit import ( + fake_http_connect, FakeRing, FakeMemcache, PatchPolicies, FakeLogger, + make_timestamp_iter, + mocked_http_conn) from swift.proxy import server as proxy_server from swift.common.request_helpers import ( get_sys_meta_prefix, get_object_transient_sysmeta @@ -172,7 +176,8 @@ class TestFuncs(unittest.TestCase): def setUp(self): self.app = proxy_server.Application(None, FakeMemcache(), account_ring=FakeRing(), - container_ring=FakeRing()) + container_ring=FakeRing(), + logger=FakeLogger()) def test_get_info_zero_recheck(self): mock_cache = mock.Mock() @@ -1030,3 +1035,146 @@ class TestFuncs(unittest.TestCase): # prime numbers self.assertEqual(bytes_to_skip(11, 7), 4) self.assertEqual(bytes_to_skip(97, 7873823), 55) + + def test_get_shard_ranges_for_container_get(self): + ts_iter = make_timestamp_iter() + shard_ranges = [dict(ShardRange( + '.sharded_a/sr%d' % i, next(ts_iter), '%d_lower' % i, + '%d_upper' % i, object_count=i, bytes_used=1024 * i, + meta_timestamp=next(ts_iter))) + for i in range(3)] + base = Controller(self.app) + req = Request.blank('/v1/a/c', method='GET') + resp_headers = {'X-Backend-Record-Type': 'shard'} + with mocked_http_conn( + 200, 200, body_iter=iter(['', json.dumps(shard_ranges)]), + headers=resp_headers + ) as fake_conn: + actual 
= base._get_shard_ranges(req, 'a', 'c') + + # account info + captured = fake_conn.requests + self.assertEqual('HEAD', captured[0]['method']) + self.assertEqual('a', captured[0]['path'][7:]) + # container GET + self.assertEqual('GET', captured[1]['method']) + self.assertEqual('a/c', captured[1]['path'][7:]) + self.assertEqual('format=json', captured[1]['qs']) + self.assertEqual( + 'shard', captured[1]['headers'].get('X-Backend-Record-Type')) + self.assertEqual(shard_ranges, [dict(pr) for pr in actual]) + self.assertFalse(self.app.logger.get_lines_for_level('error')) + + def test_get_shard_ranges_for_object_put(self): + ts_iter = make_timestamp_iter() + shard_ranges = [dict(ShardRange( + '.sharded_a/sr%d' % i, next(ts_iter), '%d_lower' % i, + '%d_upper' % i, object_count=i, bytes_used=1024 * i, + meta_timestamp=next(ts_iter))) + for i in range(3)] + base = Controller(self.app) + req = Request.blank('/v1/a/c/o', method='PUT') + resp_headers = {'X-Backend-Record-Type': 'shard'} + with mocked_http_conn( + 200, 200, body_iter=iter(['', json.dumps(shard_ranges[1:2])]), + headers=resp_headers + ) as fake_conn: + actual = base._get_shard_ranges(req, 'a', 'c', '1_test') + + # account info + captured = fake_conn.requests + self.assertEqual('HEAD', captured[0]['method']) + self.assertEqual('a', captured[0]['path'][7:]) + # container GET + self.assertEqual('GET', captured[1]['method']) + self.assertEqual('a/c', captured[1]['path'][7:]) + params = sorted(captured[1]['qs'].split('&')) + self.assertEqual( + ['format=json', 'includes=1_test'], params) + self.assertEqual( + 'shard', captured[1]['headers'].get('X-Backend-Record-Type')) + self.assertEqual(shard_ranges[1:2], [dict(pr) for pr in actual]) + self.assertFalse(self.app.logger.get_lines_for_level('error')) + + def _check_get_shard_ranges_bad_data(self, body): + base = Controller(self.app) + req = Request.blank('/v1/a/c/o', method='PUT') + # empty response + headers = {'X-Backend-Record-Type': 'shard'} + with 
mocked_http_conn(200, 200, body_iter=iter(['', body]), + headers=headers): + actual = base._get_shard_ranges(req, 'a', 'c', '1_test') + self.assertIsNone(actual) + lines = self.app.logger.get_lines_for_level('error') + return lines + + def test_get_shard_ranges_empty_body(self): + error_lines = self._check_get_shard_ranges_bad_data('') + self.assertIn('Problem with listing response', error_lines[0]) + self.assertIn('No JSON', error_lines[0]) + self.assertFalse(error_lines[1:]) + + def test_get_shard_ranges_not_a_list(self): + error_lines = self._check_get_shard_ranges_bad_data(json.dumps({})) + self.assertIn('Problem with listing response', error_lines[0]) + self.assertIn('not a list', error_lines[0]) + self.assertFalse(error_lines[1:]) + + def test_get_shard_ranges_key_missing(self): + error_lines = self._check_get_shard_ranges_bad_data(json.dumps([{}])) + self.assertIn('Failed to get shard ranges', error_lines[0]) + self.assertIn('KeyError', error_lines[0]) + self.assertFalse(error_lines[1:]) + + def test_get_shard_ranges_invalid_shard_range(self): + sr = ShardRange('a/c', Timestamp.now()) + bad_sr_data = dict(sr, name='bad_name') + error_lines = self._check_get_shard_ranges_bad_data( + json.dumps([bad_sr_data])) + self.assertIn('Failed to get shard ranges', error_lines[0]) + self.assertIn('ValueError', error_lines[0]) + self.assertFalse(error_lines[1:]) + + def test_get_shard_ranges_missing_record_type(self): + base = Controller(self.app) + req = Request.blank('/v1/a/c/o', method='PUT') + sr = ShardRange('a/c', Timestamp.now()) + body = json.dumps([dict(sr)]) + with mocked_http_conn( + 200, 200, body_iter=iter(['', body])): + actual = base._get_shard_ranges(req, 'a', 'c', '1_test') + self.assertIsNone(actual) + error_lines = self.app.logger.get_lines_for_level('error') + self.assertIn('Failed to get shard ranges', error_lines[0]) + self.assertIn('unexpected record type', error_lines[0]) + self.assertIn('/a/c', error_lines[0]) + self.assertFalse(error_lines[1:]) 
+ + def test_get_shard_ranges_wrong_record_type(self): + base = Controller(self.app) + req = Request.blank('/v1/a/c/o', method='PUT') + sr = ShardRange('a/c', Timestamp.now()) + body = json.dumps([dict(sr)]) + headers = {'X-Backend-Record-Type': 'object'} + with mocked_http_conn( + 200, 200, body_iter=iter(['', body]), + headers=headers): + actual = base._get_shard_ranges(req, 'a', 'c', '1_test') + self.assertIsNone(actual) + error_lines = self.app.logger.get_lines_for_level('error') + self.assertIn('Failed to get shard ranges', error_lines[0]) + self.assertIn('unexpected record type', error_lines[0]) + self.assertIn('/a/c', error_lines[0]) + self.assertFalse(error_lines[1:]) + + def test_get_shard_ranges_request_failed(self): + base = Controller(self.app) + req = Request.blank('/v1/a/c/o', method='PUT') + with mocked_http_conn(200, 404, 404, 404): + actual = base._get_shard_ranges(req, 'a', 'c', '1_test') + self.assertIsNone(actual) + self.assertFalse(self.app.logger.get_lines_for_level('error')) + warning_lines = self.app.logger.get_lines_for_level('warning') + self.assertIn('Failed to get container listing', warning_lines[0]) + self.assertIn('/a/c', warning_lines[0]) + self.assertFalse(warning_lines[1:]) diff --git a/test/unit/proxy/controllers/test_container.py b/test/unit/proxy/controllers/test_container.py index 03d53c2fde..e85e50362a 100644 --- a/test/unit/proxy/controllers/test_container.py +++ b/test/unit/proxy/controllers/test_container.py @@ -12,17 +12,24 @@ # implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import json import mock import socket import unittest from eventlet import Timeout +from six.moves import urllib +from swift.common.constraints import CONTAINER_LISTING_LIMIT from swift.common.swob import Request +from swift.common.utils import ShardRange, Timestamp from swift.proxy import server as proxy_server -from swift.proxy.controllers.base import headers_to_container_info, Controller -from test.unit import fake_http_connect, FakeRing, FakeMemcache +from swift.proxy.controllers.base import headers_to_container_info, Controller, \ + get_container_info +from test import annotate_failure +from test.unit import fake_http_connect, FakeRing, FakeMemcache, \ + make_timestamp_iter from swift.common.storage_policy import StoragePolicy from swift.common.request_helpers import get_sys_meta_prefix @@ -72,6 +79,7 @@ class TestContainerController(TestRingBase): new=FakeAccountInfoContainerController): return _orig_get_controller(*args, **kwargs) self.app.get_controller = wrapped_get_controller + self.ts_iter = make_timestamp_iter() def _make_callback_func(self, context): def callback(ipaddr, port, device, partition, method, path, @@ -329,6 +337,852 @@ class TestContainerController(TestRingBase): ] self._assert_responses('POST', POST_TEST_CASES) + def _make_shard_objects(self, shard_range): + lower = ord(shard_range.lower[0]) if shard_range.lower else ord('@') + upper = ord(shard_range.upper[0]) if shard_range.upper else ord('z') + + objects = [{'name': chr(i), 'bytes': i, 'hash': 'hash%s' % chr(i), + 'content_type': 'text/plain', 'deleted': 0, + 'last_modified': next(self.ts_iter).isoformat} + for i in range(lower + 1, upper + 1)] + return objects + + def _check_GET_shard_listing(self, mock_responses, expected_objects, + expected_requests, query_string='', + reverse=False): + # mock_responses is a list of tuples (status, json body, headers) + # expected objects is a list of dicts + # expected_requests is a list of tuples (path, hdrs dict, params dict) + + # sanity check 
that expected objects is name ordered with no repeats + def name(obj): + return obj.get('name', obj.get('subdir')) + + for (prev, next_) in zip(expected_objects, expected_objects[1:]): + if reverse: + self.assertGreater(name(prev), name(next_)) + else: + self.assertLess(name(prev), name(next_)) + container_path = '/v1/a/c' + query_string + codes = (resp[0] for resp in mock_responses) + bodies = iter([json.dumps(resp[1]) for resp in mock_responses]) + exp_headers = [resp[2] for resp in mock_responses] + request = Request.blank(container_path) + with mocked_http_conn( + *codes, body_iter=bodies, headers=exp_headers) as fake_conn: + resp = request.get_response(self.app) + for backend_req in fake_conn.requests: + self.assertEqual(request.headers['X-Trans-Id'], + backend_req['headers']['X-Trans-Id']) + self.assertTrue(backend_req['headers']['User-Agent'].startswith( + 'proxy-server')) + self.assertEqual(200, resp.status_int) + actual_objects = json.loads(resp.body) + self.assertEqual(len(expected_objects), len(actual_objects)) + self.assertEqual(expected_objects, actual_objects) + self.assertEqual(len(expected_requests), len(fake_conn.requests)) + for i, ((exp_path, exp_headers, exp_params), req) in enumerate( + zip(expected_requests, fake_conn.requests)): + with annotate_failure('Request check at index %d.' 
% i): + # strip off /sdx/0/ from path + self.assertEqual(exp_path, req['path'][7:]) + self.assertEqual( + dict(exp_params, format='json'), + dict(urllib.parse.parse_qsl(req['qs'], True))) + for k, v in exp_headers.items(): + self.assertIn(k, req['headers']) + self.assertEqual(v, req['headers'][k]) + self.assertNotIn('X-Backend-Override-Delete', req['headers']) + return resp + + def check_response(self, resp, root_resp_hdrs, expected_objects=None): + info_hdrs = dict(root_resp_hdrs) + if expected_objects is None: + # default is to expect whatever the root container sent + expected_obj_count = root_resp_hdrs['X-Container-Object-Count'] + expected_bytes_used = root_resp_hdrs['X-Container-Bytes-Used'] + else: + expected_bytes_used = sum([o['bytes'] for o in expected_objects]) + expected_obj_count = len(expected_objects) + info_hdrs['X-Container-Bytes-Used'] = expected_bytes_used + info_hdrs['X-Container-Object-Count'] = expected_obj_count + self.assertEqual(expected_bytes_used, + int(resp.headers['X-Container-Bytes-Used'])) + self.assertEqual(expected_obj_count, + int(resp.headers['X-Container-Object-Count'])) + self.assertEqual('sharded', resp.headers['X-Backend-Sharding-State']) + for k, v in root_resp_hdrs.items(): + if k.lower().startswith('x-container-meta'): + self.assertEqual(v, resp.headers[k]) + # check that info cache is correct for root container + info = get_container_info(resp.request.environ, self.app) + self.assertEqual(headers_to_container_info(info_hdrs), info) + + def test_GET_sharded_container(self): + shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', '')) + shard_ranges = [ + ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper) + for lower, upper in shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] 
for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + expected_objects = all_objects + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + # pretend root object stats are not yet updated + 'X-Container-Object-Count': num_all_objects - 1, + 'X-Container-Bytes-Used': size_all_objects - 1, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + # GET all objects + # include some failed responses + mock_responses = [ + # status, body, headers + (404, '', {}), + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 404 + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', limit=str(limit), + states='listing')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs, + expected_objects=expected_objects) + + # 
GET all objects - sharding, final shard range points back to root + root_range = ShardRange('a/c', Timestamp.now(), 'pie', '') + mock_responses = [ + # status, body, headers + (200, sr_dicts[:2] + [dict(root_range)], root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2], root_resp_hdrs) + ] + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', limit=str(limit), + states='listing')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), # 200 + (root_range.name, {'X-Backend-Record-Type': 'object'}, + dict(marker='p', end_marker='', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs, + expected_objects=expected_objects) + + # GET all objects in reverse + mock_responses = [ + # status, body, headers + (200, list(reversed(sr_dicts)), root_shard_resp_hdrs), + (200, list(reversed(sr_objs[2])), shard_resp_hdrs[2]), + (200, list(reversed(sr_objs[1])), shard_resp_hdrs[1]), + (200, list(reversed(sr_objs[0])), shard_resp_hdrs[0]), + ] + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', reverse='true')), + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='pie', reverse='true', + limit=str(limit), states='listing')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='q', end_marker='ham', states='listing', + reverse='true', limit=str(limit - len(sr_objs[2])))), # 200 + 
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='i', end_marker='', states='listing', reverse='true', + limit=str(limit - len(sr_objs[2] + sr_objs[1])))), # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, list(reversed(expected_objects)), + expected_requests, query_string='?reverse=true', reverse=True) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs, + expected_objects=expected_objects) + + # GET with limit param + limit = len(sr_objs[0]) + len(sr_objs[1]) + 1 + expected_objects = all_objects[:limit] + mock_responses = [ + (404, '', {}), + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2][:1], shard_resp_hdrs[2]) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(limit=str(limit), states='listing')), # 404 + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(limit=str(limit), states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) + ] + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?limit=%s' % limit) + self.check_response(resp, root_resp_hdrs) + + # GET with marker + marker = sr_objs[1][2]['name'] + first_included = len(sr_objs[0]) + 2 + limit = CONTAINER_LISTING_LIMIT + expected_objects = all_objects[first_included:] + mock_responses = [ + (404, '', {}), + (200, sr_dicts[1:], root_shard_resp_hdrs), + (404, '', {}), + (200, sr_objs[1][2:], 
shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(marker=marker, states='listing')), # 404 + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(marker=marker, states='listing')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 404 + dict(marker=marker, end_marker='pie\x00', states='listing', + limit=str(limit))), + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker=marker, end_marker='pie\x00', states='listing', + limit=str(limit))), + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[1][2:])))), + ] + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?marker=%s' % marker) + self.check_response(resp, root_resp_hdrs) + + # GET with end marker + end_marker = sr_objs[1][6]['name'] + first_excluded = len(sr_objs[0]) + 6 + expected_objects = all_objects[:first_excluded] + mock_responses = [ + (404, '', {}), + (200, sr_dicts[:2], root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (404, '', {}), + (200, sr_objs[1][:6], shard_resp_hdrs[1]) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(end_marker=end_marker, states='listing')), # 404 + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(end_marker=end_marker, states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 404 + dict(marker='h', end_marker=end_marker, states='listing', + limit=str(limit - len(sr_objs[0])))), + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker='h', end_marker=end_marker, states='listing', + limit=str(limit - len(sr_objs[0])))), + ] + resp = 
self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?end_marker=%s' % end_marker) + self.check_response(resp, root_resp_hdrs) + + # marker and end_marker and limit + limit = 2 + expected_objects = all_objects[first_included:first_excluded] + mock_responses = [ + (200, sr_dicts[1:2], root_shard_resp_hdrs), + (200, sr_objs[1][2:6], shard_resp_hdrs[1]) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', limit=str(limit), + marker=marker, end_marker=end_marker)), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker=marker, end_marker=end_marker, states='listing', + limit=str(limit))), + ] + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?marker=%s&end_marker=%s&limit=%s' + % (marker, end_marker, limit)) + self.check_response(resp, root_resp_hdrs) + + # reverse with marker, end_marker + expected_objects.reverse() + mock_responses = [ + (200, sr_dicts[1:2], root_shard_resp_hdrs), + (200, list(reversed(sr_objs[1][2:6])), shard_resp_hdrs[1]) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(marker=end_marker, reverse='true', end_marker=marker, + limit=str(limit), states='listing',)), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200 + dict(marker=end_marker, end_marker=marker, states='listing', + limit=str(limit), reverse='true')), + ] + self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?marker=%s&end_marker=%s&limit=%s&reverse=true' + % (end_marker, marker, limit), reverse=True) + self.check_response(resp, root_resp_hdrs) + + def test_GET_sharded_container_with_delimiter(self): + shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', '')) + shard_ranges = [ + ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper) + for lower, upper in shard_bounds] + sr_dicts = [dict(sr) 
for sr in shard_ranges] + shard_resp_hdrs = {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': 2, + 'X-Container-Bytes-Used': 4, + 'X-Backend-Storage-Policy-Index': 0} + + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + # pretend root object stats are not yet updated + 'X-Container-Object-Count': 6, + 'X-Container-Bytes-Used': 12, + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + sr_0_obj = {'name': 'apple', + 'bytes': 1, + 'hash': 'hash', + 'content_type': 'text/plain', + 'deleted': 0, + 'last_modified': next(self.ts_iter).isoformat} + sr_2_obj = {'name': 'pumpkin', + 'bytes': 1, + 'hash': 'hash', + 'content_type': 'text/plain', + 'deleted': 0, + 'last_modified': next(self.ts_iter).isoformat} + subdir = {'subdir': 'ha/'} + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, [sr_0_obj, subdir], shard_resp_hdrs), + (200, [], shard_resp_hdrs), + (200, [sr_2_obj], shard_resp_hdrs) + ] + expected_requests = [ + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', delimiter='/')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', limit=str(limit), + states='listing', delimiter='/')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='ha/', end_marker='pie\x00', states='listing', + limit=str(limit - 2), delimiter='/')), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='ha/', end_marker='', states='listing', + limit=str(limit - 2), delimiter='/')) # 200 + ] + + expected_objects = [sr_0_obj, subdir, sr_2_obj] + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?delimiter=/') + self.check_response(resp, root_resp_hdrs) + + def test_GET_sharded_container_overlapping_shards(self): + # 
verify ordered listing even if unexpected overlapping shard ranges + shard_bounds = (('', 'ham', ShardRange.CLEAVED), + ('', 'pie', ShardRange.ACTIVE), + ('lemon', '', ShardRange.ACTIVE)) + shard_ranges = [ + ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper, + state=state) + for lower, upper, state in shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + # pretend root object stats are not yet updated + 'X-Container-Object-Count': num_all_objects - 1, + 'X-Container-Bytes-Used': size_all_objects - 1, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + # forwards listing + + # expect subset of second shard range + objs_1 = [o for o in sr_objs[1] if o['name'] > sr_objs[0][-1]['name']] + # expect subset of third shard range + objs_2 = [o for o in sr_objs[2] if o['name'] > sr_objs[1][-1]['name']] + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, objs_1, shard_resp_hdrs[1]), + (200, objs_2, shard_resp_hdrs[2]) + ] + # NB marker always advances to last object name + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + 
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + objs_1)))) # 200 + ] + + expected_objects = sr_objs[0] + objs_1 + objs_2 + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs, + expected_objects=expected_objects) + + # reverse listing + + # expect subset of third shard range + objs_0 = [o for o in sr_objs[0] if o['name'] < sr_objs[1][0]['name']] + # expect subset of second shard range + objs_1 = [o for o in sr_objs[1] if o['name'] < sr_objs[2][0]['name']] + mock_responses = [ + # status, body, headers + (200, list(reversed(sr_dicts)), root_shard_resp_hdrs), + (200, list(reversed(sr_objs[2])), shard_resp_hdrs[2]), + (200, list(reversed(objs_1)), shard_resp_hdrs[1]), + (200, list(reversed(objs_0)), shard_resp_hdrs[0]), + ] + # NB marker always advances to last object name + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', reverse='true')), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='lemon', states='listing', + limit=str(limit), + reverse='true')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='m', end_marker='', reverse='true', states='listing', + limit=str(limit - len(sr_objs[2])))), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='A', end_marker='', reverse='true', states='listing', + limit=str(limit - len(sr_objs[2] + objs_1)))) # 
200 + ] + + expected_objects = list(reversed(objs_0 + objs_1 + sr_objs[2])) + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests, + query_string='?reverse=true', reverse=True) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs, + expected_objects=expected_objects) + + def test_GET_sharded_container_gap_in_shards(self): + # verify ordered listing even if unexpected gap between shard ranges + shard_bounds = (('', 'ham'), ('onion', 'pie'), ('rhubarb', '')) + shard_ranges = [ + ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper) + for lower, upper in shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + 'X-Container-Object-Count': num_all_objects, + 'X-Container-Bytes-Used': size_all_objects, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + # NB marker always advances to last object name + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + 
dict(states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, all_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs) + + def test_GET_sharded_container_empty_shard(self): + # verify ordered listing when a shard is empty + shard_bounds = (('', 'ham'), ('ham', 'pie'), ('lemon', '')) + shard_ranges = [ + ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper) + for lower, upper in shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + # empty second shard range + sr_objs[1] = [] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + 'X-Container-Object-Count': num_all_objects, + 'X-Container-Bytes-Used': size_all_objects, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 
'shard' + + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + # NB marker always advances to last object name + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, all_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs) + + # marker in empty second range + mock_responses = [ + # status, body, headers + (200, sr_dicts[1:], root_shard_resp_hdrs), + (200, sr_objs[1], shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + # NB marker unchanged when getting from third range + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', marker='koolaid')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='koolaid', end_marker='pie\x00', states='listing', + limit=str(limit))), # 200 + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='koolaid', end_marker='', states='listing', + limit=str(limit))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, sr_objs[2], expected_requests, + query_string='?marker=koolaid') + # root object count will overridden by actual length of listing + 
self.check_response(resp, root_resp_hdrs) + + # marker in empty second range, reverse + mock_responses = [ + # status, body, headers + (200, list(reversed(sr_dicts[:2])), root_shard_resp_hdrs), + (200, list(reversed(sr_objs[1])), shard_resp_hdrs[1]), + (200, list(reversed(sr_objs[0])), shard_resp_hdrs[2]) + ] + # NB marker unchanged when getting from first range + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing', marker='koolaid', reverse='true')), # 200 + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='koolaid', end_marker='ham', reverse='true', + states='listing', limit=str(limit))), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='koolaid', end_marker='', reverse='true', + states='listing', limit=str(limit))) # 200 + ] + + resp = self._check_GET_shard_listing( + mock_responses, list(reversed(sr_objs[0])), expected_requests, + query_string='?marker=koolaid&reverse=true', reverse=True) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs) + + def _check_GET_sharded_container_shard_error(self, error): + # verify ordered listing when a shard is empty + shard_bounds = (('', 'ham'), ('ham', 'pie'), ('lemon', '')) + shard_ranges = [ + ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper) + for lower, upper in shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + # empty second shard range + sr_objs[1] = [] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = 
sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + 'X-Container-Object-Count': num_all_objects, + 'X-Container-Bytes-Used': size_all_objects, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0])] + \ + [(error, [], {})] * 2 * self.CONTAINER_REPLICAS + \ + [(200, sr_objs[2], shard_resp_hdrs[2])] + + # NB marker always advances to last object name + expected_requests = [ + # path, headers, params + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit)))] \ + + [(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))) + ] * 2 * self.CONTAINER_REPLICAS \ + + [(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1]))))] + + resp = self._check_GET_shard_listing( + mock_responses, all_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs) + + def test_GET_sharded_container_shard_errors(self): + self._check_GET_sharded_container_shard_error(404) + self._check_GET_sharded_container_shard_error(500) + + def test_GET_sharded_container_sharding_shard(self): + # one shard is in process of sharding + shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', '')) + shard_ranges = [ + ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper) + for lower, upper in 
shard_bounds] + sr_dicts = [dict(sr) for sr in shard_ranges] + sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges] + shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(3)] + shard_1_shard_resp_hdrs = dict(shard_resp_hdrs[1]) + shard_1_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + # second shard is sharding and has cleaved two out of three sub shards + shard_resp_hdrs[1]['X-Backend-Sharding-State'] = 'sharding' + sub_shard_bounds = (('ham', 'juice'), ('juice', 'lemon')) + sub_shard_ranges = [ + ShardRange('a/c_sub_' + upper, Timestamp.now(), lower, upper) + for lower, upper in sub_shard_bounds] + sub_sr_dicts = [dict(sr) for sr in sub_shard_ranges] + sub_sr_objs = [self._make_shard_objects(sr) for sr in sub_shard_ranges] + sub_shard_resp_hdrs = [ + {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': len(sub_sr_objs[i]), + 'X-Container-Bytes-Used': + sum([obj['bytes'] for obj in sub_sr_objs[i]]), + 'X-Container-Meta-Flavour': 'flavour%d' % i, + 'X-Backend-Storage-Policy-Index': 0} + for i in range(2)] + + all_objects = [] + for objects in sr_objs: + all_objects.extend(objects) + size_all_objects = sum([obj['bytes'] for obj in all_objects]) + num_all_objects = len(all_objects) + limit = CONTAINER_LISTING_LIMIT + root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded', + 'X-Container-Object-Count': num_all_objects, + 'X-Container-Bytes-Used': size_all_objects, + 'X-Container-Meta-Flavour': 'peach', + 'X-Backend-Storage-Policy-Index': 0} + root_shard_resp_hdrs = dict(root_resp_hdrs) + root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard' + + mock_responses = [ + # status, body, headers + (200, sr_dicts, root_shard_resp_hdrs), + (200, sr_objs[0], shard_resp_hdrs[0]), + (200, sub_sr_dicts + 
[sr_dicts[1]], shard_1_shard_resp_hdrs), + (200, sub_sr_objs[0], sub_shard_resp_hdrs[0]), + (200, sub_sr_objs[1], sub_shard_resp_hdrs[1]), + (200, sr_objs[1][len(sub_sr_objs[0] + sub_sr_objs[1]):], + shard_resp_hdrs[1]), + (200, sr_objs[2], shard_resp_hdrs[2]) + ] + # NB marker always advances to last object name + expected_requests = [ + # get root shard ranges + ('a/c', {'X-Backend-Record-Type': 'auto'}, + dict(states='listing')), # 200 + # get first shard objects + (shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='', end_marker='ham\x00', states='listing', + limit=str(limit))), # 200 + # get second shard sub-shard ranges + (shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='pie\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), + # get first sub-shard objects + (sub_shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='h', end_marker='juice\x00', states='listing', + limit=str(limit - len(sr_objs[0])))), + # get second sub-shard objects + (sub_shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='j', end_marker='lemon\x00', states='listing', + limit=str(limit - len(sr_objs[0] + sub_sr_objs[0])))), + # get remainder of first shard objects + (shard_ranges[1].name, {'X-Backend-Record-Type': 'object'}, + dict(marker='l', end_marker='pie\x00', + limit=str(limit - len(sr_objs[0] + sub_sr_objs[0] + + sub_sr_objs[1])))), # 200 + # get third shard objects + (shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, + dict(marker='p', end_marker='', states='listing', + limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200 + ] + expected_objects = ( + sr_objs[0] + sub_sr_objs[0] + sub_sr_objs[1] + + sr_objs[1][len(sub_sr_objs[0] + sub_sr_objs[1]):] + sr_objs[2]) + resp = self._check_GET_shard_listing( + mock_responses, expected_objects, expected_requests) + # root object count will overridden by actual length of listing + self.check_response(resp, root_resp_hdrs) + 
@patch_policies( [StoragePolicy(0, 'zero', True, object_ring=FakeRing(replicas=4))]) From 4a3efe61a978b9d7adeabd556ce4a3820e6af555 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Wed, 2 May 2018 10:21:11 +0100 Subject: [PATCH 7/9] Redirect object updates to shard containers Enable the proxy to fetch a shard container location from the container server in order to redirect an object update to the shard. Enable the container server to redirect object updates to shard containers. Enable object updater to accept redirection of an object update. Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Change-Id: I6ff85827eecdea746b3626c0d401f68139cce19d --- swift/common/utils.py | 24 ++ swift/container/server.py | 51 +++- swift/obj/server.py | 70 ++++- swift/obj/updater.py | 165 ++++++---- swift/proxy/controllers/obj.py | 39 ++- test/unit/common/test_utils.py | 45 +++ test/unit/container/test_server.py | 195 +++++++++++- test/unit/obj/test_server.py | 225 ++++++++++++-- test/unit/obj/test_updater.py | 466 ++++++++++++++++++++++++++++- test/unit/proxy/test_server.py | 266 +++++++++++----- 10 files changed, 1351 insertions(+), 195 deletions(-) diff --git a/swift/common/utils.py b/swift/common/utils.py index 40c2cb7a8f..4a1c6e3911 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -5302,6 +5302,30 @@ def distribute_evenly(items, num_buckets): return out +def get_redirect_data(response): + """ + Extract a redirect location from a response's headers. 
+ + :param response: a response + :return: a tuple of (path, Timestamp) if a Location header is found, + otherwise None + :raises ValueError: if the Location header is found but an + X-Backend-Redirect-Timestamp is not found, or if there is a problem + with the format of either header + """ + headers = HeaderKeyDict(response.getheaders()) + if 'Location' not in headers: + return None + location = urlparse(headers['Location']).path + account, container, _junk = split_path(location, 2, 3, True) + timestamp_val = headers.get('X-Backend-Redirect-Timestamp') + try: + timestamp = Timestamp(timestamp_val) + except (TypeError, ValueError): + raise ValueError('Invalid timestamp value: %s' % timestamp_val) + return '%s/%s' % (account, container), timestamp + + def parse_db_filename(filename): """ Splits a db filename into three parts: the hash, the epoch, and the diff --git a/swift/container/server.py b/swift/container/server.py index f8e830cb59..48a8d2c2e9 100644 --- a/swift/container/server.py +++ b/swift/container/server.py @@ -25,7 +25,7 @@ from eventlet import Timeout import swift.common.db from swift.container.sync_store import ContainerSyncStore from swift.container.backend import ContainerBroker, DATADIR, \ - RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED + RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, SHARD_UPDATE_STATES from swift.container.replicator import ContainerReplicatorRpc from swift.common.db import DatabaseAlreadyExists from swift.common.container_sync_realms import ContainerSyncRealms @@ -34,8 +34,7 @@ from swift.common.request_helpers import get_param, \ from swift.common.utils import get_logger, hash_path, public, \ Timestamp, storage_directory, validate_sync_to, \ config_true_value, timing_stats, replication, \ - override_bytes_from_content_type, get_log_line, ShardRange, \ - list_from_csv + override_bytes_from_content_type, get_log_line, ShardRange, list_from_csv from swift.common.constraints import valid_timestamp, check_utf8, check_drive from 
swift.common import constraints @@ -49,7 +48,7 @@ from swift.common.header_key_dict import HeaderKeyDict from swift.common.swob import HTTPAccepted, HTTPBadRequest, HTTPConflict, \ HTTPCreated, HTTPInternalServerError, HTTPNoContent, HTTPNotFound, \ HTTPPreconditionFailed, HTTPMethodNotAllowed, Request, Response, \ - HTTPInsufficientStorage, HTTPException + HTTPInsufficientStorage, HTTPException, HTTPMovedPermanently def gen_resp_headers(info, is_deleted=False): @@ -265,6 +264,40 @@ class ContainerController(BaseStorageServer): self.logger.exception('Failed to update sync_store %s during %s' % (broker.db_file, method)) + def _redirect_to_shard(self, req, broker, obj_name): + """ + If the request indicates that it can accept a redirection, look for a + shard range that contains ``obj_name`` and if one exists return a + HTTPMovedPermanently response. + + :param req: an instance of :class:`~swift.common.swob.Request` + :param broker: a container broker + :param obj_name: an object name + :return: an instance of :class:`swift.common.swob.HTTPMovedPermanently` + if a shard range exists for the given ``obj_name``, otherwise None. + """ + if not config_true_value( + req.headers.get('x-backend-accept-redirect', False)): + return None + + shard_ranges = broker.get_shard_ranges( + includes=obj_name, states=SHARD_UPDATE_STATES) + if not shard_ranges: + return None + + # note: obj_name may be included in both a created sub-shard and its + # sharding parent. get_shard_ranges will return the created sub-shard + # in preference to the parent, which is the desired result. 
+ containing_range = shard_ranges[0] + location = "/%s/%s" % (containing_range.name, obj_name) + headers = {'Location': location, + 'X-Backend-Redirect-Timestamp': + containing_range.timestamp.internal} + + # we do not want the host added to the location + req.environ['swift.leave_relative_location'] = True + return HTTPMovedPermanently(headers=headers, request=req) + @public @timing_stats() def DELETE(self, req): @@ -287,6 +320,11 @@ class ContainerController(BaseStorageServer): if not os.path.exists(broker.db_file): return HTTPNotFound() if obj: # delete object + # redirect if a shard range exists for the object name + redirect = self._redirect_to_shard(req, broker, obj) + if redirect: + return redirect + broker.delete_object(obj, req.headers.get('x-timestamp'), obj_policy_index) return HTTPNoContent(request=req) @@ -404,6 +442,11 @@ class ContainerController(BaseStorageServer): obj_policy_index = requested_policy_index or 0 self._maybe_autocreate(broker, req_timestamp, account, obj_policy_index) + # redirect if a shard exists for this object name + response = self._redirect_to_shard(req, broker, obj) + if response: + return response + broker.put_object(obj, req_timestamp.internal, int(req.headers['x-size']), req.headers['x-content-type'], diff --git a/swift/obj/server.py b/swift/obj/server.py index 36bd758d3f..2f584bb319 100644 --- a/swift/obj/server.py +++ b/swift/obj/server.py @@ -35,7 +35,7 @@ from swift.common.utils import public, get_logger, \ normalize_delete_at_timestamp, get_log_line, Timestamp, \ get_expirer_container, parse_mime_headers, \ iter_multipart_mime_documents, extract_swift_bytes, safe_json_loads, \ - config_auto_int_value + config_auto_int_value, split_path, get_redirect_data from swift.common.bufferedhttp import http_connect from swift.common.constraints import check_object_creation, \ valid_timestamp, check_utf8 @@ -44,7 +44,7 @@ from swift.common.exceptions import ConnectionTimeout, DiskFileQuarantined, \ DiskFileDeviceUnavailable, 
DiskFileExpired, ChunkReadTimeout, \ ChunkReadError, DiskFileXattrNotSupported from swift.obj import ssync_receiver -from swift.common.http import is_success +from swift.common.http import is_success, HTTP_MOVED_PERMANENTLY from swift.common.base_storage_server import BaseStorageServer from swift.common.header_key_dict import HeaderKeyDict from swift.common.request_helpers import get_name_and_placement, \ @@ -245,7 +245,7 @@ class ObjectController(BaseStorageServer): def async_update(self, op, account, container, obj, host, partition, contdevice, headers_out, objdevice, policy, - logger_thread_locals=None): + logger_thread_locals=None, container_path=None): """ Sends or saves an async update. @@ -263,11 +263,21 @@ class ObjectController(BaseStorageServer): :param logger_thread_locals: The thread local values to be set on the self.logger to retain transaction logging information. + :param container_path: optional path in the form `<account/container>` + to which the update should be sent. If given this path will be used + instead of constructing a path from the ``account`` and + ``container`` params. 
""" if logger_thread_locals: self.logger.thread_locals = logger_thread_locals headers_out['user-agent'] = 'object-server %s' % os.getpid() - full_path = '/%s/%s/%s' % (account, container, obj) + if container_path: + # use explicitly specified container path + full_path = '/%s/%s' % (container_path, obj) + else: + full_path = '/%s/%s/%s' % (account, container, obj) + + redirect_data = None if all([host, partition, contdevice]): try: with ConnectionTimeout(self.conn_timeout): @@ -277,15 +287,23 @@ class ObjectController(BaseStorageServer): with Timeout(self.node_timeout): response = conn.getresponse() response.read() - if is_success(response.status): - return - else: - self.logger.error(_( - 'ERROR Container update failed ' - '(saving for async update later): %(status)d ' - 'response from %(ip)s:%(port)s/%(dev)s'), - {'status': response.status, 'ip': ip, 'port': port, - 'dev': contdevice}) + if is_success(response.status): + return + + if response.status == HTTP_MOVED_PERMANENTLY: + try: + redirect_data = get_redirect_data(response) + except ValueError as err: + self.logger.error( + 'Container update failed for %r; problem with ' + 'redirect location: %s' % (obj, err)) + else: + self.logger.error(_( + 'ERROR Container update failed ' + '(saving for async update later): %(status)d ' + 'response from %(ip)s:%(port)s/%(dev)s'), + {'status': response.status, 'ip': ip, 'port': port, + 'dev': contdevice}) except (Exception, Timeout): self.logger.exception(_( 'ERROR container update failed with ' @@ -293,6 +311,13 @@ class ObjectController(BaseStorageServer): {'ip': ip, 'port': port, 'dev': contdevice}) data = {'op': op, 'account': account, 'container': container, 'obj': obj, 'headers': headers_out} + if redirect_data: + self.logger.debug( + 'Update to %(path)s redirected to %(redirect)s', + {'path': full_path, 'redirect': redirect_data[0]}) + container_path = redirect_data[0] + if container_path: + data['container_path'] = container_path timestamp = 
headers_out.get('x-meta-timestamp', headers_out.get('x-timestamp')) self._diskfile_router[policy].pickle_async_update( @@ -319,6 +344,7 @@ class ObjectController(BaseStorageServer): contdevices = [d.strip() for d in headers_in.get('X-Container-Device', '').split(',')] contpartition = headers_in.get('X-Container-Partition', '') + contpath = headers_in.get('X-Backend-Container-Path') if len(conthosts) != len(contdevices): # This shouldn't happen unless there's a bug in the proxy, @@ -331,6 +357,21 @@ class ObjectController(BaseStorageServer): 'devices': headers_in.get('X-Container-Device', '')}) return + if contpath: + try: + # TODO: this is very late in request handling to be validating + # a header - if we did *not* check and the header was bad + # presumably the update would fail and we would fall back to an + # async update to the root container, which might be best + # course of action rather than aborting update altogether? + split_path('/' + contpath, minsegs=2, maxsegs=2) + except ValueError: + self.logger.error( + "Invalid X-Backend-Container-Path, should be of the form " + "'account/container' but got %r." % contpath) + # fall back to updating root container + contpath = None + if contpartition: updates = zip(conthosts, contdevices) else: @@ -344,7 +385,8 @@ class ObjectController(BaseStorageServer): gt = spawn(self.async_update, op, account, container, obj, conthost, contpartition, contdevice, headers_out, objdevice, policy, - logger_thread_locals=self.logger.thread_locals) + logger_thread_locals=self.logger.thread_locals, + container_path=contpath) update_greenthreads.append(gt) # Wait a little bit to see if the container updates are successful. 
# If we immediately return after firing off the greenthread above, then diff --git a/swift/obj/updater.py b/swift/obj/updater.py index df21c01d7b..febb754ce9 100644 --- a/swift/obj/updater.py +++ b/swift/obj/updater.py @@ -28,12 +28,14 @@ from swift.common.constraints import check_drive from swift.common.exceptions import ConnectionTimeout from swift.common.ring import Ring from swift.common.utils import get_logger, renamer, write_pickle, \ - dump_recon_cache, config_true_value, ratelimit_sleep, eventlet_monkey_patch + dump_recon_cache, config_true_value, ratelimit_sleep, split_path, \ + eventlet_monkey_patch, get_redirect_data from swift.common.daemon import Daemon from swift.common.header_key_dict import HeaderKeyDict from swift.common.storage_policy import split_policy_string, PolicyError from swift.obj.diskfile import get_tmp_dir, ASYNCDIR_BASE -from swift.common.http import is_success, HTTP_INTERNAL_SERVER_ERROR +from swift.common.http import is_success, HTTP_INTERNAL_SERVER_ERROR, \ + HTTP_MOVED_PERMANENTLY class SweepStats(object): @@ -41,12 +43,13 @@ class SweepStats(object): Stats bucket for an update sweep """ def __init__(self, errors=0, failures=0, quarantines=0, successes=0, - unlinks=0): + unlinks=0, redirects=0): self.errors = errors self.failures = failures self.quarantines = quarantines self.successes = successes self.unlinks = unlinks + self.redirects = redirects def copy(self): return type(self)(self.errors, self.failures, self.quarantines, @@ -57,7 +60,8 @@ class SweepStats(object): self.failures - other.failures, self.quarantines - other.quarantines, self.successes - other.successes, - self.unlinks - other.unlinks) + self.unlinks - other.unlinks, + self.redirects - other.redirects) def reset(self): self.errors = 0 @@ -65,6 +69,7 @@ class SweepStats(object): self.quarantines = 0 self.successes = 0 self.unlinks = 0 + self.redirects = 0 def __str__(self): keys = ( @@ -73,6 +78,7 @@ class SweepStats(object): (self.quarantines, 'quarantines'), 
(self.unlinks, 'unlinks'), (self.errors, 'errors'), + (self.redirects, 'redirects'), ) return ', '.join('%d %s' % pair for pair in keys) @@ -279,7 +285,8 @@ class ObjectUpdater(Daemon): 'in %(elapsed).02fs seconds:, ' '%(successes)d successes, %(failures)d failures, ' '%(quarantines)d quarantines, ' - '%(unlinks)d unlinks, %(errors)d errors ' + '%(unlinks)d unlinks, %(errors)d errors, ' + '%(redirects)d redirects ' '(pid: %(pid)d)'), {'device': device, 'elapsed': time.time() - start_time, @@ -288,7 +295,8 @@ class ObjectUpdater(Daemon): 'failures': sweep_totals.failures, 'quarantines': sweep_totals.quarantines, 'unlinks': sweep_totals.unlinks, - 'errors': sweep_totals.errors}) + 'errors': sweep_totals.errors, + 'redirects': sweep_totals.redirects}) def process_object_update(self, update_path, device, policy): """ @@ -309,44 +317,83 @@ class ObjectUpdater(Daemon): os.path.basename(update_path)) renamer(update_path, target_path, fsync=False) return - successes = update.get('successes', []) - part, nodes = self.get_container_ring().get_nodes( - update['account'], update['container']) - obj = '/%s/%s/%s' % \ - (update['account'], update['container'], update['obj']) - headers_out = HeaderKeyDict(update['headers']) - headers_out['user-agent'] = 'object-updater %s' % os.getpid() - headers_out.setdefault('X-Backend-Storage-Policy-Index', - str(int(policy))) - events = [spawn(self.object_update, - node, part, update['op'], obj, headers_out) - for node in nodes if node['id'] not in successes] - success = True - new_successes = False - for event in events: - event_success, node_id = event.wait() - if event_success is True: - successes.append(node_id) - new_successes = True + + def do_update(): + successes = update.get('successes', []) + headers_out = HeaderKeyDict(update['headers'].copy()) + headers_out['user-agent'] = 'object-updater %s' % os.getpid() + headers_out.setdefault('X-Backend-Storage-Policy-Index', + str(int(policy))) + 
headers_out.setdefault('X-Backend-Accept-Redirect', 'true') + container_path = update.get('container_path') + if container_path: + acct, cont = split_path('/' + container_path, minsegs=2) else: - success = False - if success: - self.stats.successes += 1 - self.logger.increment('successes') - self.logger.debug('Update sent for %(obj)s %(path)s', - {'obj': obj, 'path': update_path}) - self.stats.unlinks += 1 - self.logger.increment('unlinks') - os.unlink(update_path) - else: - self.stats.failures += 1 - self.logger.increment('failures') - self.logger.debug('Update failed for %(obj)s %(path)s', - {'obj': obj, 'path': update_path}) - if new_successes: - update['successes'] = successes - write_pickle(update, update_path, os.path.join( - device, get_tmp_dir(policy))) + acct, cont = update['account'], update['container'] + part, nodes = self.get_container_ring().get_nodes(acct, cont) + obj = '/%s/%s/%s' % (acct, cont, update['obj']) + events = [spawn(self.object_update, + node, part, update['op'], obj, headers_out) + for node in nodes if node['id'] not in successes] + success = True + new_successes = rewrite_pickle = False + redirect = None + redirects = set() + for event in events: + event_success, node_id, redirect = event.wait() + if event_success is True: + successes.append(node_id) + new_successes = True + else: + success = False + if redirect: + redirects.add(redirect) + + if success: + self.stats.successes += 1 + self.logger.increment('successes') + self.logger.debug('Update sent for %(obj)s %(path)s', + {'obj': obj, 'path': update_path}) + self.stats.unlinks += 1 + self.logger.increment('unlinks') + os.unlink(update_path) + elif redirects: + # erase any previous successes + update.pop('successes', None) + redirect = max(redirects, key=lambda x: x[-1])[0] + redirect_history = update.setdefault('redirect_history', []) + if redirect in redirect_history: + # force next update to be sent to root, reset history + update['container_path'] = None + 
update['redirect_history'] = [] + else: + update['container_path'] = redirect + redirect_history.append(redirect) + self.stats.redirects += 1 + self.logger.increment("redirects") + self.logger.debug( + 'Update redirected for %(obj)s %(path)s to %(shard)s', + {'obj': obj, 'path': update_path, + 'shard': update['container_path']}) + rewrite_pickle = True + else: + self.stats.failures += 1 + self.logger.increment('failures') + self.logger.debug('Update failed for %(obj)s %(path)s', + {'obj': obj, 'path': update_path}) + if new_successes: + update['successes'] = successes + rewrite_pickle = True + + return rewrite_pickle, redirect + + rewrite_pickle, redirect = do_update() + if redirect: + # make one immediate retry to the redirect location + rewrite_pickle, redirect = do_update() + if rewrite_pickle: + write_pickle(update, update_path, os.path.join( + device, get_tmp_dir(policy))) def object_update(self, node, part, op, obj, headers_out): """ @@ -357,7 +404,12 @@ class ObjectUpdater(Daemon): :param op: operation performed (ex: 'PUT' or 'DELETE') :param obj: object name being updated :param headers_out: headers to send with the update + :return: a tuple of (``success``, ``node_id``, ``redirect``) + where ``success`` is True if the update succeeded, ``node_id`` is + the id of the node updated and ``redirect`` is either None or a + tuple of (a path, a Timestamp). 
""" + redirect = None try: with ConnectionTimeout(self.conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], @@ -365,15 +417,24 @@ class ObjectUpdater(Daemon): with Timeout(self.node_timeout): resp = conn.getresponse() resp.read() - success = is_success(resp.status) - if not success: - self.logger.debug( - _('Error code %(status)d is returned from remote ' - 'server %(ip)s: %(port)s / %(device)s'), - {'status': resp.status, 'ip': node['ip'], - 'port': node['port'], 'device': node['device']}) - return (success, node['id']) + + if resp.status == HTTP_MOVED_PERMANENTLY: + try: + redirect = get_redirect_data(resp) + except ValueError as err: + self.logger.error( + 'Container update failed for %r; problem with ' + 'redirect location: %s' % (obj, err)) + + success = is_success(resp.status) + if not success: + self.logger.debug( + _('Error code %(status)d is returned from remote ' + 'server %(ip)s: %(port)s / %(device)s'), + {'status': resp.status, 'ip': node['ip'], + 'port': node['port'], 'device': node['device']}) + return success, node['id'], redirect except (Exception, Timeout): self.logger.exception(_('ERROR with remote server ' '%(ip)s:%(port)s/%(device)s'), node) - return HTTP_INTERNAL_SERVER_ERROR, node['id'] + return HTTP_INTERNAL_SERVER_ERROR, node['id'], redirect diff --git a/swift/proxy/controllers/obj.py b/swift/proxy/controllers/obj.py index d8aadf7935..7a41ef3c53 100644 --- a/swift/proxy/controllers/obj.py +++ b/swift/proxy/controllers/obj.py @@ -266,6 +266,20 @@ class BaseObjectController(Controller): """Handler for HTTP HEAD requests.""" return self.GETorHEAD(req) + def _get_update_target(self, req, container_info): + # find the sharded container to which we'll send the update + db_state = container_info.get('sharding_state', 'unsharded') + if db_state in ('sharded', 'sharding'): + shard_ranges = self._get_shard_ranges( + req, self.account_name, self.container_name, + includes=self.object_name, states='updating') + if shard_ranges: 
+ partition, nodes = self.app.container_ring.get_nodes( + shard_ranges[0].account, shard_ranges[0].container) + return partition, nodes, shard_ranges[0].name + + return container_info['partition'], container_info['nodes'], None + @public @cors_validation @delay_denial @@ -273,8 +287,8 @@ class BaseObjectController(Controller): """HTTP POST request handler.""" container_info = self.container_info( self.account_name, self.container_name, req) - container_partition = container_info['partition'] - container_nodes = container_info['nodes'] + container_partition, container_nodes, container_path = \ + self._get_update_target(req, container_info) req.acl = container_info['write_acl'] if 'swift.authorize' in req.environ: aresp = req.environ['swift.authorize'](req) @@ -304,13 +318,14 @@ class BaseObjectController(Controller): headers = self._backend_requests( req, len(nodes), container_partition, container_nodes, - delete_at_container, delete_at_part, delete_at_nodes) + delete_at_container, delete_at_part, delete_at_nodes, + container_path=container_path) return self._post_object(req, obj_ring, partition, headers) def _backend_requests(self, req, n_outgoing, container_partition, containers, delete_at_container=None, delete_at_partition=None, - delete_at_nodes=None): + delete_at_nodes=None, container_path=None): policy_index = req.headers['X-Backend-Storage-Policy-Index'] policy = POLICIES.get_by_index(policy_index) headers = [self.generate_request_headers(req, additional=req.headers) @@ -324,6 +339,8 @@ class BaseObjectController(Controller): headers[index]['X-Container-Device'] = csv_append( headers[index].get('X-Container-Device'), container['device']) + if container_path: + headers[index]['X-Backend-Container-Path'] = container_path def set_delete_at_headers(index, delete_at_node): headers[index]['X-Delete-At-Container'] = delete_at_container @@ -752,8 +769,8 @@ class BaseObjectController(Controller): policy_index = req.headers.get('X-Backend-Storage-Policy-Index', 
container_info['storage_policy']) obj_ring = self.app.get_object_ring(policy_index) - container_nodes = container_info['nodes'] - container_partition = container_info['partition'] + container_partition, container_nodes, container_path = \ + self._get_update_target(req, container_info) partition, nodes = obj_ring.get_nodes( self.account_name, self.container_name, self.object_name) @@ -800,7 +817,8 @@ class BaseObjectController(Controller): # add special headers to be handled by storage nodes outgoing_headers = self._backend_requests( req, len(nodes), container_partition, container_nodes, - delete_at_container, delete_at_part, delete_at_nodes) + delete_at_container, delete_at_part, delete_at_nodes, + container_path=container_path) # send object to storage nodes resp = self._store_object( @@ -823,8 +841,8 @@ class BaseObjectController(Controller): next_part_power = getattr(obj_ring, 'next_part_power', None) if next_part_power: req.headers['X-Backend-Next-Part-Power'] = next_part_power - container_partition = container_info['partition'] - container_nodes = container_info['nodes'] + container_partition, container_nodes, container_path = \ + self._get_update_target(req, container_info) req.acl = container_info['write_acl'] req.environ['swift_sync_key'] = container_info['sync_key'] if 'swift.authorize' in req.environ: @@ -851,7 +869,8 @@ class BaseObjectController(Controller): node_count += local_handoffs headers = self._backend_requests( - req, node_count, container_partition, container_nodes) + req, node_count, container_partition, container_nodes, + container_path=container_path) return self._delete_object(req, obj_ring, partition, headers) diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index 33a437262a..bfb83bf871 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -4273,6 +4273,51 @@ cluster_dfw1 = http://dfw1.host/v1/ self.assertEqual([], utils.get_db_files(path_3)) self.assertEqual([], 
utils.get_db_files('/path/to/nowhere')) + def test_get_redirect_data(self): + ts_now = utils.Timestamp.now() + headers = {'X-Backend-Redirect-Timestamp': ts_now.internal} + response = FakeResponse(200, headers, '') + self.assertIsNone(utils.get_redirect_data(response)) + + headers = {'Location': '/a/c/o', + 'X-Backend-Redirect-Timestamp': ts_now.internal} + response = FakeResponse(200, headers, '') + path, ts = utils.get_redirect_data(response) + self.assertEqual('a/c', path) + self.assertEqual(ts_now, ts) + + headers = {'Location': '/a/c', + 'X-Backend-Redirect-Timestamp': ts_now.internal} + response = FakeResponse(200, headers, '') + path, ts = utils.get_redirect_data(response) + self.assertEqual('a/c', path) + self.assertEqual(ts_now, ts) + + def do_test(headers): + response = FakeResponse(200, headers, '') + with self.assertRaises(ValueError) as cm: + utils.get_redirect_data(response) + return cm.exception + + exc = do_test({'Location': '/a', + 'X-Backend-Redirect-Timestamp': ts_now.internal}) + self.assertIn('Invalid path', str(exc)) + + exc = do_test({'Location': '', + 'X-Backend-Redirect-Timestamp': ts_now.internal}) + self.assertIn('Invalid path', str(exc)) + + exc = do_test({'Location': '/a/c', + 'X-Backend-Redirect-Timestamp': 'bad'}) + self.assertIn('Invalid timestamp', str(exc)) + + exc = do_test({'Location': '/a/c'}) + self.assertIn('Invalid timestamp', str(exc)) + + exc = do_test({'Location': '/a/c', + 'X-Backend-Redirect-Timestamp': '-1'}) + self.assertIn('Invalid timestamp', str(exc)) + class ResellerConfReader(unittest.TestCase): diff --git a/test/unit/container/test_server.py b/test/unit/container/test_server.py index e50f74901c..916f0e146d 100644 --- a/test/unit/container/test_server.py +++ b/test/unit/container/test_server.py @@ -47,7 +47,7 @@ from test.unit import fake_http_connect, debug_logger, mock_check_drive from swift.common.storage_policy import (POLICIES, StoragePolicy) from swift.common.request_helpers import get_sys_meta_prefix -from 
test import listen_zero +from test import listen_zero, annotate_failure from test.unit import patch_policies @@ -3079,6 +3079,199 @@ class TestContainerController(unittest.TestCase): assert_broker_rows(broker.get_brokers()[1], ['sharding', 'racing_update'], 3) + def _check_object_update_redirected_to_shard(self, method): + expected_status = 204 if method == 'DELETE' else 201 + broker = self.controller._get_container_broker('sda1', 'p', 'a', 'c') + ts_iter = make_timestamp_iter() + headers = {'X-Timestamp': next(ts_iter).normal} + req = Request.blank('/sda1/p/a/c', method='PUT', headers=headers) + self.assertEqual(201, req.get_response(self.controller).status_int) + + def do_update(name, timestamp=None, headers=None): + # Make a PUT request to container controller to update an object + timestamp = timestamp or next(ts_iter) + headers = headers or {} + headers.update({'X-Timestamp': timestamp.internal, + 'X-Size': 17, + 'X-Content-Type': 'text/plain', + 'X-Etag': 'fake etag'}) + req = Request.blank( + '/sda1/p/a/c/%s' % name, method=method, headers=headers) + self._update_object_put_headers(req) + return req.get_response(self.controller) + + def get_listing(broker_index): + # index -1 is always the freshest db + sub_broker = broker.get_brokers()[broker_index] + return sub_broker.get_objects() + + def assert_not_redirected(obj_name, timestamp=None, headers=None): + resp = do_update(obj_name, timestamp=timestamp, headers=headers) + self.assertEqual(expected_status, resp.status_int) + self.assertNotIn('Location', resp.headers) + self.assertNotIn('X-Backend-Redirect-Timestamp', resp.headers) + + def assert_redirected(obj_name, shard_range, headers=None): + resp = do_update(obj_name, headers=headers) + self.assertEqual(301, resp.status_int) + self.assertEqual('/%s/%s' % (shard_range.name, obj_name), + resp.headers['Location']) + self.assertEqual(shard_range.timestamp.internal, + resp.headers['X-Backend-Redirect-Timestamp']) + + # sanity check + ts_bashful_orig = 
next(ts_iter) + mocked_fn = 'swift.container.backend.ContainerBroker.get_shard_ranges' + with mock.patch(mocked_fn) as mock_get_shard_ranges: + assert_not_redirected('bashful', ts_bashful_orig) + mock_get_shard_ranges.assert_not_called() + + shard_ranges = { + 'dopey': ShardRange( + '.sharded_a/sr_dopey', next(ts_iter), '', 'dopey'), + 'happy': ShardRange( + '.sharded_a/sr_happy', next(ts_iter), 'dopey', 'happy'), + '': ShardRange('.sharded_a/sr_', next(ts_iter), 'happy', '') + } + # start with only the middle shard range + self._put_shard_range(shard_ranges['happy']) + + # db not yet sharding but shard ranges exist + sr_happy = shard_ranges['happy'] + redirect_states = ( + ShardRange.CREATED, ShardRange.CLEAVED, ShardRange.ACTIVE, + ShardRange.SHARDING) + headers = {'X-Backend-Accept-Redirect': 'true'} + for state in ShardRange.STATES: + self.assertTrue( + sr_happy.update_state(state, + state_timestamp=next(ts_iter))) + self._put_shard_range(sr_happy) + with annotate_failure(state): + obj_name = 'grumpy%s' % state + if state in redirect_states: + assert_redirected(obj_name, sr_happy, headers=headers) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + else: + assert_not_redirected(obj_name, headers=headers) + self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + obj_name = 'grumpy%s_no_header' % state + with mock.patch(mocked_fn) as mock_get_shard_ranges: + assert_not_redirected(obj_name) + mock_get_shard_ranges.assert_not_called() + self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + + # set broker to sharding state + broker.enable_sharding(next(ts_iter)) + self.assertTrue(broker.set_sharding_state()) + for state in ShardRange.STATES: + self.assertTrue( + sr_happy.update_state(state, + state_timestamp=next(ts_iter))) + self._put_shard_range(sr_happy) + with annotate_failure(state): + obj_name = 'grumpier%s' % state + if state in redirect_states: + assert_redirected(obj_name, sr_happy, headers=headers) + 
self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + else: + assert_not_redirected(obj_name, headers=headers) + # update goes to fresh db, misplaced + self.assertIn( + obj_name, [obj['name'] for obj in get_listing(-1)]) + self.assertNotIn( + obj_name, [obj['name'] for obj in get_listing(0)]) + obj_name = 'grumpier%s_no_header' % state + with mock.patch(mocked_fn) as mock_get_shard_ranges: + assert_not_redirected(obj_name) + mock_get_shard_ranges.assert_not_called() + self.assertIn( + obj_name, [obj['name'] for obj in get_listing(-1)]) + # update is misplaced, not in retiring db + self.assertNotIn( + obj_name, [obj['name'] for obj in get_listing(0)]) + + # no shard for this object yet so it is accepted by root container + # and stored in misplaced objects... + assert_not_redirected('dopey', timestamp=next(ts_iter)) + self.assertIn('dopey', [obj['name'] for obj in get_listing(-1)]) + self.assertNotIn('dopey', [obj['name'] for obj in get_listing(0)]) + + # now PUT the first shard range + sr_dopey = shard_ranges['dopey'] + sr_dopey.update_state(ShardRange.CLEAVED, + state_timestamp=next(ts_iter)) + self._put_shard_range(sr_dopey) + for state in ShardRange.STATES: + self.assertTrue( + sr_happy.update_state(state, + state_timestamp=next(ts_iter))) + self._put_shard_range(sr_happy) + with annotate_failure(state): + obj_name = 'dopey%s' % state + if state in redirect_states: + assert_redirected(obj_name, sr_happy, headers=headers) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(0)]) + else: + assert_not_redirected(obj_name, headers=headers) + self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(0)]) + obj_name = 'dopey%s_no_header' % state + with mock.patch(mocked_fn) as mock_get_shard_ranges: + assert_not_redirected(obj_name) + mock_get_shard_ranges.assert_not_called() + 
self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(0)]) + + # further updates to bashful and dopey are now redirected... + assert_redirected('bashful', sr_dopey, headers=headers) + assert_redirected('dopey', sr_dopey, headers=headers) + # ...and existing updates in this container are *not* updated + self.assertEqual([ts_bashful_orig.internal], + [obj['created_at'] for obj in get_listing(0) + if obj['name'] == 'bashful']) + + # set broker to sharded state + self.assertTrue(broker.set_sharded_state()) + for state in ShardRange.STATES: + self.assertTrue( + sr_happy.update_state(state, + state_timestamp=next(ts_iter))) + self._put_shard_range(sr_happy) + with annotate_failure(state): + obj_name = 'grumpiest%s' % state + if state in redirect_states: + assert_redirected(obj_name, sr_happy, headers=headers) + self.assertNotIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + else: + assert_not_redirected(obj_name, headers=headers) + self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + obj_name = 'grumpiest%s_no_header' % state + with mock.patch(mocked_fn) as mock_get_shard_ranges: + assert_not_redirected(obj_name) + mock_get_shard_ranges.assert_not_called() + self.assertIn(obj_name, + [obj['name'] for obj in get_listing(-1)]) + + def test_PUT_object_update_redirected_to_shard(self): + self._check_object_update_redirected_to_shard('PUT') + + def test_DELETE_object_update_redirected_to_shard(self): + self._check_object_update_redirected_to_shard('DELETE') + def test_GET_json(self): # make a container req = Request.blank( diff --git a/test/unit/obj/test_server.py b/test/unit/obj/test_server.py index 0571a80724..7a77603f4c 100644 --- a/test/unit/obj/test_server.py +++ b/test/unit/obj/test_server.py @@ -1053,7 +1053,7 @@ class TestObjectController(unittest.TestCase): mock_ring = mock.MagicMock() mock_ring.get_nodes.return_value = (99, [node]) object_updater.container_ring 
= mock_ring - mock_update.return_value = ((True, 1)) + mock_update.return_value = ((True, 1, None)) object_updater.run_once() self.assertEqual(1, mock_update.call_count) self.assertEqual((node, 99, 'PUT', '/a/c/o'), @@ -1061,6 +1061,7 @@ class TestObjectController(unittest.TestCase): actual_headers = mock_update.call_args_list[0][0][4] # User-Agent is updated. expected_post_headers['User-Agent'] = 'object-updater %s' % os.getpid() + expected_post_headers['X-Backend-Accept-Redirect'] = 'true' self.assertDictEqual(expected_post_headers, actual_headers) self.assertFalse( os.listdir(os.path.join( @@ -1073,6 +1074,104 @@ class TestObjectController(unittest.TestCase): self._test_PUT_then_POST_async_pendings( POLICIES[1], update_etag='override_etag') + def _check_PUT_redirected_async_pending(self, container_path=None): + # When container update is redirected verify that the redirect location + # is persisted in the async pending file. + policy = POLICIES[0] + device_dir = os.path.join(self.testdir, 'sda1') + t_put = next(self.ts) + update_etag = '098f6bcd4621d373cade4e832627b4f6' + + put_headers = { + 'X-Trans-Id': 'put_trans_id', + 'X-Timestamp': t_put.internal, + 'Content-Type': 'application/octet-stream;swift_bytes=123456789', + 'Content-Length': '4', + 'X-Backend-Storage-Policy-Index': int(policy), + 'X-Container-Host': 'chost:3200', + 'X-Container-Partition': '99', + 'X-Container-Device': 'cdevice'} + + if container_path: + # the proxy may include this header + put_headers['X-Backend-Container-Path'] = container_path + expected_update_path = '/cdevice/99/%s/o' % container_path + else: + expected_update_path = '/cdevice/99/a/c/o' + + if policy.policy_type == EC_POLICY: + put_headers.update({ + 'X-Object-Sysmeta-Ec-Frag-Index': '2', + 'X-Backend-Container-Update-Override-Etag': update_etag, + 'X-Object-Sysmeta-Ec-Etag': update_etag}) + + req = Request.blank('/sda1/p/a/c/o', + environ={'REQUEST_METHOD': 'PUT'}, + headers=put_headers, body='test') + resp_headers = 
{'Location': '/.sharded_a/c_shard_1/o', + 'X-Backend-Redirect-Timestamp': next(self.ts).internal} + + with mocked_http_conn(301, headers=[resp_headers]) as conn, \ + mock.patch('swift.common.utils.HASH_PATH_PREFIX', ''),\ + fake_spawn(): + resp = req.get_response(self.object_controller) + + self.assertEqual(resp.status_int, 201) + self.assertEqual(1, len(conn.requests)) + + self.assertEqual(expected_update_path, conn.requests[0]['path']) + + # whether or not an X-Backend-Container-Path was received from the + # proxy, the async pending file should now have the container_path + # equal to the Location header received in the update response. + async_pending_file_put = os.path.join( + device_dir, diskfile.get_async_dir(policy), 'a83', + '06fbf0b514e5199dfc4e00f42eb5ea83-%s' % t_put.internal) + self.assertTrue(os.path.isfile(async_pending_file_put), + 'Expected %s to be a file but it is not.' + % async_pending_file_put) + expected_put_headers = { + 'Referer': 'PUT http://localhost/sda1/p/a/c/o', + 'X-Trans-Id': 'put_trans_id', + 'X-Timestamp': t_put.internal, + 'X-Content-Type': 'application/octet-stream;swift_bytes=123456789', + 'X-Size': '4', + 'X-Etag': '098f6bcd4621d373cade4e832627b4f6', + 'User-Agent': 'object-server %s' % os.getpid(), + 'X-Backend-Storage-Policy-Index': '%d' % int(policy)} + if policy.policy_type == EC_POLICY: + expected_put_headers['X-Etag'] = update_etag + self.assertEqual( + {'headers': expected_put_headers, + 'account': 'a', 'container': 'c', 'obj': 'o', 'op': 'PUT', + 'container_path': '.sharded_a/c_shard_1'}, + pickle.load(open(async_pending_file_put))) + + # when updater is run its first request will be to the redirect + # location that is persisted in the async pending file + with mocked_http_conn(201) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache', + lambda *args: None): + object_updater = updater.ObjectUpdater( + {'devices': self.testdir, + 'mount_check': 'false'}, logger=debug_logger()) + node = {'id': 1, 'ip': 
'chost', 'port': 3200, + 'device': 'cdevice'} + mock_ring = mock.MagicMock() + mock_ring.get_nodes.return_value = (99, [node]) + object_updater.container_ring = mock_ring + object_updater.run_once() + + self.assertEqual(1, len(conn.requests)) + self.assertEqual('/cdevice/99/.sharded_a/c_shard_1/o', + conn.requests[0]['path']) + + def test_PUT_redirected_async_pending(self): + self._check_PUT_redirected_async_pending() + + def test_PUT_redirected_async_pending_with_container_path(self): + self._check_PUT_redirected_async_pending(container_path='.another/c') + def test_POST_quarantine_zbyte(self): timestamp = normalize_timestamp(time()) req = Request.blank('/sda1/p/a/c/o', environ={'REQUEST_METHOD': 'PUT'}, @@ -5263,6 +5362,95 @@ class TestObjectController(unittest.TestCase): 'X-Backend-Container-Update-Override-Content-Type': 'ignored', 'X-Backend-Container-Update-Override-Foo': 'ignored'}) + def test_PUT_container_update_to_shard(self): + # verify that alternate container update path is respected when + # included in request headers + def do_test(container_path, expected_path, expected_container_path): + policy = random.choice(list(POLICIES)) + container_updates = [] + + def capture_updates( + ip, port, method, path, headers, *args, **kwargs): + container_updates.append((ip, port, method, path, headers)) + + pickle_async_update_args = [] + + def fake_pickle_async_update(*args): + pickle_async_update_args.append(args) + + diskfile_mgr = self.object_controller._diskfile_router[policy] + diskfile_mgr.pickle_async_update = fake_pickle_async_update + + ts_put = next(self.ts) + headers = { + 'X-Timestamp': ts_put.internal, + 'X-Trans-Id': '123', + 'X-Container-Host': 'chost:cport', + 'X-Container-Partition': 'cpartition', + 'X-Container-Device': 'cdevice', + 'Content-Type': 'text/plain', + 'X-Object-Sysmeta-Ec-Frag-Index': 0, + 'X-Backend-Storage-Policy-Index': int(policy), + } + if container_path is not None: + headers['X-Backend-Container-Path'] = container_path + + 
req = Request.blank('/sda1/0/a/c/o', method='PUT', + headers=headers, body='') + with mocked_http_conn( + 500, give_connect=capture_updates) as fake_conn: + with fake_spawn(): + resp = req.get_response(self.object_controller) + self.assertRaises(StopIteration, fake_conn.code_iter.next) + self.assertEqual(resp.status_int, 201) + self.assertEqual(len(container_updates), 1) + # verify expected path used in update request + ip, port, method, path, headers = container_updates[0] + self.assertEqual(ip, 'chost') + self.assertEqual(port, 'cport') + self.assertEqual(method, 'PUT') + self.assertEqual(path, '/cdevice/cpartition/%s/o' % expected_path) + + # verify that the picked update *always* has root container + self.assertEqual(1, len(pickle_async_update_args)) + (objdevice, account, container, obj, data, timestamp, + policy) = pickle_async_update_args[0] + self.assertEqual(objdevice, 'sda1') + self.assertEqual(account, 'a') # NB user account + self.assertEqual(container, 'c') # NB root container + self.assertEqual(obj, 'o') + self.assertEqual(timestamp, ts_put.internal) + self.assertEqual(policy, policy) + expected_data = { + 'headers': HeaderKeyDict({ + 'X-Size': '0', + 'User-Agent': 'object-server %s' % os.getpid(), + 'X-Content-Type': 'text/plain', + 'X-Timestamp': ts_put.internal, + 'X-Trans-Id': '123', + 'Referer': 'PUT http://localhost/sda1/0/a/c/o', + 'X-Backend-Storage-Policy-Index': int(policy), + 'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e'}), + 'obj': 'o', + 'account': 'a', + 'container': 'c', + 'op': 'PUT'} + if expected_container_path: + expected_data['container_path'] = expected_container_path + self.assertEqual(expected_data, data) + + do_test('a_shard/c_shard', 'a_shard/c_shard', 'a_shard/c_shard') + do_test('', 'a/c', None) + do_test(None, 'a/c', None) + # TODO: should these cases trigger a 400 response rather than + # defaulting to root path? 
+ do_test('garbage', 'a/c', None) + do_test('/', 'a/c', None) + do_test('/no-acct', 'a/c', None) + do_test('no-cont/', 'a/c', None) + do_test('too/many/parts', 'a/c', None) + do_test('/leading/slash', 'a/c', None) + def test_container_update_async(self): policy = random.choice(list(POLICIES)) req = Request.blank( @@ -5335,23 +5523,21 @@ class TestObjectController(unittest.TestCase): 'X-Container-Partition': '20', 'X-Container-Host': '1.2.3.4:5', 'X-Container-Device': 'sdb1'}) - with mock.patch.object(object_server, 'spawn', - local_fake_spawn): - with mock.patch.object(self.object_controller, - 'async_update', - local_fake_async_update): - resp = req.get_response(self.object_controller) - # check the response is completed and successful - self.assertEqual(resp.status_int, 201) - # check that async_update hasn't been called - self.assertFalse(len(called_async_update_args)) - # now do the work in greenthreads - for func, a, kw in saved_spawn_calls: - gt = spawn(func, *a, **kw) - greenthreads.append(gt) - # wait for the greenthreads to finish - for gt in greenthreads: - gt.wait() + with mock.patch.object(object_server, 'spawn', local_fake_spawn), \ + mock.patch.object(self.object_controller, 'async_update', + local_fake_async_update): + resp = req.get_response(self.object_controller) + # check the response is completed and successful + self.assertEqual(resp.status_int, 201) + # check that async_update hasn't been called + self.assertFalse(len(called_async_update_args)) + # now do the work in greenthreads + for func, a, kw in saved_spawn_calls: + gt = spawn(func, *a, **kw) + greenthreads.append(gt) + # wait for the greenthreads to finish + for gt in greenthreads: + gt.wait() # check that the calls to async_update have happened headers_out = {'X-Size': '0', 'X-Content-Type': 'application/burrito', @@ -5362,7 +5548,8 @@ class TestObjectController(unittest.TestCase): 'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e'} expected = [('PUT', 'a', 'c', 'o', '1.2.3.4:5', '20', 
'sdb1', headers_out, 'sda1', POLICIES[0]), - {'logger_thread_locals': (None, None)}] + {'logger_thread_locals': (None, None), + 'container_path': None}] self.assertEqual(called_async_update_args, [expected]) def test_container_update_as_greenthread_with_timeout(self): diff --git a/test/unit/obj/test_updater.py b/test/unit/obj/test_updater.py index aac6325254..ae51153b8e 100644 --- a/test/unit/obj/test_updater.py +++ b/test/unit/obj/test_updater.py @@ -65,7 +65,9 @@ class TestObjectUpdater(unittest.TestCase): {'id': 1, 'ip': '127.0.0.1', 'port': 1, 'device': 'sda1', 'zone': 2}, {'id': 2, 'ip': '127.0.0.1', 'port': 1, - 'device': 'sda1', 'zone': 4}], 30), + 'device': 'sda1', 'zone': 4}, + {'id': 3, 'ip': '127.0.0.1', 'port': 1, + 'device': 'sda1', 'zone': 6}], 30), f) self.devices_dir = os.path.join(self.testdir, 'devices') os.mkdir(self.devices_dir) @@ -74,6 +76,7 @@ class TestObjectUpdater(unittest.TestCase): for policy in POLICIES: os.mkdir(os.path.join(self.sda1, get_tmp_dir(policy))) self.logger = debug_logger() + self.ts_iter = make_timestamp_iter() def tearDown(self): rmtree(self.testdir, ignore_errors=1) @@ -299,19 +302,22 @@ class TestObjectUpdater(unittest.TestCase): self.assertIn("sweep progress", info_lines[1]) # the space ensures it's a positive number self.assertIn( - "2 successes, 0 failures, 0 quarantines, 2 unlinks, 0 error", + "2 successes, 0 failures, 0 quarantines, 2 unlinks, 0 errors, " + "0 redirects", info_lines[1]) self.assertIn(self.sda1, info_lines[1]) self.assertIn("sweep progress", info_lines[2]) self.assertIn( - "4 successes, 0 failures, 0 quarantines, 4 unlinks, 0 error", + "4 successes, 0 failures, 0 quarantines, 4 unlinks, 0 errors, " + "0 redirects", info_lines[2]) self.assertIn(self.sda1, info_lines[2]) self.assertIn("sweep complete", info_lines[3]) self.assertIn( - "5 successes, 0 failures, 0 quarantines, 5 unlinks, 0 error", + "5 successes, 0 failures, 0 quarantines, 5 unlinks, 0 errors, " + "0 redirects", info_lines[3]) 
self.assertIn(self.sda1, info_lines[3]) @@ -547,6 +553,26 @@ class TestObjectUpdater(unittest.TestCase): {'successes': 1, 'unlinks': 1, 'async_pendings': 1}) + def _write_async_update(self, dfmanager, timestamp, policy, + headers=None, container_path=None): + # write an async + account, container, obj = 'a', 'c', 'o' + op = 'PUT' + headers_out = headers or { + 'x-size': 0, + 'x-content-type': 'text/plain', + 'x-etag': 'd41d8cd98f00b204e9800998ecf8427e', + 'x-timestamp': timestamp.internal, + 'X-Backend-Storage-Policy-Index': int(policy), + 'User-Agent': 'object-server %s' % os.getpid() + } + data = {'op': op, 'account': account, 'container': container, + 'obj': obj, 'headers': headers_out} + if container_path: + data['container_path'] = container_path + dfmanager.pickle_async_update(self.sda1, account, container, obj, + data, timestamp, policy) + def test_obj_put_async_updates(self): ts_iter = make_timestamp_iter() policies = list(POLICIES) @@ -562,16 +588,12 @@ class TestObjectUpdater(unittest.TestCase): async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) os.mkdir(async_dir) - def do_test(headers_out, expected): + def do_test(headers_out, expected, container_path=None): # write an async dfmanager = DiskFileManager(conf, daemon.logger) - account, container, obj = 'a', 'c', 'o' - op = 'PUT' - data = {'op': op, 'account': account, 'container': container, - 'obj': obj, 'headers': headers_out} - dfmanager.pickle_async_update(self.sda1, account, container, obj, - data, next(ts_iter), policies[0]) - + self._write_async_update(dfmanager, next(ts_iter), policies[0], + headers=headers_out, + container_path=container_path) request_log = [] def capture(*args, **kwargs): @@ -613,11 +635,21 @@ class TestObjectUpdater(unittest.TestCase): 'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e', 'X-Timestamp': ts.normal, 'X-Backend-Storage-Policy-Index': str(int(policies[0])), - 'User-Agent': 'object-updater %s' % os.getpid() + 'User-Agent': 'object-updater %s' % os.getpid(), + 
'X-Backend-Accept-Redirect': 'true', } + # always expect X-Backend-Accept-Redirect to be true + do_test(headers_out, expected, container_path='.shards_a/shard_c') do_test(headers_out, expected) + # ...unless X-Backend-Accept-Redirect is already set + expected['X-Backend-Accept-Redirect'] = 'false' + headers_out_2 = dict(headers_out) + headers_out_2['X-Backend-Accept-Redirect'] = 'false' + do_test(headers_out_2, expected) + # updater should add policy header if missing + expected['X-Backend-Accept-Redirect'] = 'true' headers_out['X-Backend-Storage-Policy-Index'] = None do_test(headers_out, expected) @@ -632,6 +664,414 @@ class TestObjectUpdater(unittest.TestCase): 'X-Backend-Storage-Policy-Index') do_test(headers_out, expected) + def _check_update_requests(self, requests, timestamp, policy): + # do some sanity checks on update request + expected_headers = { + 'X-Size': '0', + 'X-Content-Type': 'text/plain', + 'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e', + 'X-Timestamp': timestamp.internal, + 'X-Backend-Storage-Policy-Index': str(int(policy)), + 'User-Agent': 'object-updater %s' % os.getpid(), + 'X-Backend-Accept-Redirect': 'true'} + for request in requests: + self.assertEqual('PUT', request['method']) + self.assertDictEqual(expected_headers, request['headers']) + + def test_obj_put_async_root_update_redirected(self): + policies = list(POLICIES) + random.shuffle(policies) + # setup updater + conf = { + 'devices': self.devices_dir, + 'mount_check': 'false', + 'swift_dir': self.testdir, + } + daemon = object_updater.ObjectUpdater(conf, logger=self.logger) + async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) + os.mkdir(async_dir) + dfmanager = DiskFileManager(conf, daemon.logger) + + ts_obj = next(self.ts_iter) + self._write_async_update(dfmanager, ts_obj, policies[0]) + + # run once + ts_redirect_1 = next(self.ts_iter) + ts_redirect_2 = next(self.ts_iter) + fake_responses = [ + # first round of update attempts, newest redirect should be chosen + (200, 
{}), + (301, {'Location': '/.shards_a/c_shard_new/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_2.internal}), + (301, {'Location': '/.shards_a/c_shard_old/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}), + # second round of update attempts + (200, {}), + (200, {}), + (200, {}), + ] + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests[:3], ts_obj, policies[0]) + self._check_update_requests(conn.requests[3:], ts_obj, policies[0]) + self.assertEqual(['/sda1/0/a/c/o'] * 3 + + ['/sda1/0/.shards_a/c_shard_new/o'] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 1, 'successes': 1, + 'unlinks': 1, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + self.assertFalse(os.listdir(async_dir)) # no async file + + def test_obj_put_async_root_update_redirected_previous_success(self): + policies = list(POLICIES) + random.shuffle(policies) + # setup updater + conf = { + 'devices': self.devices_dir, + 'mount_check': 'false', + 'swift_dir': self.testdir, + } + daemon = object_updater.ObjectUpdater(conf, logger=self.logger) + async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) + os.mkdir(async_dir) + dfmanager = DiskFileManager(conf, daemon.logger) + + ts_obj = next(self.ts_iter) + self._write_async_update(dfmanager, ts_obj, policies[0]) + orig_async_path, orig_async_data = self._check_async_file(async_dir) + + # run once + with mocked_http_conn( + 507, 200, 507) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests, ts_obj, policies[0]) + self.assertEqual(['/sda1/0/a/c/o'] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'failures': 1, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + async_path, 
async_data = self._check_async_file(async_dir) + self.assertEqual(dict(orig_async_data, successes=[1]), async_data) + + # run again - expect 3 redirected updates despite previous success + ts_redirect = next(self.ts_iter) + resp_headers_1 = {'Location': '/.shards_a/c_shard_1/o', + 'X-Backend-Redirect-Timestamp': ts_redirect.internal} + fake_responses = ( + # 1st round of redirects, 2nd round of redirects + [(301, resp_headers_1)] * 2 + [(200, {})] * 3) + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests[:2], ts_obj, policies[0]) + self._check_update_requests(conn.requests[2:], ts_obj, policies[0]) + root_part = daemon.container_ring.get_part('a/c') + shard_1_part = daemon.container_ring.get_part('.shards_a/c_shard_1') + self.assertEqual( + ['/sda1/%s/a/c/o' % root_part] * 2 + + ['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 1, 'successes': 1, 'failures': 1, 'unlinks': 1, + 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + self.assertFalse(os.listdir(async_dir)) # no async file + + def _check_async_file(self, async_dir): + async_subdirs = os.listdir(async_dir) + self.assertEqual([mock.ANY], async_subdirs) + async_files = os.listdir(os.path.join(async_dir, async_subdirs[0])) + self.assertEqual([mock.ANY], async_files) + async_path = os.path.join( + async_dir, async_subdirs[0], async_files[0]) + with open(async_path) as fd: + async_data = pickle.load(fd) + return async_path, async_data + + def _check_obj_put_async_update_bad_redirect_headers(self, headers): + policies = list(POLICIES) + random.shuffle(policies) + # setup updater + conf = { + 'devices': self.devices_dir, + 'mount_check': 'false', + 'swift_dir': self.testdir, + } + daemon = 
object_updater.ObjectUpdater(conf, logger=self.logger) + async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) + os.mkdir(async_dir) + dfmanager = DiskFileManager(conf, daemon.logger) + + ts_obj = next(self.ts_iter) + self._write_async_update(dfmanager, ts_obj, policies[0]) + orig_async_path, orig_async_data = self._check_async_file(async_dir) + + fake_responses = [ + (301, headers), + (301, headers), + (301, headers), + ] + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests, ts_obj, policies[0]) + self.assertEqual(['/sda1/0/a/c/o'] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'failures': 1, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + # async file still intact + async_path, async_data = self._check_async_file(async_dir) + self.assertEqual(orig_async_path, async_path) + self.assertEqual(orig_async_data, async_data) + return daemon + + def test_obj_put_async_root_update_missing_location_header(self): + headers = { + 'X-Backend-Redirect-Timestamp': next(self.ts_iter).internal} + self._check_obj_put_async_update_bad_redirect_headers(headers) + + def test_obj_put_async_root_update_bad_location_header(self): + headers = { + 'Location': 'bad bad bad', + 'X-Backend-Redirect-Timestamp': next(self.ts_iter).internal} + daemon = self._check_obj_put_async_update_bad_redirect_headers(headers) + error_lines = daemon.logger.get_lines_for_level('error') + self.assertIn('Container update failed', error_lines[0]) + self.assertIn('Invalid path: bad%20bad%20bad', error_lines[0]) + + def test_obj_put_async_shard_update_redirected_twice(self): + policies = list(POLICIES) + random.shuffle(policies) + # setup updater + conf = { + 'devices': self.devices_dir, + 'mount_check': 'false', + 'swift_dir': self.testdir, + } + daemon = 
object_updater.ObjectUpdater(conf, logger=self.logger) + async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) + os.mkdir(async_dir) + dfmanager = DiskFileManager(conf, daemon.logger) + + ts_obj = next(self.ts_iter) + self._write_async_update(dfmanager, ts_obj, policies[0], + container_path='.shards_a/c_shard_older') + orig_async_path, orig_async_data = self._check_async_file(async_dir) + + # run once + ts_redirect_1 = next(self.ts_iter) + ts_redirect_2 = next(self.ts_iter) + ts_redirect_3 = next(self.ts_iter) + fake_responses = [ + # 1st round of redirects, newest redirect should be chosen + (301, {'Location': '/.shards_a/c_shard_old/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}), + (301, {'Location': '/.shards_a/c_shard_new/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_2.internal}), + (301, {'Location': '/.shards_a/c_shard_old/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}), + # 2nd round of redirects + (301, {'Location': '/.shards_a/c_shard_newer/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}), + (301, {'Location': '/.shards_a/c_shard_newer/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}), + (301, {'Location': '/.shards_a/c_shard_newer/o', + 'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}), + ] + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests, ts_obj, policies[0]) + # only *one* set of redirected requests is attempted per cycle + older_part = daemon.container_ring.get_part('.shards_a/c_shard_older') + new_part = daemon.container_ring.get_part('.shards_a/c_shard_new') + newer_part = daemon.container_ring.get_part('.shards_a/c_shard_newer') + self.assertEqual( + ['/sda1/%s/.shards_a/c_shard_older/o' % older_part] * 3 + + ['/sda1/%s/.shards_a/c_shard_new/o' % new_part] * 3, + 
[req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 2, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + # update failed, we still have pending file with most recent redirect + # response Location header value added to data + async_path, async_data = self._check_async_file(async_dir) + self.assertEqual(orig_async_path, async_path) + self.assertEqual( + dict(orig_async_data, container_path='.shards_a/c_shard_newer', + redirect_history=['.shards_a/c_shard_new', + '.shards_a/c_shard_newer']), + async_data) + + # next cycle, should get latest redirect from pickled async update + fake_responses = [(200, {})] * 3 + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + + self._check_update_requests(conn.requests, ts_obj, policies[0]) + self.assertEqual( + ['/sda1/%s/.shards_a/c_shard_newer/o' % newer_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 2, 'successes': 1, 'unlinks': 1, + 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + self.assertFalse(os.listdir(async_dir)) # no async file + + def test_obj_put_async_update_redirection_loop(self): + policies = list(POLICIES) + random.shuffle(policies) + # setup updater + conf = { + 'devices': self.devices_dir, + 'mount_check': 'false', + 'swift_dir': self.testdir, + } + daemon = object_updater.ObjectUpdater(conf, logger=self.logger) + async_dir = os.path.join(self.sda1, get_async_dir(policies[0])) + os.mkdir(async_dir) + dfmanager = DiskFileManager(conf, daemon.logger) + + ts_obj = next(self.ts_iter) + self._write_async_update(dfmanager, ts_obj, policies[0]) + orig_async_path, orig_async_data = self._check_async_file(async_dir) + + # run once + ts_redirect = next(self.ts_iter) + + resp_headers_1 = {'Location': '/.shards_a/c_shard_1/o', + 'X-Backend-Redirect-Timestamp': 
ts_redirect.internal} + resp_headers_2 = {'Location': '/.shards_a/c_shard_2/o', + 'X-Backend-Redirect-Timestamp': ts_redirect.internal} + fake_responses = ( + # 1st round of redirects, 2nd round of redirects + [(301, resp_headers_1)] * 3 + [(301, resp_headers_2)] * 3) + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + self._check_update_requests(conn.requests[:3], ts_obj, policies[0]) + self._check_update_requests(conn.requests[3:], ts_obj, policies[0]) + # only *one* set of redirected requests is attempted per cycle + root_part = daemon.container_ring.get_part('a/c') + shard_1_part = daemon.container_ring.get_part('.shards_a/c_shard_1') + shard_2_part = daemon.container_ring.get_part('.shards_a/c_shard_2') + shard_3_part = daemon.container_ring.get_part('.shards_a/c_shard_3') + self.assertEqual(['/sda1/%s/a/c/o' % root_part] * 3 + + ['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 2, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + # update failed, we still have pending file with most recent redirect + # response Location header value added to data + async_path, async_data = self._check_async_file(async_dir) + self.assertEqual(orig_async_path, async_path) + self.assertEqual( + dict(orig_async_data, container_path='.shards_a/c_shard_2', + redirect_history=['.shards_a/c_shard_1', + '.shards_a/c_shard_2']), + async_data) + + # next cycle, more redirects! 
first is to previously visited location + resp_headers_3 = {'Location': '/.shards_a/c_shard_3/o', + 'X-Backend-Redirect-Timestamp': ts_redirect.internal} + fake_responses = ( + # 1st round of redirects, 2nd round of redirects + [(301, resp_headers_1)] * 3 + [(301, resp_headers_3)] * 3) + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + self._check_update_requests(conn.requests[:3], ts_obj, policies[0]) + self._check_update_requests(conn.requests[3:], ts_obj, policies[0]) + # first try the previously persisted container path, response to that + # creates a loop so ignore and send to root + self.assertEqual( + ['/sda1/%s/.shards_a/c_shard_2/o' % shard_2_part] * 3 + + ['/sda1/%s/a/c/o' % root_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 4, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + # update failed, we still have pending file with most recent redirect + # response Location header value from root added to persisted data + async_path, async_data = self._check_async_file(async_dir) + self.assertEqual(orig_async_path, async_path) + # note: redirect_history was reset when falling back to root + self.assertEqual( + dict(orig_async_data, container_path='.shards_a/c_shard_3', + redirect_history=['.shards_a/c_shard_3']), + async_data) + + # next cycle, more redirects! 
first is to a location visited previously + # but not since last fall back to root, so that location IS tried; + # second is to a location visited since last fall back to root so that + # location is NOT tried + fake_responses = ( + # 1st round of redirects, 2nd round of redirects + [(301, resp_headers_1)] * 3 + [(301, resp_headers_3)] * 3) + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + self._check_update_requests(conn.requests, ts_obj, policies[0]) + self.assertEqual( + ['/sda1/%s/.shards_a/c_shard_3/o' % shard_3_part] * 3 + + ['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 6, 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + # update failed, we still have pending file, but container_path is None + # because most recent redirect location was a repeat + async_path, async_data = self._check_async_file(async_dir) + self.assertEqual(orig_async_path, async_path) + self.assertEqual( + dict(orig_async_data, container_path=None, + redirect_history=[]), + async_data) + + # next cycle, persisted container path is None so update should go to + # root, this time it succeeds + fake_responses = [(200, {})] * 3 + fake_status_codes, fake_headers = zip(*fake_responses) + with mocked_http_conn( + *fake_status_codes, headers=fake_headers) as conn: + with mock.patch('swift.obj.updater.dump_recon_cache'): + daemon.run_once() + self._check_update_requests(conn.requests, ts_obj, policies[0]) + self.assertEqual(['/sda1/%s/a/c/o' % root_part] * 3, + [req['path'] for req in conn.requests]) + self.assertEqual( + {'redirects': 6, 'successes': 1, 'unlinks': 1, + 'async_pendings': 1}, + daemon.logger.get_increment_counts()) + self.assertFalse(os.listdir(async_dir)) # no async file + if __name__ == '__main__': unittest.main() diff 
--git a/test/unit/proxy/test_server.py b/test/unit/proxy/test_server.py index c4223c656d..8e67abb009 100644 --- a/test/unit/proxy/test_server.py +++ b/test/unit/proxy/test_server.py @@ -47,7 +47,7 @@ from eventlet.green import httplib from six import BytesIO from six import StringIO from six.moves import range -from six.moves.urllib.parse import quote +from six.moves.urllib.parse import quote, parse_qsl from test import listen_zero from test.unit import ( @@ -3222,95 +3222,197 @@ class TestReplicatedObjectController( # reset the router post patch_policies self.app.obj_controller_router = proxy_server.ObjectControllerRouter() self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes - backend_requests = [] - def capture_requests(ip, port, method, path, headers, *args, - **kwargs): - backend_requests.append((method, path, headers)) + def do_test(resp_headers): + self.app.memcache.store = {} + backend_requests = [] - req = Request.blank('/v1/a/c/o', {}, method='POST', - headers={'X-Object-Meta-Color': 'Blue', - 'Content-Type': 'text/plain'}) + def capture_requests(ip, port, method, path, headers, *args, + **kwargs): + backend_requests.append((method, path, headers)) - # we want the container_info response to says a policy index of 1 - resp_headers = {'X-Backend-Storage-Policy-Index': 1} - with mocked_http_conn( - 200, 200, 202, 202, 202, - headers=resp_headers, give_connect=capture_requests - ) as fake_conn: - resp = req.get_response(self.app) - self.assertRaises(StopIteration, fake_conn.code_iter.next) + req = Request.blank('/v1/a/c/o', {}, method='POST', + headers={'X-Object-Meta-Color': 'Blue', + 'Content-Type': 'text/plain'}) - self.assertEqual(resp.status_int, 202) - self.assertEqual(len(backend_requests), 5) + # we want the container_info response to says a policy index of 1 + with mocked_http_conn( + 200, 200, 202, 202, 202, + headers=resp_headers, give_connect=capture_requests + ) as fake_conn: + resp = req.get_response(self.app) + 
self.assertRaises(StopIteration, fake_conn.code_iter.next) - def check_request(req, method, path, headers=None): - req_method, req_path, req_headers = req - self.assertEqual(method, req_method) - # caller can ignore leading path parts - self.assertTrue(req_path.endswith(path), - 'expected path to end with %s, it was %s' % ( - path, req_path)) - headers = headers or {} - # caller can ignore some headers - for k, v in headers.items(): - self.assertEqual(req_headers[k], v) - account_request = backend_requests.pop(0) - check_request(account_request, method='HEAD', path='/sda/0/a') - container_request = backend_requests.pop(0) - check_request(container_request, method='HEAD', path='/sda/0/a/c') - # make sure backend requests included expected container headers - container_headers = {} - for request in backend_requests: - req_headers = request[2] - device = req_headers['x-container-device'] - host = req_headers['x-container-host'] - container_headers[device] = host - expectations = { - 'method': 'POST', - 'path': '/0/a/c/o', - 'headers': { - 'X-Container-Partition': '0', - 'Connection': 'close', - 'User-Agent': 'proxy-server %s' % os.getpid(), - 'Host': 'localhost:80', - 'Referer': 'POST http://localhost/v1/a/c/o', - 'X-Object-Meta-Color': 'Blue', - 'X-Backend-Storage-Policy-Index': '1' - }, - } - check_request(request, **expectations) + self.assertEqual(resp.status_int, 202) + self.assertEqual(len(backend_requests), 5) - expected = {} - for i, device in enumerate(['sda', 'sdb', 'sdc']): - expected[device] = '10.0.0.%d:100%d' % (i, i) - self.assertEqual(container_headers, expected) + def check_request(req, method, path, headers=None): + req_method, req_path, req_headers = req + self.assertEqual(method, req_method) + # caller can ignore leading path parts + self.assertTrue(req_path.endswith(path), + 'expected path to end with %s, it was %s' % ( + path, req_path)) + headers = headers or {} + # caller can ignore some headers + for k, v in headers.items(): + 
self.assertEqual(req_headers[k], v) + self.assertNotIn('X-Backend-Container-Path', req_headers) - # and again with policy override - self.app.memcache.store = {} - backend_requests = [] - req = Request.blank('/v1/a/c/o', {}, method='POST', - headers={'X-Object-Meta-Color': 'Blue', - 'Content-Type': 'text/plain', - 'X-Backend-Storage-Policy-Index': 0}) - with mocked_http_conn( - 200, 200, 202, 202, 202, - headers=resp_headers, give_connect=capture_requests - ) as fake_conn: - resp = req.get_response(self.app) - self.assertRaises(StopIteration, fake_conn.code_iter.next) - self.assertEqual(resp.status_int, 202) - self.assertEqual(len(backend_requests), 5) - for request in backend_requests[2:]: - expectations = { - 'method': 'POST', - 'path': '/0/a/c/o', # ignore device bit - 'headers': { - 'X-Object-Meta-Color': 'Blue', - 'X-Backend-Storage-Policy-Index': '0', + account_request = backend_requests.pop(0) + check_request(account_request, method='HEAD', path='/sda/0/a') + container_request = backend_requests.pop(0) + check_request(container_request, method='HEAD', path='/sda/0/a/c') + # make sure backend requests included expected container headers + container_headers = {} + for request in backend_requests: + req_headers = request[2] + device = req_headers['x-container-device'] + host = req_headers['x-container-host'] + container_headers[device] = host + expectations = { + 'method': 'POST', + 'path': '/0/a/c/o', + 'headers': { + 'X-Container-Partition': '0', + 'Connection': 'close', + 'User-Agent': 'proxy-server %s' % os.getpid(), + 'Host': 'localhost:80', + 'Referer': 'POST http://localhost/v1/a/c/o', + 'X-Object-Meta-Color': 'Blue', + 'X-Backend-Storage-Policy-Index': '1' + }, } - } - check_request(request, **expectations) + check_request(request, **expectations) + + expected = {} + for i, device in enumerate(['sda', 'sdb', 'sdc']): + expected[device] = '10.0.0.%d:100%d' % (i, i) + self.assertEqual(container_headers, expected) + + # and again with policy override + 
self.app.memcache.store = {} + backend_requests = [] + req = Request.blank('/v1/a/c/o', {}, method='POST', + headers={'X-Object-Meta-Color': 'Blue', + 'Content-Type': 'text/plain', + 'X-Backend-Storage-Policy-Index': 0}) + with mocked_http_conn( + 200, 200, 202, 202, 202, + headers=resp_headers, give_connect=capture_requests + ) as fake_conn: + resp = req.get_response(self.app) + self.assertRaises(StopIteration, fake_conn.code_iter.next) + self.assertEqual(resp.status_int, 202) + self.assertEqual(len(backend_requests), 5) + for request in backend_requests[2:]: + expectations = { + 'method': 'POST', + 'path': '/0/a/c/o', # ignore device bit + 'headers': { + 'X-Object-Meta-Color': 'Blue', + 'X-Backend-Storage-Policy-Index': '0', + } + } + check_request(request, **expectations) + + resp_headers = {'X-Backend-Storage-Policy-Index': 1} + do_test(resp_headers) + resp_headers['X-Backend-Sharding-State'] = 'unsharded' + do_test(resp_headers) + + @patch_policies([ + StoragePolicy(0, 'zero', is_default=True, object_ring=FakeRing()), + StoragePolicy(1, 'one', object_ring=FakeRing()), + ]) + def test_backend_headers_update_shard_container(self): + # verify that when container is sharded the backend container update is + # directed to the shard container + # reset the router post patch_policies + self.app.obj_controller_router = proxy_server.ObjectControllerRouter() + self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes + + def do_test(method, sharding_state): + self.app.memcache.store = {} + req = Request.blank('/v1/a/c/o', {}, method=method, body='', + headers={'Content-Type': 'text/plain'}) + + # we want the container_info response to say policy index of 1 and + # sharding state + # acc HEAD, cont HEAD, cont shard GET, obj POSTs + status_codes = (200, 200, 200, 202, 202, 202) + resp_headers = {'X-Backend-Storage-Policy-Index': 1, + 'x-backend-sharding-state': sharding_state, + 'X-Backend-Record-Type': 'shard'} + shard_range = utils.ShardRange( + '.shards_a/c_shard', 
utils.Timestamp.now(), 'l', 'u') + body = json.dumps([dict(shard_range)]) + with mocked_http_conn(*status_codes, headers=resp_headers, + body=body) as fake_conn: + resp = req.get_response(self.app) + + self.assertEqual(resp.status_int, 202) + backend_requests = fake_conn.requests + + def check_request(req, method, path, headers=None, params=None): + self.assertEqual(method, req['method']) + # caller can ignore leading path parts + self.assertTrue(req['path'].endswith(path), + 'expected path to end with %s, it was %s' % ( + path, req['path'])) + headers = headers or {} + # caller can ignore some headers + for k, v in headers.items(): + self.assertEqual(req['headers'][k], v, + 'Expected %s but got %s for key %s' % + (v, req['headers'][k], k)) + params = params or {} + req_params = dict(parse_qsl(req['qs'])) if req['qs'] else {} + for k, v in params.items(): + self.assertEqual(req_params[k], v, + 'Expected %s but got %s for key %s' % + (v, req_params[k], k)) + + account_request = backend_requests[0] + check_request(account_request, method='HEAD', path='/sda/0/a') + container_request = backend_requests[1] + check_request(container_request, method='HEAD', path='/sda/0/a/c') + container_request_shard = backend_requests[2] + check_request( + container_request_shard, method='GET', path='/sda/0/a/c', + params={'includes': 'o'}) + + # make sure backend requests included expected container headers + container_headers = {} + + for request in backend_requests[3:]: + req_headers = request['headers'] + device = req_headers['x-container-device'] + container_headers[device] = req_headers['x-container-host'] + expectations = { + 'method': method, + 'path': '/0/a/c/o', + 'headers': { + 'X-Container-Partition': '0', + 'Host': 'localhost:80', + 'Referer': '%s http://localhost/v1/a/c/o' % method, + 'X-Backend-Storage-Policy-Index': '1', + 'X-Backend-Container-Path': shard_range.name + }, + } + check_request(request, **expectations) + + expected = {} + for i, device in enumerate(['sda', 
'sdb', 'sdc']): + expected[device] = '10.0.0.%d:100%d' % (i, i) + self.assertEqual(container_headers, expected) + + do_test('POST', 'sharding') + do_test('POST', 'sharded') + do_test('DELETE', 'sharding') + do_test('DELETE', 'sharded') + do_test('PUT', 'sharding') + do_test('PUT', 'sharded') def test_DELETE(self): with save_globals(): From 26418140108281ae5ac11004ebf33b7b3e08c74d Mon Sep 17 00:00:00 2001 From: Matthew Oliver Date: Wed, 2 May 2018 10:47:51 +0100 Subject: [PATCH 8/9] Add sharder daemon, manage_shard_ranges tool and probe tests The sharder daemon visits container dbs and when necessary executes the sharding workflow on the db. The workflow is, in overview: - perform an audit of the container for sharding purposes. - move any misplaced objects that do not belong in the container to their correct shard. - move shard ranges from FOUND state to CREATED state by creating shard containers. - move shard ranges from CREATED to CLEAVED state by cleaving objects to shard dbs and replicating those dbs. By default this is done in batches of 2 shard ranges per visit. Additionally, when the auto_shard option is True (NOT yet recommended in production), the sharder will identify shard ranges for containers that have exceeded the threshold for sharding, and will also manage the sharding and shrinking of shard containers. The manage_shard_ranges tool provides a means to manually identify shard ranges and merge them to a container in order to trigger sharding. This is currently the recommended way to shard a container. 
Co-Authored-By: Alistair Coles Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Change-Id: I7f192209d4d5580f5a0aa6838f9f04e436cf6b1f --- bin/swift-container-sharder | 33 + doc/saio/swift/container-server/1.conf | 10 + doc/saio/swift/container-server/2.conf | 10 + doc/saio/swift/container-server/3.conf | 10 + doc/saio/swift/container-server/4.conf | 10 + doc/saio/swift/internal-client.conf | 24 + etc/container-server.conf-sample | 118 + setup.cfg | 4 + swift/cli/manage_shard_ranges.py | 370 ++ swift/cli/shard-info.py | 195 + swift/common/manager.py | 11 +- swift/common/utils.py | 15 + swift/container/backend.py | 37 + swift/container/sharder.py | 1568 ++++++ swift/proxy/controllers/base.py | 2 +- swift/proxy/controllers/container.py | 14 + test/probe/test_sharder.py | 2025 ++++++++ test/unit/__init__.py | 43 + test/unit/cli/test_manage_shard_ranges.py | 362 ++ test/unit/common/test_db_replicator.py | 46 +- test/unit/common/test_utils.py | 47 + test/unit/container/test_backend.py | 69 + test/unit/container/test_sharder.py | 4580 +++++++++++++++++ test/unit/proxy/controllers/test_container.py | 85 + 24 files changed, 9640 insertions(+), 48 deletions(-) create mode 100755 bin/swift-container-sharder create mode 100644 doc/saio/swift/internal-client.conf create mode 100644 swift/cli/manage_shard_ranges.py create mode 100644 swift/cli/shard-info.py create mode 100644 swift/container/sharder.py create mode 100644 test/probe/test_sharder.py create mode 100644 test/unit/cli/test_manage_shard_ranges.py create mode 100644 test/unit/container/test_sharder.py diff --git a/bin/swift-container-sharder b/bin/swift-container-sharder new file mode 100755 index 0000000000..3e6551319b --- /dev/null +++ b/bin/swift-container-sharder @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# Copyright (c) 2010-2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.container.sharder import ContainerSharder +from swift.common.utils import parse_options +from swift.common.daemon import run_daemon +from optparse import OptionParser + +if __name__ == '__main__': + parser = OptionParser("%prog CONFIG [options]") + parser.add_option('-d', '--devices', + help='Shard containers only on given devices. ' + 'Comma-separated list. ' + 'Only has effect if --once is used.') + parser.add_option('-p', '--partitions', + help='Shard containers only in given partitions. ' + 'Comma-separated list. ' + 'Only has effect if --once is used.') + conf_file, options = parse_options(parser=parser, once=True) + run_daemon(ContainerSharder, conf_file, **options) diff --git a/doc/saio/swift/container-server/1.conf b/doc/saio/swift/container-server/1.conf index 5bf3c0f28c..e71a5b6683 100644 --- a/doc/saio/swift/container-server/1.conf +++ b/doc/saio/swift/container-server/1.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/2.conf b/doc/saio/swift/container-server/2.conf index 0b29ada029..86e58a9fde 100644 --- 
a/doc/saio/swift/container-server/2.conf +++ b/doc/saio/swift/container-server/2.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/3.conf b/doc/saio/swift/container-server/3.conf index 9f340d07e6..73e760af15 100644 --- a/doc/saio/swift/container-server/3.conf +++ b/doc/saio/swift/container-server/3.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/4.conf b/doc/saio/swift/container-server/4.conf index 5e95e9c57c..c254191b8f 100644 --- a/doc/saio/swift/container-server/4.conf +++ b/doc/saio/swift/container-server/4.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 
+cleave_batch_size = 2 diff --git a/doc/saio/swift/internal-client.conf b/doc/saio/swift/internal-client.conf new file mode 100644 index 0000000000..052d1e7549 --- /dev/null +++ b/doc/saio/swift/internal-client.conf @@ -0,0 +1,24 @@ +[DEFAULT] + +[pipeline:main] +pipeline = catch_errors proxy-logging cache symlink proxy-server + +[app:proxy-server] +use = egg:swift#proxy +account_autocreate = true +# See proxy-server.conf-sample for options + +[filter:symlink] +use = egg:swift#symlink +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index 4059e39418..7d38deb0c5 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -69,6 +69,10 @@ bind_port = 6201 # Work only with ionice_class. # ionice_class = # ionice_priority = +# +# The prefix used for hidden auto-created accounts, for example accounts in +# which shard containers are created. Defaults to '.'. +# auto_create_account_prefix = . [pipeline:main] pipeline = healthcheck recon container-server @@ -323,3 +327,117 @@ use = egg:swift#xprofile # # unwind the iterator of applications # unwind = false + +[container-sharder] +# You can override the default log routing for this app here (don't use set!): +# log_name = container-sharder +# log_facility = LOG_LOCAL0 +# log_level = INFO +# log_address = /dev/log +# +# Container sharder specific settings +# +# If the auto_shard option is true then the sharder will automatically select +# containers to shard, scan for shard ranges, and select shards to shrink. +# The default is false. +# Warning: auto-sharding is still under development and should not be used in +# production; do not set this option to true in a production cluster. 
+# auto_shard = false +# +# When auto-sharding is enabled shard_container_threshold defines the object +# count at which a container with container-sharding enabled will start to +# shard. shard_container_threshold also indirectly determines the initial +# nominal size of shard containers, which is shard_container_threshold // 2, as +# well as determining the thresholds for shrinking and merging shard +# containers. +# shard_container_threshold = 1000000 +# +# When auto-sharding is enabled shard_shrink_point defines the object count +# below which a 'donor' shard container will be considered for shrinking into +# another 'acceptor' shard container. shard_shrink_point is a percentage of +# shard_container_threshold e.g. the default value of 5 means 5% of the +# shard_container_threshold. +# shard_shrink_point = 5 +# +# When auto-sharding is enabled shard_shrink_merge_point defines the maximum +# allowed size of an acceptor shard container after having a donor merged into +# it. Shard_shrink_merge_point is a percentage of shard_container_threshold. +# e.g. the default value of 75 means that the projected sum of a donor object +# count and acceptor count must be less than 75% of shard_container_threshold +# for the donor to be allowed to merge into the acceptor. +# +# For example, if the shard_container_threshold is 1 million, +# shard_shrink_point is 5, and shard_shrink_merge_point is 75 then a shard will +# be considered for shrinking if it has less than or equal to 50 thousand +# objects but will only merge into an acceptor if the combined object count +# would be less than or equal to 750 thousand objects. +# shard_shrink_merge_point = 75 +# +# When auto-sharding is enabled shard_scanner_batch_size defines the maximum +# number of shard ranges that will be found each time the sharder daemon visits +# a sharding container. If necessary the sharder daemon will continue to search +# for more shard ranges each time it visits the container. 
+# shard_scanner_batch_size = 10 +# +# cleave_batch_size defines the number of shard ranges that will be cleaved +# each time the sharder daemon visits a sharding container. +# cleave_batch_size = 2 +# +# cleave_row_batch_size defines the size of batches of object rows read from a +# sharding container and merged to a shard container during cleaving. +# cleave_row_batch_size = 10000 +# +# Defines the number of successfully replicated shard dbs required when +# cleaving a previously uncleaved shard range before the sharder will progress +# to the next shard range. The value should be less than or equal to the +# container ring replica count. The default of 'auto' causes the container ring +# quorum value to be used. This option only applies to the container-sharder +# replication and does not affect the number of shard container replicas that +# will eventually be replicated by the container-replicator. +# shard_replication_quorum = auto +# +# Defines the number of successfully replicated shard dbs required when +# cleaving a shard range that has been previously cleaved on another node +# before the sharder will progress to the next shard range. The value should be +# less than or equal to the container ring replica count. The default of 'auto' +# causes the shard_replication_quorum value to be used. This option only +# applies to the container-sharder replication and does not affect the number +# of shard container replicas that will eventually be replicated by the +# container-replicator. +# existing_shard_replication_quorum = auto +# +# The sharder uses an internal client to create and make requests to +# containers. The absolute path to the client config file can be configured. +# internal_client_conf_path = /etc/swift/internal-client.conf +# +# The number of times the internal client will retry requests. 
+# request_tries = 3 +# +# Each time the sharder dumps stats to the recon cache file it includes a list +# of containers that appear to need sharding but are not yet sharding. By +# default this list is limited to the top 5 containers, ordered by object +# count. The limit may be changed by setting recon_candidates_limit to an +# integer value. A negative value implies no limit. +# recon_candidates_limit = 5 +# +# Large databases tend to take a while to work with, but we want to make sure +# we write down our progress. Use a larger-than-normal broker timeout to make +# us less likely to bomb out on a LockTimeout. +# broker_timeout = 60 +# +# Time in seconds to wait between sharder cycles +# interval = 30 +# +# The container-sharder accepts the following configuration options as defined +# in the container-replicator section: +# +# per_diff = 1000 +# max_diffs = 100 +# concurrency = 8 +# node_timeout = 10 +# conn_timeout = 0.5 +# reclaim_age = 604800 +# rsync_compress = no +# rsync_module = {replication_ip}::container +# recon_cache_path = /var/cache/swift +# diff --git a/setup.cfg b/setup.cfg index 7ed7f1ec17..bc6b1a07c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,6 +36,7 @@ scripts = bin/swift-container-info bin/swift-container-replicator bin/swift-container-server + bin/swift-container-sharder bin/swift-container-sync bin/swift-container-updater bin/swift-container-reconciler @@ -71,6 +72,9 @@ keystone = keystonemiddleware>=4.17.0 [entry_points] +console_scripts = + swift-manage-shard-ranges = swift.cli.manage_shard_ranges:main + paste.app_factory = proxy = swift.proxy.server:app_factory object = swift.obj.server:app_factory diff --git a/swift/cli/manage_shard_ranges.py b/swift/cli/manage_shard_ranges.py new file mode 100644 index 0000000000..acbc364968 --- /dev/null +++ b/swift/cli/manage_shard_ranges.py @@ -0,0 +1,370 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from __future__ import print_function +import argparse +import json +import sys +import time + +from six.moves import input + +from swift.common.utils import Timestamp, get_logger, ShardRange +from swift.container.backend import ContainerBroker, UNSHARDED +from swift.container.sharder import make_shard_ranges, sharding_enabled, \ + CleavingContext + + +def _load_and_validate_shard_data(args): + try: + with open(args.input, 'rb') as fd: + try: + data = json.load(fd) + if not isinstance(data, list): + raise ValueError('Shard data must be a list of dicts') + for k in ('lower', 'upper', 'index', 'object_count'): + for shard in data: + shard[k] + return data + except (TypeError, ValueError, KeyError) as err: + print('Failed to load valid shard range data: %r' % err, + file=sys.stderr) + exit(2) + except IOError as err: + print('Failed to open file %s: %s' % (args.input, err), + file=sys.stderr) + exit(2) + + +def _check_shard_ranges(own_shard_range, shard_ranges): + reasons = [] + + def reason(x, y): + if x != y: + reasons.append('%s != %s' % (x, y)) + + if not shard_ranges: + reasons.append('No shard ranges.') + else: + reason(own_shard_range.lower, shard_ranges[0].lower) + reason(own_shard_range.upper, shard_ranges[-1].upper) + for x, y in zip(shard_ranges, shard_ranges[1:]): + reason(x.upper, y.lower) + + if reasons: + print('WARNING: invalid shard ranges: %s.' % reasons) + print('Aborting.') + exit(2) + + +def _check_own_shard_range(broker, args): + # TODO: this check is weak - if the shards prefix changes then we may not + # identify a shard container. 
The goal is to not inadvertently create an + # entire namespace default shard range for a shard container. + is_shard = broker.account.startswith(args.shards_account_prefix) + own_shard_range = broker.get_own_shard_range(no_default=is_shard) + if not own_shard_range: + print('WARNING: shard container missing own shard range.') + print('Aborting.') + exit(2) + return own_shard_range + + +def _find_ranges(broker, args, status_file=None): + start = last_report = time.time() + limit = 5 if status_file else -1 + shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, limit=limit) + if shard_data: + while not last_found: + if last_report + 10 < time.time(): + print('Found %d ranges in %gs; looking for more...' % ( + len(shard_data), time.time() - start), file=status_file) + last_report = time.time() + # prefix doesn't matter since we aren't persisting it + found_ranges = make_shard_ranges(broker, shard_data, '.shards_') + more_shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, existing_ranges=found_ranges, limit=5) + shard_data.extend(more_shard_data) + return shard_data, time.time() - start + + +def find_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stderr) + print(json.dumps(shard_data, sort_keys=True, indent=2)) + print('Found %d ranges in %gs (total object count %s)' % + (len(shard_data), delta_t, + sum(r['object_count'] for r in shard_data)), + file=sys.stderr) + return 0 + + +def show_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges( + include_deleted=getattr(args, 'include_deleted', False)) + shard_data = [dict(sr, state=sr.state_text) + for sr in shard_ranges] + + if not shard_data: + print("No shard data found.", file=sys.stderr) + elif getattr(args, 'brief', False): + print("Existing shard ranges:", file=sys.stderr) + print(json.dumps([(sd['lower'], sd['upper']) for sd in shard_data], + sort_keys=True, indent=2)) + else: + print("Existing shard ranges:", file=sys.stderr) 
+ print(json.dumps(shard_data, sort_keys=True, indent=2)) + return 0 + + +def db_info(broker, args): + print('Sharding enabled = %s' % sharding_enabled(broker)) + own_sr = broker.get_own_shard_range(no_default=True) + print('Own shard range: %s' % + (json.dumps(dict(own_sr, state=own_sr.state_text), + sort_keys=True, indent=2) + if own_sr else None)) + db_state = broker.get_db_state() + print('db_state = %s' % db_state) + if db_state == 'sharding': + print('Retiring db id: %s' % broker.get_brokers()[0].get_info()['id']) + print('Cleaving context: %s' % + json.dumps(dict(CleavingContext.load(broker)), + sort_keys=True, indent=2)) + print('Metadata:') + for k, (v, t) in broker.metadata.items(): + print(' %s = %s' % (k, v)) + + +def delete_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges() + if not shard_ranges: + print("No shard ranges found to delete.") + return 0 + + while not args.force: + print('This will delete existing %d shard ranges.' % len(shard_ranges)) + if broker.get_db_state() != UNSHARDED: + print('WARNING: Be very cautious about deleting existing shard ' + 'ranges. Deleting all ranges in this db does not guarantee ' + 'deletion of all ranges on all replicas of the db.') + print(' - this db is in state %s' % broker.get_db_state()) + print(' - %d existing shard ranges have started sharding' % + [sr.state != ShardRange.FOUND + for sr in shard_ranges].count(True)) + choice = input('Do you want to show the existing ranges [s], ' + 'delete the existing ranges [yes] ' + 'or quit without deleting [q]? ') + if choice == 's': + show_shard_ranges(broker, args) + continue + elif choice == 'q': + return 1 + elif choice == 'yes': + break + else: + print('Please make a valid choice.') + print() + + now = Timestamp.now() + for sr in shard_ranges: + sr.deleted = 1 + sr.timestamp = now + broker.merge_shard_ranges(shard_ranges) + print('Deleted %s existing shard ranges.' 
% len(shard_ranges)) + return 0 + + +def _replace_shard_ranges(broker, args, shard_data, timeout=None): + own_shard_range = _check_own_shard_range(broker, args) + shard_ranges = make_shard_ranges( + broker, shard_data, args.shards_account_prefix) + _check_shard_ranges(own_shard_range, shard_ranges) + + if args.verbose > 0: + print('New shard ranges to be injected:') + print(json.dumps([dict(sr) for sr in shard_ranges], + sort_keys=True, indent=2)) + + # Crank up the timeout in an effort to *make sure* this succeeds + with broker.updated_timeout(max(timeout, args.replace_timeout)): + delete_shard_ranges(broker, args) + broker.merge_shard_ranges(shard_ranges) + + print('Injected %d shard ranges.' % len(shard_ranges)) + print('Run container-replicator to replicate them to other nodes.') + if args.enable: + return enable_sharding(broker, args) + else: + print('Use the enable sub-command to enable sharding.') + return 0 + + +def replace_shard_ranges(broker, args): + shard_data = _load_and_validate_shard_data(args) + return _replace_shard_ranges(broker, args, shard_data) + + +def find_replace_shard_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stdout) + # Since we're trying to one-shot this, and the previous step probably + # took a while, make the timeout for writing *at least* that long + return _replace_shard_ranges(broker, args, shard_data, timeout=delta_t) + + +def _enable_sharding(broker, own_shard_range, args): + if own_shard_range.update_state(ShardRange.SHARDING): + own_shard_range.epoch = Timestamp.now() + own_shard_range.state_timestamp = own_shard_range.epoch + + with broker.updated_timeout(args.enable_timeout): + broker.merge_shard_ranges([own_shard_range]) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('True', Timestamp.now().normal)}) + return own_shard_range + + +def enable_sharding(broker, args): + own_shard_range = _check_own_shard_range(broker, args) + _check_shard_ranges(own_shard_range, 
def _make_parser():
    """
    Build the command line parser for managing shard ranges.

    The parser takes the path to a container db followed by one of the
    sub-commands ``find``, ``delete``, ``show``, ``info``, ``replace``,
    ``find_and_replace`` or ``enable``. Each sub-command stores the function
    that implements it as ``func`` in the parsed namespace.

    :returns: an :class:`argparse.ArgumentParser`
    """
    parser = argparse.ArgumentParser(description='Manage shard ranges')
    parser.add_argument('container_db')
    # default=0: a 'count' option is otherwise None when the flag is absent,
    # which cannot be ordered against an int on Python 3
    parser.add_argument('--verbose', '-v', action='count', default=0,
                        help='Increase output verbosity')
    subparsers = parser.add_subparsers(
        help='Sub-command help', title='Sub-commands')

    # find
    find_parser = subparsers.add_parser(
        'find', help='Find and display shard ranges')
    _add_find_args(find_parser)
    find_parser.set_defaults(func=find_ranges)

    # delete
    delete_parser = subparsers.add_parser(
        'delete', help='Delete all existing shard ranges from db')
    delete_parser.add_argument(
        '--force', '-f', action='store_true', default=False,
        help='Delete existing shard ranges; no questions asked.')
    delete_parser.set_defaults(func=delete_shard_ranges)

    # show
    show_parser = subparsers.add_parser(
        'show', help='Print shard range data')
    show_parser.add_argument(
        '--include_deleted', '-d', action='store_true', default=False,
        help='Include deleted shard ranges in output.')
    show_parser.add_argument(
        '--brief', '-b', action='store_true', default=False,
        help='Show only shard range bounds in output.')
    show_parser.set_defaults(func=show_shard_ranges)

    # info
    info_parser = subparsers.add_parser(
        'info', help='Print container db info')
    info_parser.set_defaults(func=db_info)

    # replace
    replace_parser = subparsers.add_parser(
        'replace',
        help='Replace existing shard ranges. User will be prompted before '
        'deleting any existing shard ranges.')
    replace_parser.add_argument('input', metavar='input_file',
                                type=str, help='Name of file')
    _add_replace_args(replace_parser)
    # replace accepts --enable, and enabling reads args.enable_timeout, so
    # the enable options must be defined for this sub-command too
    _add_enable_args(replace_parser)
    replace_parser.set_defaults(func=replace_shard_ranges)

    # find_and_replace
    find_replace_parser = subparsers.add_parser(
        'find_and_replace',
        help='Find new shard ranges and replace existing shard ranges. '
        'User will be prompted before deleting any existing shard ranges.'
    )
    _add_find_args(find_replace_parser)
    _add_replace_args(find_replace_parser)
    _add_enable_args(find_replace_parser)
    find_replace_parser.set_defaults(func=find_replace_shard_ranges)

    # enable
    enable_parser = subparsers.add_parser(
        'enable', help='Enable sharding and move db to sharding state.')
    _add_enable_args(enable_parser)
    enable_parser.set_defaults(func=enable_sharding)
    _add_replace_args(enable_parser)
    return parser
def collect_brokers(conf_path, names2nodes):
    """
    Find the container dbs on local devices and index a broker for each.

    The ``container-replicator`` section of the given conf supplies the
    devices root and the swift dir from which the container ring is loaded.
    Every db found under an existing container datadir is recorded in
    ``names2nodes`` as
    ``names2nodes[broker path][(node id, node index)] = broker``, where the
    node index is the string ``'handoff'`` if the node is not one of the
    partition's ring nodes.

    :param conf_path: path to a container server conf
    :param names2nodes: mapping that is updated in place; entries are
        assigned via ``names2nodes[key][...]`` so it must create missing
        keys on access (e.g. ``defaultdict(dict)``)
    :returns: ``names2nodes``
    """
    conf = utils.readconf(conf_path, 'container-replicator')
    root = conf.get('devices', '/srv/node')
    swift_dir = conf.get('swift_dir', '/etc/swift')
    c_ring = ring.Ring(swift_dir, ring_name='container')
    dirs = []
    for node in c_ring.devs:
        if node is None:
            continue
        datadir = os.path.join(root, node['device'], DATADIR)
        if os.path.isdir(datadir):
            # third element is a filter that accepts everything
            dirs.append((datadir, node['id'], lambda *args: True))
    for part, object_file, node_id in roundrobin_datadirs(dirs):
        broker = ContainerBroker(object_file)
        for node in c_ring.get_part_nodes(int(part)):
            if node['id'] == node_id:
                node_index = str(node['index'])
                break
        else:
            node_index = 'handoff'
        names2nodes[broker_key(broker)][(node_id, node_index)] = broker
    # return the mapping that was actually populated rather than a
    # separate, always-empty one
    return names2nodes
def print_shard_range(node, sr, indent_level):
    """
    Print a one line description of a shard range.

    :param node: an indexable whose ``node[1][0]`` is printed in parentheses
        at the start of the line (presumably a ``(node id, node index)``
        key; confirm against callers)
    :param sr: the shard range to describe
    :param indent_level: number of ``TAB`` units to indent the line by
    """
    bounds = '%r - %r' % (sr.lower, sr.upper)
    fields = [indent_level * TAB, node[1][0], bounds]
    fields += [sr.object_count, sr.bytes_used]
    # each timestamp is shown in both its isoformat and internal forms
    fields += [sr.timestamp.isoformat, sr.timestamp.internal]
    fields += [sr.meta_timestamp.isoformat, sr.meta_timestamp.internal]
    fields += [sr.state_text, sr.state_timestamp.isoformat,
               sr.state_timestamp.internal]
    fields += [sr.deleted, sr.name]
    print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), '
          'modified: %s (%s), %7s: %s (%s), deleted: %s %s' % tuple(fields))
def run(conf_paths):
    """
    Print sharding details for every root container found on local devices.

    Brokers are collected for each conf path, then every container name for
    which at least one broker reports being a root container is printed
    with ``print_container``.

    :param conf_paths: an iterable of container server conf paths
    """
    # container_name -> (node id, node index) -> broker
    name2nodes2brokers = defaultdict(dict)
    for conf_path in conf_paths:
        collect_brokers(conf_path, name2nodes2brokers)

    print('First column on each line is (node index)\n')
    # Iterate over a snapshot: print_container indexes this defaultdict by
    # shard name, which adds a key for any shard without a collected broker,
    # and resizing a dict while iterating its live view raises RuntimeError
    # on Python 3.
    for name, node2broker in list(name2nodes2brokers.items()):
        # query every broker, then test whether any of them is a root
        root_flags = [broker.is_root_container()
                      for broker in node2broker.values()]
        if any(root_flags):
            print_container(name, name2nodes2brokers)
'__main__': + conf_dir = '/etc/swift/container-server' + conf_paths = [os.path.join(conf_dir, p) for p in os.listdir(conf_dir) + if p.endswith(('conf', 'conf.d'))] + run(conf_paths) diff --git a/swift/common/manager.py b/swift/common/manager.py index 330f8310f4..71f9e689b3 100644 --- a/swift/common/manager.py +++ b/swift/common/manager.py @@ -34,7 +34,7 @@ PROC_DIR = '/proc' ALL_SERVERS = ['account-auditor', 'account-server', 'container-auditor', 'container-replicator', 'container-reconciler', - 'container-server', 'container-sync', + 'container-server', 'container-sharder', 'container-sync', 'container-updater', 'object-auditor', 'object-server', 'object-expirer', 'object-replicator', 'object-reconstructor', 'object-updater', @@ -637,13 +637,16 @@ class Server(object): {'server': self.server, 'pid': pid, 'conf': conf_file}) return 0 - def spawn(self, conf_file, once=False, wait=True, daemon=True, **kwargs): + def spawn(self, conf_file, once=False, wait=True, daemon=True, + additional_args=None, **kwargs): """Launch a subprocess for this server. 
def config_float_value(value, minimum=None, maximum=None):
    """
    Convert a config option to a float, optionally checking its bounds.

    A result equal to ``minimum`` or ``maximum`` is accepted.

    :param value: the raw option value; anything accepted by ``float()``
    :param minimum: if not None, results less than this are rejected
    :param maximum: if not None, results greater than this are rejected
    :returns: ``value`` converted to a float
    :raises ValueError: if ``value`` cannot be converted to a float or the
        result is outside the given bounds
    """
    try:
        number = float(value)
        too_small = minimum is not None and number < minimum
        too_large = maximum is not None and number > maximum
        acceptable = not (too_small or too_large)
    except (TypeError, ValueError):
        # conversion or comparison failed; report it like a bad value
        acceptable = False
    if acceptable:
        return number

    if minimum is not None:
        lower_hint = ', greater than %s' % minimum
    else:
        lower_hint = ''
    if maximum is not None:
        upper_hint = ', less than %s' % maximum
    else:
        upper_hint = ''
    raise ValueError('Config option must be a number%s%s, not "%s".' %
                     (lower_hint, upper_hint, value))
+ + :param lower: defines the lower bound of object names that will be + removed; names greater than this value will be removed; names less + than or equal to this value will not be removed. + :param upper: defines the upper bound of object names that will be + removed; names less than or equal to this value will be removed; + names greater than this value will not be removed. The empty string + is interpreted as there being no upper bound. + :param max_row: if specified only rows less than or equal to max_row + will be removed + """ + query_conditions = [] + query_args = [] + if max_row is not None: + query_conditions.append('ROWID <= ?') + query_args.append(str(max_row)) + if lower: + query_conditions.append('name > ?') + query_args.append(lower) + if upper: + query_conditions.append('name <= ?') + query_args.append(upper) + + query = 'DELETE FROM object WHERE deleted in (0, 1)' + if query_conditions: + query += ' AND ' + ' AND '.join(query_conditions) + + with self.get() as conn: + conn.execute(query, query_args) + conn.commit() + def _is_deleted_info(self, object_count, put_timestamp, delete_timestamp, **kwargs): """ diff --git a/swift/container/sharder.py b/swift/container/sharder.py new file mode 100644 index 0000000000..06c2b6d9db --- /dev/null +++ b/swift/container/sharder.py @@ -0,0 +1,1568 @@ +# Copyright (c) 2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import errno +import json +import time +from collections import defaultdict +from random import random + +import os +import six +from eventlet import Timeout + +from swift.common import internal_client, db_replicator +from swift.common.constraints import check_drive +from swift.common.direct_client import (direct_put_container, + DirectClientException) +from swift.common.exceptions import DeviceUnavailable +from swift.common.ring.utils import is_local_device +from swift.common.utils import get_logger, config_true_value, \ + dump_recon_cache, whataremyips, Timestamp, ShardRange, GreenAsyncPile, \ + config_float_value, config_positive_int_value, \ + quorum_size, parse_override_options, Everything, config_auto_int_value +from swift.container.backend import ContainerBroker, \ + RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, COLLAPSED, \ + SHARD_UPDATE_STATES +from swift.container.replicator import ContainerReplicator + + +def sharding_enabled(broker): + # NB all shards will by default have been created with + # X-Container-Sysmeta-Sharding set and will therefore be candidates for + # sharding, along with explicitly configured root containers. 
def find_missing_ranges(shard_ranges):
    """
    Find any ranges in the entire object namespace that are not covered by any
    shard range in the given list.

    Only the first entry, the last entry and each pair of adjacent entries
    are compared, so the list is presumably expected to be in namespace
    order (confirm against callers).

    :param shard_ranges: A list of :class:`~swift.utils.ShardRange`
    :return: a list of missing ranges, each a ``(lower, upper)`` tuple; an
        empty input yields a single range spanning the whole namespace
    """
    if not shard_ranges:
        # return a list here too, so every code path has the same type
        return [(ShardRange.MIN, ShardRange.MAX)]
    gaps = []
    if shard_ranges[0].lower > ShardRange.MIN:
        gaps.append((ShardRange.MIN, shard_ranges[0].lower))
    for first, second in zip(shard_ranges, shard_ranges[1:]):
        if first.upper < second.lower:
            gaps.append((first.upper, second.lower))
    if shard_ranges[-1].upper < ShardRange.MAX:
        gaps.append((shard_ranges[-1].upper, ShardRange.MAX))
    return gaps
def find_sharding_candidates(broker, threshold, shard_ranges=None):
    """
    Find shard ranges that should themselves be sharded.

    Each shard range accepted by ``is_sharding_candidate`` for the given
    threshold is moved to the SHARDING state with a new state timestamp,
    and its epoch is set to its state timestamp. The decision relies only
    on the object counts already recorded in the shard ranges.

    :param broker: the broker whose ACTIVE shard ranges are examined when
        ``shard_ranges`` is None; this is intended for root containers
    :param threshold: passed to ``is_sharding_candidate`` for each range
    :param shard_ranges: optional list of shard ranges to examine instead
        of fetching them from the broker
    :returns: a list of the shard ranges that were updated
    """
    if shard_ranges is None:
        shard_ranges = broker.get_shard_ranges(states=[ShardRange.ACTIVE])

    selected = []
    for candidate in shard_ranges:
        if is_sharding_candidate(candidate, threshold):
            candidate.update_state(ShardRange.SHARDING,
                                   state_timestamp=Timestamp.now())
            candidate.epoch = candidate.state_timestamp
            selected.append(candidate)
    return selected
+ # TODO: object counts may well not be the appropriate metric for + # deciding to shrink because a shard with low object_count may have a + # large number of deleted object rows that will need to be merged with + # a neighbour. We may need to expose row count as well as object count. + shard_ranges = broker.get_shard_ranges() + own_shard_range = broker.get_own_shard_range() + if len(shard_ranges) == 1: + # special case to enable final shard to shrink into root + shard_ranges.append(own_shard_range) + + merge_pairs = {} + for donor, acceptor in zip(shard_ranges, shard_ranges[1:]): + if donor in merge_pairs: + # this range may already have been made an acceptor; if so then + # move on. In principle it might be that even after expansion + # this range and its donor(s) could all be merged with the next + # range. In practice it is much easier to reason about a single + # donor merging into a single acceptor. Don't fret - eventually + # all the small ranges will be retired. + continue + if (acceptor.name != own_shard_range.name and + acceptor.state != ShardRange.ACTIVE): + # don't shrink into a range that is not yet ACTIVE + continue + if donor.state not in (ShardRange.ACTIVE, ShardRange.SHRINKING): + # found? created? sharded? don't touch it + continue + + proposed_object_count = donor.object_count + acceptor.object_count + if (donor.state == ShardRange.SHRINKING or + (donor.object_count < shrink_threshold and + proposed_object_count < merge_size)): + # include previously identified merge pairs on presumption that + # following shrink procedure is idempotent + merge_pairs[acceptor] = donor + if donor.update_state(ShardRange.SHRINKING): + # Set donor state to shrinking so that next cycle won't use + # it as an acceptor; state_timestamp defines new epoch for + # donor and new timestamp for the expanded acceptor below. 
class CleavingContext(object):
    """
    Tracks the progress of cleaving a db.

    Iterating over an instance yields ``(name, value)`` pairs, so
    ``dict(context)`` gives a form that ``store`` serializes to JSON and
    ``load`` passes back to the constructor.
    """

    def __init__(self, ref, cursor='', max_row=None, cleave_to_row=None,
                 last_cleave_to_row=None, cleaving_done=False,
                 misplaced_done=False, ranges_done=0, ranges_todo=0):
        self.ref = ref
        self._cursor = None
        # assigned via the property so that the value is encoded
        self.cursor = cursor
        self.max_row = max_row
        self.cleave_to_row = cleave_to_row
        self.last_cleave_to_row = last_cleave_to_row
        self.cleaving_done = cleaving_done
        self.misplaced_done = misplaced_done
        self.ranges_done = ranges_done
        self.ranges_todo = ranges_todo

    def __iter__(self):
        yield 'ref', self.ref
        yield 'cursor', self.cursor
        yield 'max_row', self.max_row
        yield 'cleave_to_row', self.cleave_to_row
        yield 'last_cleave_to_row', self.last_cleave_to_row
        yield 'cleaving_done', self.cleaving_done
        yield 'misplaced_done', self.misplaced_done
        yield 'ranges_done', self.ranges_done
        yield 'ranges_todo', self.ranges_todo

    @classmethod
    def _encode(cls, value):
        """
        Return ``value`` utf-8 encoded if it is a text type under Python 2,
        otherwise return it unchanged.
        """
        if value is not None and six.PY2 and isinstance(value, six.text_type):
            return value.encode('utf-8')
        return value

    @property
    def cursor(self):
        return self._cursor

    @cursor.setter
    def cursor(self, value):
        self._cursor = self._encode(value)

    @property
    def marker(self):
        """The cursor with a ``'\\x00'`` character appended."""
        return self.cursor + '\x00'

    @classmethod
    def _make_ref(cls, broker):
        return broker.get_info()['id']

    @classmethod
    def load(cls, broker):
        """
        Returns a context for tracking the progress of cleaving this
        broker's retiring DB. The context is persisted in sysmeta using a key
        that is based off the id of the first broker returned by
        ``broker.get_brokers()``, so a stored context is only loaded for a db
        with the same id as when the context was stored; otherwise a new
        context is returned.

        The ``ref`` and ``max_row`` of the returned context are always set
        from the current db, replacing any stored values.

        :return: An instance of :class:`CleavingContext`.
        """
        brokers = broker.get_brokers()
        ref = cls._make_ref(brokers[0])
        data = brokers[-1].get_sharding_sysmeta('Context-' + ref)
        data = json.loads(data) if data else {}
        data['ref'] = ref
        data['max_row'] = brokers[0].get_max_row()
        return cls(**data)

    def store(self, broker):
        """
        Persist this context as JSON in the given broker's sharding sysmeta,
        under the key that ``load`` reads.
        """
        broker.set_sharding_sysmeta('Context-' + self.ref,
                                    json.dumps(dict(self)))

    def reset(self):
        """
        Clear the cursor, range counts and both done flags, and remember the
        current ``cleave_to_row`` as ``last_cleave_to_row``.
        """
        self.cursor = ''
        self.ranges_done = 0
        self.ranges_todo = 0
        self.cleaving_done = False
        self.misplaced_done = False
        self.last_cleave_to_row = self.cleave_to_row

    def start(self):
        """
        Clear the cursor, range counts and ``cleaving_done`` flag, and set
        ``cleave_to_row`` to the current ``max_row``.
        """
        self.cursor = ''
        self.ranges_done = 0
        self.ranges_todo = 0
        self.cleaving_done = False
        self.cleave_to_row = self.max_row

    def done(self):
        """
        Return True if both done flags are set and ``cleave_to_row`` equals
        ``max_row``.
        """
        return all((self.misplaced_done, self.cleaving_done,
                    self.max_row == self.cleave_to_row))
self.shrink_merge_point = percent_value('shard_shrink_merge_point', + DEFAULT_SHARD_MERGE_POINT) + self.shard_container_threshold = config_positive_int_value( + conf.get('shard_container_threshold', + DEFAULT_SHARD_CONTAINER_THRESHOLD)) + self.shrink_size = (self.shard_container_threshold * + self.shard_shrink_point) + self.merge_size = (self.shard_container_threshold * + self.shrink_merge_point) + self.split_size = self.shard_container_threshold // 2 + self.scanner_batch_size = config_positive_int_value( + conf.get('shard_scanner_batch_size', 10)) + self.cleave_batch_size = config_positive_int_value( + conf.get('cleave_batch_size', 2)) + self.cleave_row_batch_size = config_positive_int_value( + conf.get('cleave_row_batch_size', 10000)) + self.auto_shard = config_true_value(conf.get('auto_shard', False)) + self.sharding_candidates = [] + self.recon_candidates_limit = int( + conf.get('recon_candidates_limit', 5)) + self.broker_timeout = config_positive_int_value( + conf.get('broker_timeout', 60)) + replica_count = self.ring.replica_count + quorum = quorum_size(replica_count) + self.shard_replication_quorum = config_auto_int_value( + conf.get('shard_replication_quorum'), quorum) + if self.shard_replication_quorum > replica_count: + self.logger.warning( + 'shard_replication_quorum of %s exceeds replica count %s' + ', reducing to %s', self.shard_replication_quorum, + replica_count, replica_count) + self.shard_replication_quorum = replica_count + self.existing_shard_replication_quorum = config_auto_int_value( + conf.get('existing_shard_replication_quorum'), + self.shard_replication_quorum) + if self.existing_shard_replication_quorum > replica_count: + self.logger.warning( + 'existing_shard_replication_quorum of %s exceeds replica count' + ' %s, reducing to %s', self.existing_shard_replication_quorum, + replica_count, replica_count) + self.existing_shard_replication_quorum = replica_count + + # internal client + self.conn_timeout = float(conf.get('conn_timeout', 5)) + 
request_tries = config_positive_int_value( + conf.get('request_tries', 3)) + internal_client_conf_path = conf.get('internal_client_conf_path', + '/etc/swift/internal-client.conf') + try: + self.int_client = internal_client.InternalClient( + internal_client_conf_path, + 'Swift Container Sharder', + request_tries, + allow_modify_pipeline=False) + except IOError as err: + if err.errno != errno.ENOENT: + raise + raise SystemExit( + 'Unable to load internal client from config: %r (%s)' % + (internal_client_conf_path, err)) + self.reported = 0 + + def _zero_stats(self): + """Zero out the stats.""" + super(ContainerSharder, self)._zero_stats() + # all sharding stats that are additional to the inherited replicator + # stats are maintained under the 'sharding' key in self.stats + self.stats['sharding'] = defaultdict(lambda: defaultdict(int)) + self.sharding_candidates = [] + + def _append_stat(self, category, key, value): + if not self.stats['sharding'][category][key]: + self.stats['sharding'][category][key] = list() + self.stats['sharding'][category][key].append(value) + + def _min_stat(self, category, key, value): + current = self.stats['sharding'][category][key] + if not current: + self.stats['sharding'][category][key] = value + else: + self.stats['sharding'][category][key] = min(current, value) + + def _max_stat(self, category, key, value): + current = self.stats['sharding'][category][key] + if not current: + self.stats['sharding'][category][key] = value + else: + self.stats['sharding'][category][key] = max(current, value) + + def _increment_stat(self, category, key, step=1, statsd=False): + self.stats['sharding'][category][key] += step + if statsd: + statsd_key = '%s_%s' % (category, key) + self.logger.increment(statsd_key) + + def _make_stats_info(self, broker, node, own_shard_range): + try: + file_size = os.stat(broker.db_file).st_size + except OSError: + file_size = None + + return {'path': broker.db_file, + 'node_index': node.get('index'), + 'account': 
broker.account, + 'container': broker.container, + 'root': broker.root_path, + 'object_count': own_shard_range.object_count, + 'meta_timestamp': own_shard_range.meta_timestamp.internal, + 'file_size': file_size} + + def _identify_sharding_candidate(self, broker, node): + own_shard_range = broker.get_own_shard_range() + if is_sharding_candidate( + own_shard_range, self.shard_container_threshold): + self.sharding_candidates.append( + self._make_stats_info(broker, node, own_shard_range)) + + def _transform_sharding_candidate_stats(self): + category = self.stats['sharding']['sharding_candidates'] + candidates = self.sharding_candidates + category['found'] = len(candidates) + candidates.sort(key=lambda c: c['object_count'], reverse=True) + if self.recon_candidates_limit >= 0: + category['top'] = candidates[:self.recon_candidates_limit] + else: + category['top'] = candidates + + def _record_sharding_progress(self, broker, node, error): + own_shard_range = broker.get_own_shard_range() + if (broker.get_db_state() in (UNSHARDED, SHARDING) and + own_shard_range.state in (ShardRange.SHARDING, + ShardRange.SHARDED)): + info = self._make_stats_info(broker, node, own_shard_range) + info['state'] = own_shard_range.state_text + info['db_state'] = broker.get_db_state() + states = [ShardRange.FOUND, ShardRange.CREATED, + ShardRange.CLEAVED, ShardRange.ACTIVE] + shard_ranges = broker.get_shard_ranges(states=states) + state_count = {} + for state in states: + state_count[ShardRange.STATES[state]] = 0 + for shard_range in shard_ranges: + state_count[shard_range.state_text] += 1 + info.update(state_count) + info['error'] = error and str(error) + self._append_stat('sharding_in_progress', 'all', info) + + def _report_stats(self): + # report accumulated stats since start of one sharder cycle + default_stats = ('attempted', 'success', 'failure') + category_keys = ( + ('visited', default_stats + ('skipped', 'completed')), + ('scanned', default_stats + ('found', 'min_time', 'max_time')), + 
('created', default_stats), + ('cleaved', default_stats + ('min_time', 'max_time',)), + ('misplaced', default_stats + ('found', 'placed', 'unplaced')), + ('audit_root', default_stats), + ('audit_shard', default_stats), + ) + + now = time.time() + last_report = time.ctime(self.stats['start']) + elapsed = now - self.stats['start'] + sharding_stats = self.stats['sharding'] + for category, keys in category_keys: + stats = sharding_stats[category] + msg = ' '.join(['%s:%s' % (k, str(stats[k])) for k in keys]) + self.logger.info('Since %s %s - %s', last_report, category, msg) + + self._transform_sharding_candidate_stats() + + dump_recon_cache( + {'sharding_stats': self.stats, + 'sharding_time': elapsed, + 'sharding_last': now}, + self.rcache, self.logger) + self.reported = now + + def _periodic_report_stats(self): + if (time.time() - self.reported) >= 3600: # once an hour + self._report_stats() + + def _check_node(self, node): + if not node: + return False + if not is_local_device(self.ips, self.port, + node['replication_ip'], + node['replication_port']): + return False + if not check_drive(self.root, node['device'], + self.mount_check): + self.logger.warning( + 'Skipping %(device)s as it is not mounted' % node) + return False + return True + + def _fetch_shard_ranges(self, broker, newest=False, params=None, + include_deleted=False): + path = self.int_client.make_path(broker.root_account, + broker.root_container) + params = params or {} + params.setdefault('format', 'json') + headers = {'X-Backend-Record-Type': 'shard', + 'X-Backend-Override-Deleted': 'true', + 'X-Backend-Include-Deleted': str(include_deleted)} + if newest: + headers['X-Newest'] = 'true' + try: + try: + resp = self.int_client.make_request( + 'GET', path, headers, acceptable_statuses=(2,), + params=params) + except internal_client.UnexpectedResponse as err: + self.logger.warning("Failed to get shard ranges from %s: %s", + broker.root_path, err) + return None + record_type = 
resp.headers.get('x-backend-record-type') + if record_type != 'shard': + err = 'unexpected record type %r' % record_type + self.logger.error("Failed to get shard ranges from %s: %s", + broker.root_path, err) + return None + + try: + data = json.loads(resp.body) + if not isinstance(data, list): + raise ValueError('not a list') + return [ShardRange.from_dict(shard_range) + for shard_range in data] + except (ValueError, TypeError, KeyError) as err: + self.logger.error( + "Failed to get shard ranges from %s: invalid data: %r", + broker.root_path, err) + return None + finally: + self.logger.txn_id = None + + def _put_container(self, node, part, account, container, headers, body): + try: + direct_put_container(node, part, account, container, + conn_timeout=self.conn_timeout, + response_timeout=self.node_timeout, + headers=headers, contents=body) + except DirectClientException as err: + self.logger.warning( + 'Failed to put shard ranges to %s:%s/%s: %s', + node['ip'], node['port'], node['device'], err.http_status) + except (Exception, Timeout) as err: + self.logger.exception( + 'Failed to put shard ranges to %s:%s/%s: %s', + node['ip'], node['port'], node['device'], err) + else: + return True + return False + + def _send_shard_ranges(self, account, container, shard_ranges, + headers=None): + body = json.dumps([dict(sr) for sr in shard_ranges]) + part, nodes = self.ring.get_nodes(account, container) + headers = headers or {} + headers.update({'X-Backend-Record-Type': RECORD_TYPE_SHARD, + 'User-Agent': 'container-sharder %s' % os.getpid(), + 'X-Timestamp': Timestamp.now().normal, + 'Content-Length': len(body), + 'Content-Type': 'application/json'}) + + pool = GreenAsyncPile(len(nodes)) + for node in nodes: + pool.spawn(self._put_container, node, part, account, + container, headers, body) + + results = pool.waitall(None) + return results.count(True) >= quorum_size(self.ring.replica_count) + + def _get_shard_broker(self, shard_range, root_path, policy_index): + """ + Get a 
broker for a container db for the given shard range. If one of + the shard container's primary nodes is a local device then that will be + chosen for the db, otherwise the first of the shard container's handoff + nodes that is local will be chosen. + + :param shard_range: a :class:`~swift.common.utils.ShardRange` + :param root_path: the path of the shard's root container + :param policy_index: the storage policy index + :returns: a tuple of ``(part, broker, node_id)`` where ``part`` is the + shard container's partition, ``broker`` is an instance of + :class:`~swift.container.backend.ContainerBroker`, + ``node_id`` is the id of the selected node. + """ + part = self.ring.get_part(shard_range.account, shard_range.container) + node = self.find_local_handoff_for_part(part) + if not node: + raise DeviceUnavailable( + 'No mounted devices found suitable for creating shard broker ' + 'for %s in partition %s' % (shard_range.name, part)) + + shard_broker = ContainerBroker.create_broker( + os.path.join(self.root, node['device']), part, shard_range.account, + shard_range.container, epoch=shard_range.epoch, + storage_policy_index=policy_index) + + # Get the valid info into the broker.container, etc + shard_broker.get_info() + shard_broker.merge_shard_ranges(shard_range) + shard_broker.set_sharding_sysmeta('Root', root_path) + shard_broker.update_metadata({ + 'X-Container-Sysmeta-Sharding': + ('True', Timestamp.now().internal)}) + + return part, shard_broker, node['id'] + + def _audit_root_container(self, broker): + # This is the root container, and therefore the tome of knowledge, + # all we can do is check there is nothing screwy with the ranges + self._increment_stat('audit_root', 'attempted') + warnings = [] + own_shard_range = broker.get_own_shard_range() + + if own_shard_range.state in (ShardRange.SHARDING, ShardRange.SHARDED): + shard_ranges = broker.get_shard_ranges() + missing_ranges = find_missing_ranges(shard_ranges) + if missing_ranges: + warnings.append( + 'missing 
range(s): %s' % + ' '.join(['%s-%s' % (lower, upper) + for lower, upper in missing_ranges])) + + for state in ShardRange.STATES: + shard_ranges = broker.get_shard_ranges(states=state) + overlaps = find_overlapping_ranges(shard_ranges) + for overlapping_ranges in overlaps: + warnings.append( + 'overlapping ranges in state %s: %s' % + (ShardRange.STATES[state], + ' '.join(['%s-%s' % (sr.lower, sr.upper) + for sr in overlapping_ranges]))) + + if warnings: + self.logger.warning( + 'Audit failed for root %s (%s): %s' % + (broker.db_file, broker.path, ', '.join(warnings))) + self._increment_stat('audit_root', 'failure', statsd=True) + return False + + self._increment_stat('audit_root', 'success', statsd=True) + return True + + def _audit_shard_container(self, broker): + # Get the root view of the world. + self._increment_stat('audit_shard', 'attempted') + warnings = [] + errors = [] + if not broker.account.startswith(self.shards_account_prefix): + warnings.append('account not in shards namespace %r' % + self.shards_account_prefix) + + own_shard_range = broker.get_own_shard_range(no_default=True) + + shard_range = None + if own_shard_range: + shard_ranges = self._fetch_shard_ranges( + broker, newest=True, + params={'marker': own_shard_range.lower, + 'end_marker': own_shard_range.upper}, + include_deleted=True) + if shard_ranges: + for shard_range in shard_ranges: + if (shard_range.lower == own_shard_range.lower and + shard_range.upper == own_shard_range.upper and + shard_range.name == own_shard_range.name): + break + else: + # this is not necessarily an error - some replicas of the + # root may not yet know about this shard container + warnings.append('root has no matching shard range') + shard_range = None + else: + warnings.append('unable to get shard ranges from root') + else: + errors.append('missing own shard range') + + if warnings: + self.logger.warning( + 'Audit warnings for shard %s (%s): %s' % + (broker.db_file, broker.path, ', '.join(warnings))) + + if errors: 
+ self.logger.warning( + 'Audit failed for shard %s (%s) - skipping: %s' % + (broker.db_file, broker.path, ', '.join(errors))) + self._increment_stat('audit_shard', 'failure', statsd=True) + return False + + if shard_range: + self.logger.debug('Updating shard from root %s', dict(shard_range)) + broker.merge_shard_ranges(shard_range) + own_shard_range = broker.get_own_shard_range() + delete_age = time.time() - self.reclaim_age + if (own_shard_range.state == ShardRange.SHARDED and + own_shard_range.deleted and + own_shard_range.timestamp < delete_age and + broker.empty()): + broker.delete_db(Timestamp.now().internal) + self.logger.debug('Deleted shard container %s (%s)', + broker.db_file, broker.path) + self._increment_stat('audit_shard', 'success', statsd=True) + return True + + def _audit_container(self, broker): + if broker.is_deleted(): + # if the container has been marked as deleted, all metadata will + # have been erased so no point auditing. But we want it to pass, in + # case any objects exist inside it. + return True + if broker.is_root_container(): + return self._audit_root_container(broker) + return self._audit_shard_container(broker) + + def yield_objects(self, broker, src_shard_range, since_row=None): + """ + Iterates through all objects in ``src_shard_range`` in name order + yielding them in lists of up to CONTAINER_LISTING_LIMIT length. + + :param broker: A :class:`~swift.container.backend.ContainerBroker`. + :param src_shard_range: A :class:`~swift.common.utils.ShardRange` + describing the source range. + :param since_row: include only items whose ROWID is greater than + the given row id; by default all rows are included. 
+ :return: a generator of tuples of (list of objects, broker info dict) + """ + for include_deleted in (False, True): + marker = src_shard_range.lower_str + while True: + info = broker.get_info() + info['max_row'] = broker.get_max_row() + start = time.time() + objects = broker.get_objects( + self.cleave_row_batch_size, + marker=marker, + end_marker=src_shard_range.end_marker, + include_deleted=include_deleted, + since_row=since_row) + if objects: + self.logger.debug('got %s objects from %s in %ss', + len(objects), broker.db_file, + time.time() - start) + yield objects, info + + if len(objects) < self.cleave_row_batch_size: + break + marker = objects[-1]['name'] + + def yield_objects_to_shard_range(self, broker, src_shard_range, + dest_shard_ranges): + """ + Iterates through all objects in ``src_shard_range`` to place them in + destination shard ranges provided by the ``dest_shard_ranges`` function. + Yields tuples of (object list, destination shard range in which those + objects belong). Note that the same destination shard range may be + referenced in more than one yielded tuple. + + :param broker: A :class:`~swift.container.backend.ContainerBroker`. + :param src_shard_range: A :class:`~swift.common.utils.ShardRange` + describing the source range. + :param dest_shard_ranges: A function which should return a list of + destination shard ranges in name order. 
+ :return: a generator of tuples of + (object list, shard range, broker info dict) + """ + dest_shard_range_iter = dest_shard_range = None + for objs, info in self.yield_objects(broker, src_shard_range): + if not objs: + return + + def next_or_none(it): + try: + return next(it) + except StopIteration: + return None + + if dest_shard_range_iter is None: + dest_shard_range_iter = iter(dest_shard_ranges()) + dest_shard_range = next_or_none(dest_shard_range_iter) + + unplaced = False + last_index = next_index = 0 + for obj in objs: + if dest_shard_range is None: + # no more destinations: yield remainder of batch and return + # NB there may be more batches of objects but none of them + # will be placed so no point fetching them + yield objs[last_index:], None, info + return + if obj['name'] <= dest_shard_range.lower: + unplaced = True + elif unplaced: + # end of run of unplaced objects, yield them + yield objs[last_index:next_index], None, info + last_index = next_index + unplaced = False + while (dest_shard_range and + obj['name'] > dest_shard_range.upper): + if next_index != last_index: + # yield the objects in current dest_shard_range + yield (objs[last_index:next_index], + dest_shard_range, + info) + last_index = next_index + dest_shard_range = next_or_none(dest_shard_range_iter) + next_index += 1 + + if next_index != last_index: + # yield tail of current batch of objects + # NB there may be more objects for the current + # dest_shard_range in the next batch from yield_objects + yield (objs[last_index:next_index], + None if unplaced else dest_shard_range, + info) + + def _post_replicate_hook(self, broker, info, responses): + # override superclass behaviour + pass + + def _replicate_and_delete(self, broker, dest_shard_range, part, + dest_broker, node_id, info): + success, responses = self._replicate_object( + part, dest_broker.db_file, node_id) + quorum = quorum_size(self.ring.replica_count) + if not success and responses.count(True) < quorum: + self.logger.warning( 
+ 'Failed to sufficiently replicate misplaced objects: %s in %s ' + '(not removing)', dest_shard_range, broker.path) + return False + + if broker.get_info()['id'] != info['id']: + # the db changed - don't remove any objects + success = False + else: + # remove objects up to the max row of the db sampled prior to + # the first object yielded for this destination; objects added + # after that point may not have been yielded and replicated so + # it is not safe to remove them yet + broker.remove_objects( + dest_shard_range.lower_str, + dest_shard_range.upper_str, + max_row=info['max_row']) + success = True + + if not success: + self.logger.warning( + 'Refused to remove misplaced objects: %s in %s', + dest_shard_range, broker.path) + return success + + def _move_objects(self, src_broker, src_shard_range, policy_index, + shard_range_fetcher): + # move objects from src_shard_range in src_broker to destination shard + # ranges provided by shard_range_fetcher + dest_brokers = {} # map shard range -> broker + placed = unplaced = 0 + success = True + for objs, dest_shard_range, info in self.yield_objects_to_shard_range( + src_broker, src_shard_range, shard_range_fetcher): + if not dest_shard_range: + unplaced += len(objs) + success = False + continue + + if dest_shard_range.name == src_broker.path: + self.logger.debug( + 'Skipping source as misplaced objects destination') + # in shrinking context, the misplaced objects might actually be + # correctly placed if the root has expanded this shard but this + # broker has not yet been updated + continue + + if dest_shard_range not in dest_brokers: + part, dest_broker, node_id = self._get_shard_broker( + dest_shard_range, src_broker.root_path, policy_index) + # save the broker info that was sampled prior to the *first* + # yielded objects for this destination + destination = {'part': part, + 'dest_broker': dest_broker, + 'node_id': node_id, + 'info': info} + dest_brokers[dest_shard_range] = destination + else: + destination = 
dest_brokers[dest_shard_range] + destination['dest_broker'].merge_items(objs) + placed += len(objs) + + if unplaced: + self.logger.warning( + 'Failed to find destination for at least %s misplaced objects ' + 'in %s' % (unplaced, src_broker.path)) + + # TODO: consider executing the replication jobs concurrently + for dest_shard_range, dest_args in dest_brokers.items(): + self.logger.debug('moving misplaced objects found in range %s' % + dest_shard_range) + success &= self._replicate_and_delete( + src_broker, dest_shard_range, **dest_args) + + self._increment_stat('misplaced', 'placed', step=placed) + self._increment_stat('misplaced', 'unplaced', step=unplaced) + return success, placed + unplaced + + def _make_shard_range_fetcher(self, broker, src_shard_range): + # returns a function that will lazy load shard ranges on demand; + # this means only one lookup is made for all misplaced ranges. + outer = {} + + def shard_range_fetcher(): + if not outer: + if broker.is_root_container(): + ranges = broker.get_shard_ranges( + marker=src_shard_range.lower_str, + end_marker=src_shard_range.end_marker, + states=SHARD_UPDATE_STATES) + else: + # TODO: the root may not yet know about shard ranges to + # which a shard is sharding, but those could come from + # the broker + ranges = self._fetch_shard_ranges( + broker, newest=True, + params={'states': 'updating', + 'marker': src_shard_range.lower_str, + 'end_marker': src_shard_range.end_marker}) + outer['ranges'] = iter(ranges or []) + return outer['ranges'] + return shard_range_fetcher + + def _make_default_misplaced_object_bounds(self, broker): + # Objects outside of this container's own range are misplaced. 
+ own_shard_range = broker.get_own_shard_range() + bounds = [] + if own_shard_range.lower: + bounds.append(('', own_shard_range.lower)) + if own_shard_range.upper: + bounds.append((own_shard_range.upper, '')) + return bounds + + def _make_misplaced_object_bounds(self, broker): + bounds = [] + state = broker.get_db_state() + if state == SHARDED: + # Anything in the object table is treated as a misplaced object. + bounds.append(('', '')) + + if not bounds and state == SHARDING: + # Objects outside of this container's own range are misplaced. + # Objects in already cleaved shard ranges are also misplaced. + cleave_context = CleavingContext.load(broker) + if cleave_context.cursor: + bounds.append(('', cleave_context.cursor)) + own_shard_range = broker.get_own_shard_range() + if own_shard_range.upper: + bounds.append((own_shard_range.upper, '')) + + return bounds or self._make_default_misplaced_object_bounds(broker) + + def _move_misplaced_objects(self, broker, src_broker=None, + src_bounds=None): + """ + Search for objects in the given broker that do not belong in that + broker's namespace and move those objects to their correct shard + container. + + :param broker: An instance of :class:`swift.container.ContainerBroker`. + :param src_broker: optional alternative broker to use as the source + of misplaced objects; if not specified then ``broker`` is used as + the source. 
+ :param src_bounds: optional list of (lower, upper) namespace bounds to + use when searching for misplaced objects + :return: True if all misplaced objects were sufficiently replicated to + their correct shard containers, False otherwise + """ + self.logger.debug('Looking for misplaced objects in %s (%s)', + broker.path.decode('utf-8'), broker.db_file) + self._increment_stat('misplaced', 'attempted') + src_broker = src_broker or broker + if src_bounds is None: + src_bounds = self._make_misplaced_object_bounds(broker) + # (ab)use ShardRange instances to encapsulate source namespaces + src_ranges = [ShardRange('dont/care', Timestamp.now(), lower, upper) + for lower, upper in src_bounds] + self.logger.debug('misplaced object source bounds %s' % src_bounds) + policy_index = broker.storage_policy_index + success = True + num_found = 0 + for src_shard_range in src_ranges: + part_success, part_num_found = self._move_objects( + src_broker, src_shard_range, policy_index, + self._make_shard_range_fetcher(broker, src_shard_range)) + success &= part_success + num_found += part_num_found + + if num_found: + self._increment_stat('misplaced', 'found', statsd=True) + self.logger.debug('Moved %s misplaced objects' % num_found) + self._increment_stat('misplaced', 'success' if success else 'failure') + self.logger.debug('Finished handling misplaced objects') + return success + + def _find_shard_ranges(self, broker): + """ + Scans the container to find shard ranges and adds them to the shard + ranges table. If there are existing shard ranges then scanning starts + from the upper bound of the uppermost existing shard range. + + :param broker: An instance of :class:`swift.container.ContainerBroker` + :return: the number of shard ranges found by this scan; zero if the scan + had already completed or found no further shard ranges. 
+ """ + own_shard_range = broker.get_own_shard_range() + shard_ranges = broker.get_shard_ranges() + if shard_ranges and shard_ranges[-1].upper >= own_shard_range.upper: + self.logger.debug('Scan already completed for %s', broker.path) + return 0 + + self.logger.info('Starting scan for shard ranges on %s', broker.path) + self._increment_stat('scanned', 'attempted') + + start = time.time() + shard_data, last_found = broker.find_shard_ranges( + self.split_size, limit=self.scanner_batch_size, + existing_ranges=shard_ranges) + elapsed = time.time() - start + + if not shard_data: + if last_found: + self.logger.info("Already found all shard ranges") + self._increment_stat('scanned', 'success', statsd=True) + else: + # we didn't find anything + self.logger.warning("No shard ranges found") + self._increment_stat('scanned', 'failure', statsd=True) + return 0 + + shard_ranges = make_shard_ranges( + broker, shard_data, self.shards_account_prefix) + broker.merge_shard_ranges(shard_ranges) + num_found = len(shard_ranges) + self.logger.info( + "Completed scan for shard ranges: %d found", num_found) + self._increment_stat('scanned', 'found', step=num_found) + self._min_stat('scanned', 'min_time', round(elapsed / num_found, 3)) + self._max_stat('scanned', 'max_time', round(elapsed / num_found, 3)) + + if last_found: + self.logger.info("Final shard range reached.") + self._increment_stat('scanned', 'success', statsd=True) + return num_found + + def _create_shard_containers(self, broker): + # Create shard containers that are ready to receive redirected object + # updates. Do this now, so that redirection can begin immediately + # without waiting for cleaving to complete. 
+ found_ranges = broker.get_shard_ranges(states=ShardRange.FOUND) + created_ranges = [] + for shard_range in found_ranges: + self._increment_stat('created', 'attempted') + shard_range.update_state(ShardRange.CREATED) + headers = { + 'X-Backend-Storage-Policy-Index': broker.storage_policy_index, + 'X-Container-Sysmeta-Shard-Root': broker.root_path, + 'X-Container-Sysmeta-Sharding': True} + success = self._send_shard_ranges( + shard_range.account, shard_range.container, + [shard_range], headers=headers) + if success: + self.logger.debug('PUT new shard range container for %s', + shard_range) + self._increment_stat('created', 'success', statsd=True) + else: + self.logger.error( + 'PUT of new shard container %r failed for %s.', + shard_range, broker.path) + self._increment_stat('created', 'failure', statsd=True) + # break, not continue, because elsewhere it is assumed that + # finding and cleaving shard ranges progresses linearly, so we + # do not want any subsequent shard ranges to be in created + # state while this one is still in found state + break + created_ranges.append(shard_range) + + if created_ranges: + broker.merge_shard_ranges(created_ranges) + if not broker.is_root_container(): + self._send_shard_ranges( + broker.root_account, broker.root_container, created_ranges) + self.logger.info( + "Completed creating shard range containers: %d created.", + len(created_ranges)) + return len(created_ranges) + + def _cleave_shard_range(self, broker, cleaving_context, shard_range): + self.logger.info("Cleaving '%s' from row %s into %s for %r", + broker.path, cleaving_context.last_cleave_to_row, + shard_range.name, shard_range) + self._increment_stat('cleaved', 'attempted') + start = time.time() + policy_index = broker.storage_policy_index + try: + shard_part, shard_broker, node_id = self._get_shard_broker( + shard_range, broker.root_path, policy_index) + except DeviceUnavailable as duex: + self.logger.warning(str(duex)) + self._increment_stat('cleaved', 'failure', 
statsd=True) + return False + + # only cleave from the retiring db - misplaced objects handler will + # deal with any objects in the fresh db + source_broker = broker.get_brokers()[0] + # if this range has been cleaved before but replication + # failed then the shard db may still exist and it may not be + # necessary to merge all the rows again + source_db_id = source_broker.get_info()['id'] + source_max_row = source_broker.get_max_row() + sync_point = shard_broker.get_sync(source_db_id) + if sync_point < source_max_row: + sync_from_row = max(cleaving_context.last_cleave_to_row, + sync_point) + for objects, info in self.yield_objects( + source_broker, shard_range, + since_row=sync_from_row): + shard_broker.merge_items(objects) + # Note: the max row stored as a sync point is sampled *before* + # objects are yielded to ensure that is less than or equal to + # the last yielded row. Other sync points are also copied from the + # source broker to the shards; if another replica of the source + # happens to subsequently cleave into a primary replica of the + # shard then it will only need to cleave rows after its last sync + # point with this replica of the source broker. + shard_broker.merge_syncs( + [{'sync_point': source_max_row, 'remote_id': source_db_id}] + + source_broker.get_syncs()) + else: + self.logger.debug("Cleaving '%s': %r - shard db already in sync", + broker.path, shard_range) + + own_shard_range = broker.get_own_shard_range() + + replication_quorum = self.existing_shard_replication_quorum + if shard_range.includes(own_shard_range): + # When shrinking, include deleted own (donor) shard range in + # the replicated db so that when acceptor next updates root it + # will atomically update its namespace *and* delete the donor. + # Don't do this when sharding a shard because the donor + # namespace should not be deleted until all shards are cleaved. 
+ if own_shard_range.update_state(ShardRange.SHARDED): + own_shard_range.set_deleted() + broker.merge_shard_ranges(own_shard_range) + shard_broker.merge_shard_ranges(own_shard_range) + elif shard_range.state == ShardRange.CREATED: + # The shard range object stats may have changed since the shard + # range was found, so update with stats of objects actually + # copied to the shard broker. Only do this the first time each + # shard range is cleaved. + info = shard_broker.get_info() + shard_range.update_meta( + info['object_count'], info['bytes_used']) + shard_range.update_state(ShardRange.CLEAVED) + shard_broker.merge_shard_ranges(shard_range) + replication_quorum = self.shard_replication_quorum + + self.logger.info( + 'Replicating new shard container %s for %s', + shard_broker.path, shard_broker.get_own_shard_range()) + + success, responses = self._replicate_object( + shard_part, shard_broker.db_file, node_id) + + replication_successes = responses.count(True) + if (not success and (not responses or + replication_successes < replication_quorum)): + # insufficient replication or replication not even attempted; + # break because we don't want to progress the cleave cursor + # until each shard range has been successfully cleaved + self.logger.warning( + 'Failed to sufficiently replicate cleaved shard %s for %s: ' + '%s successes, %s required.', shard_range, broker.path, + replication_successes, replication_quorum) + self._increment_stat('cleaved', 'failure', statsd=True) + return False + + elapsed = round(time.time() - start, 3) + self._min_stat('cleaved', 'min_time', elapsed) + self._max_stat('cleaved', 'max_time', elapsed) + broker.merge_shard_ranges(shard_range) + cleaving_context.cursor = shard_range.upper_str + cleaving_context.ranges_done += 1 + cleaving_context.ranges_todo -= 1 + if shard_range.upper >= own_shard_range.upper: + # cleaving complete + cleaving_context.cleaving_done = True + cleaving_context.store(broker) + self.logger.info( + 'Cleaved %s for shard 
range %s in %gs.', + broker.path, shard_range, elapsed) + self._increment_stat('cleaved', 'success', statsd=True) + return True + + def _cleave(self, broker): + # Returns True if misplaced objects have been moved and the entire + # container namespace has been successfully cleaved, False otherwise + if broker.is_sharded(): + self.logger.debug('Passing over already sharded container %s/%s', + broker.account, broker.container) + return True + + cleaving_context = CleavingContext.load(broker) + if not cleaving_context.misplaced_done: + # ensure any misplaced objects in the source broker are moved; note + # that this invocation of _move_misplaced_objects is targeted at + # the *retiring* db. + self.logger.debug( + 'Moving any misplaced objects from sharding container: %s', + broker.path) + bounds = self._make_default_misplaced_object_bounds(broker) + cleaving_context.misplaced_done = self._move_misplaced_objects( + broker, src_broker=broker.get_brokers()[0], + src_bounds=bounds) + cleaving_context.store(broker) + + if cleaving_context.cleaving_done: + self.logger.debug('Cleaving already complete for container %s', + broker.path) + return cleaving_context.misplaced_done + + ranges_todo = broker.get_shard_ranges(marker=cleaving_context.marker) + if cleaving_context.cursor: + # always update ranges_todo in case more ranges have been found + # since last visit + cleaving_context.ranges_todo = len(ranges_todo) + self.logger.debug('Continuing to cleave (%s done, %s todo): %s', + cleaving_context.ranges_done, + cleaving_context.ranges_todo, + broker.path) + else: + cleaving_context.start() + cleaving_context.ranges_todo = len(ranges_todo) + self.logger.debug('Starting to cleave (%s todo): %s', + cleaving_context.ranges_todo, broker.path) + + ranges_done = [] + for shard_range in ranges_todo[:self.cleave_batch_size]: + if shard_range.state == ShardRange.FOUND: + break + elif shard_range.state in (ShardRange.CREATED, + ShardRange.CLEAVED, + ShardRange.ACTIVE): + if 
self._cleave_shard_range( + broker, cleaving_context, shard_range): + ranges_done.append(shard_range) + else: + break + else: + self.logger.warning('Unexpected shard range state for cleave: %s', + shard_range.state) + break + + if not ranges_done: + cleaving_context.store(broker) + self.logger.debug( + 'Cleaved %s shard ranges for %s', len(ranges_done), broker.path) + return (cleaving_context.misplaced_done and + cleaving_context.cleaving_done) + + def _complete_sharding(self, broker): + cleaving_context = CleavingContext.load(broker) + if cleaving_context.done(): + # Move all CLEAVED shards to ACTIVE state and if a shard then + # delete own shard range; these changes will be simultaneously + # reported in the next update to the root container. + modified_shard_ranges = broker.get_shard_ranges( + states=ShardRange.CLEAVED) + for sr in modified_shard_ranges: + sr.update_state(ShardRange.ACTIVE) + own_shard_range = broker.get_own_shard_range() + own_shard_range.update_state(ShardRange.SHARDED) + own_shard_range.update_meta(0, 0) + if (not broker.is_root_container() and not + own_shard_range.deleted): + own_shard_range = own_shard_range.copy( + timestamp=Timestamp.now(), deleted=1) + modified_shard_ranges.append(own_shard_range) + broker.merge_shard_ranges(modified_shard_ranges) + if broker.set_sharded_state(): + return True + else: + self.logger.warning( + 'Failed to remove retiring db file for %s', + broker.path) + else: + self.logger.warning( + 'Repeat cleaving required for %r with context: %s' + % (broker.db_files[0], dict(cleaving_context))) + cleaving_context.reset() + cleaving_context.store(broker) + + return False + + def _find_and_enable_sharding_candidates(self, broker, shard_ranges=None): + candidates = find_sharding_candidates( + broker, self.shard_container_threshold, shard_ranges) + if candidates: + self.logger.debug('Identified %s sharding candidates' + % len(candidates)) + broker.merge_shard_ranges(candidates) + + def 
_find_and_enable_shrinking_candidates(self, broker): + if not broker.is_sharded(): + self.logger.warning('Cannot shrink a not yet sharded container %s', + broker.path) + return + + merge_pairs = find_shrinking_candidates( + broker, self.shrink_size, self.merge_size) + self.logger.debug('Found %s shrinking candidates' % len(merge_pairs)) + own_shard_range = broker.get_own_shard_range() + for acceptor, donor in merge_pairs.items(): + self.logger.debug('shrinking shard range %s into %s in %s' % + (donor, acceptor, broker.db_file)) + broker.merge_shard_ranges([acceptor, donor]) + if acceptor.name != own_shard_range.name: + self._send_shard_ranges( + acceptor.account, acceptor.container, [acceptor]) + acceptor.increment_meta(donor.object_count, donor.bytes_used) + else: + # no need to change namespace or stats + acceptor.update_state(ShardRange.ACTIVE, + state_timestamp=Timestamp.now()) + # Now send a copy of the expanded acceptor, with an updated + # timestamp, to the donor container. This forces the donor to + # asynchronously cleave its entire contents to the acceptor and + # delete itself. The donor will pass its own deleted shard range to + # the acceptor when cleaving. Subsequent updates from the donor or + # the acceptor will then update the root to have the deleted donor + # shard range. 
+ self._send_shard_ranges( + donor.account, donor.container, [donor, acceptor]) + + def _update_root_container(self, broker): + own_shard_range = broker.get_own_shard_range(no_default=True) + if not own_shard_range: + return + + # persist the reported shard metadata + broker.merge_shard_ranges(own_shard_range) + # now get a consistent list of own and other shard ranges + shard_ranges = broker.get_shard_ranges( + include_own=True, + include_deleted=True) + # send everything + self._send_shard_ranges( + broker.root_account, broker.root_container, + shard_ranges) + + def _process_broker(self, broker, node, part): + broker.get_info() # make sure account/container are populated + state = broker.get_db_state() + self.logger.debug('Starting processing %s state %s', + broker.path, state) + + if not self._audit_container(broker): + return + + # now look and deal with misplaced objects. + self._move_misplaced_objects(broker) + + if broker.is_deleted(): + # This container is deleted so we can skip it. We still want + # deleted containers to go via misplaced items because they may + # have new objects sitting in them that may need to move. + return + + is_leader = node['index'] == 0 and self.auto_shard + if state in (UNSHARDED, COLLAPSED): + if is_leader and broker.is_root_container(): + # bootstrap sharding of root container + self._find_and_enable_sharding_candidates( + broker, shard_ranges=[broker.get_own_shard_range()]) + + own_shard_range = broker.get_own_shard_range() + if own_shard_range.state in (ShardRange.SHARDING, + ShardRange.SHRINKING, + ShardRange.SHARDED): + if broker.get_shard_ranges(): + # container has been given shard ranges rather than + # found them e.g. 
via replication or a shrink event + if broker.set_sharding_state(): + state = SHARDING + elif is_leader: + if broker.set_sharding_state(): + state = SHARDING + else: + self.logger.debug( + 'Own shard range in state %r but no shard ranges ' + 'and not leader; remaining unsharded: %s' + % (own_shard_range.state_text, broker.path)) + + if state == SHARDING: + if is_leader: + num_found = self._find_shard_ranges(broker) + else: + num_found = 0 + + # create shard containers for newly found ranges + num_created = self._create_shard_containers(broker) + + if num_found or num_created: + # share updated shard range state with other nodes + self._replicate_object(part, broker.db_file, node['id']) + + # always try to cleave any pending shard ranges + cleave_complete = self._cleave(broker) + + if cleave_complete: + self.logger.info('Completed cleaving of %s', broker.path) + if self._complete_sharding(broker): + state = SHARDED + self._increment_stat('visited', 'completed', statsd=True) + else: + self.logger.debug('Remaining in sharding state %s', + broker.path) + + if state == SHARDED and broker.is_root_container(): + if is_leader: + self._find_and_enable_shrinking_candidates(broker) + self._find_and_enable_sharding_candidates(broker) + for shard_range in broker.get_shard_ranges( + states=[ShardRange.SHARDING]): + self._send_shard_ranges( + shard_range.account, shard_range.container, + [shard_range]) + + if not broker.is_root_container(): + # Update the root container with this container's shard range + # info; do this even when sharded in case previous attempts + # failed; don't do this if there is no own shard range. When + # sharding a shard, this is when the root will see the new + # shards move to ACTIVE state and the sharded shard + # simultaneously become deleted. 
+ self._update_root_container(broker) + + self.logger.debug('Finished processing %s/%s state %s', + broker.account, broker.container, + broker.get_db_state()) + + def _one_shard_cycle(self, devices_to_shard, partitions_to_shard): + """ + The main function, everything the sharder does forks from this method. + + The sharder loops through each container with sharding enabled and each + sharded container on the server, on each container it: + - audits the container + - checks and deals with misplaced items + - cleaves any shard ranges as required + - if not a root container, reports shard range stats to the root + container + """ + self.logger.info('Container sharder cycle starting, auto-sharding %s', + self.auto_shard) + if isinstance(devices_to_shard, (list, tuple)): + self.logger.info('(Override devices: %s)', + ', '.join(str(d) for d in devices_to_shard)) + if isinstance(partitions_to_shard, (list, tuple)): + self.logger.info('(Override partitions: %s)', + ', '.join(str(p) for p in partitions_to_shard)) + self._zero_stats() + self._local_device_ids = set() + dirs = [] + self.ips = whataremyips(bind_ip=self.bind_ip) + for node in self.ring.devs: + if not self._check_node(node): + continue + datadir = os.path.join(self.root, node['device'], self.datadir) + if os.path.isdir(datadir): + # Populate self._local_device_ids so we can find devices for + # shard containers later + self._local_device_ids.add(node['id']) + if node['device'] not in devices_to_shard: + continue + part_filt = self._partition_dir_filter( + node['id'], + partitions_to_shard) + dirs.append((datadir, node, part_filt)) + if not dirs: + self.logger.warning('Found no data dirs!') + for part, path, node in db_replicator.roundrobin_datadirs(dirs): + # NB: get_part_nodes always provides an 'index' key; + # this will be used in leader selection + for primary in self.ring.get_part_nodes(int(part)): + if node['id'] == primary['id']: + node = primary + break + else: + # Set index such that we'll *never* be 
selected as a leader + node['index'] = 'handoff' + + broker = ContainerBroker(path, logger=self.logger, + timeout=self.broker_timeout) + error = None + try: + self._identify_sharding_candidate(broker, node) + if sharding_enabled(broker): + self._increment_stat('visited', 'attempted') + self._process_broker(broker, node, part) + self._increment_stat('visited', 'success', statsd=True) + else: + self._increment_stat('visited', 'skipped') + except (Exception, Timeout) as error: + self._increment_stat('visited', 'failure', statsd=True) + self.logger.exception( + 'Unhandled exception while processing %s: %s', path, error) + try: + self._record_sharding_progress(broker, node, error) + except (Exception, Timeout) as error: + self.logger.exception( + 'Unhandled exception while dumping progress for %s: %s', + path, error) + self._periodic_report_stats() + + self._report_stats() + + def run_forever(self, *args, **kwargs): + """Run the container sharder until stopped.""" + self.reported = time.time() + time.sleep(random() * self.interval) + while True: + begin = time.time() + try: + self._one_shard_cycle(devices_to_shard=Everything(), + partitions_to_shard=Everything()) + except (Exception, Timeout): + self.logger.increment('errors') + self.logger.exception('Exception in sharder') + elapsed = time.time() - begin + self.logger.info( + 'Container sharder cycle completed: %.02fs', elapsed) + if elapsed < self.interval: + time.sleep(self.interval - elapsed) + + def run_once(self, *args, **kwargs): + """Run the container sharder once.""" + self.logger.info('Begin container sharder "once" mode') + override_options = parse_override_options(once=True, **kwargs) + devices_to_shard = override_options.devices or Everything() + partitions_to_shard = override_options.partitions or Everything() + begin = self.reported = time.time() + self._one_shard_cycle(devices_to_shard=devices_to_shard, + partitions_to_shard=partitions_to_shard) + elapsed = time.time() - begin + self.logger.info( + 
'Container sharder "once" mode completed: %.02fs', elapsed) diff --git a/swift/proxy/controllers/base.py b/swift/proxy/controllers/base.py index cca8f6cc14..4822b01729 100644 --- a/swift/proxy/controllers/base.py +++ b/swift/proxy/controllers/base.py @@ -2007,7 +2007,7 @@ class Controller(object): :param req: original Request instance. :param account: account in which `container` is stored. - :param container: container from which listing should be fetched. + :param container: container from which listing should be fetched. :param headers: headers to be included with the request :param params: query string parameters to be used. :return: a tuple of (deserialized json data structure, swob Response) diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py index f95a31f35a..e90632a294 100644 --- a/swift/proxy/controllers/container.py +++ b/swift/proxy/controllers/container.py @@ -21,6 +21,7 @@ from swift.common.utils import public, csv_append, Timestamp, \ config_true_value, ShardRange from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT from swift.common.http import HTTP_ACCEPTED, is_success +from swift.common.request_helpers import get_sys_meta_prefix from swift.proxy.controllers.base import Controller, delay_denial, \ cors_validation, set_info_cache, clear_info_cache from swift.common.storage_policy import POLICIES @@ -136,6 +137,11 @@ class ContainerController(Controller): for key in self.app.swift_owner_headers: if key in resp.headers: del resp.headers[key] + # Expose sharding state in reseller requests + if req.environ.get('reseller_request', False): + resp.headers['X-Container-Sharding'] = config_true_value( + resp.headers.get(get_sys_meta_prefix('container') + 'Sharding', + 'False')) return resp def _get_from_shards(self, req, resp): @@ -257,6 +263,10 @@ class ContainerController(Controller): if not req.environ.get('swift_owner'): for key in self.app.swift_owner_headers: req.headers.pop(key, None) + if
req.environ.get('reseller_request', False) and \ + 'X-Container-Sharding' in req.headers: + req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \ + str(config_true_value(req.headers['X-Container-Sharding'])) length_limit = self.get_name_length_limit() if len(self.container_name) > length_limit: resp = HTTPBadRequest(request=req) @@ -305,6 +315,10 @@ class ContainerController(Controller): if not req.environ.get('swift_owner'): for key in self.app.swift_owner_headers: req.headers.pop(key, None) + if req.environ.get('reseller_request', False) and \ + 'X-Container-Sharding' in req.headers: + req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \ + str(config_true_value(req.headers['X-Container-Sharding'])) account_partition, accounts, container_count = \ self.account_info(self.account_name, req) if not accounts: diff --git a/test/probe/test_sharder.py b/test/probe/test_sharder.py new file mode 100644 index 0000000000..77ee3dd35b --- /dev/null +++ b/test/probe/test_sharder.py @@ -0,0 +1,2025 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import hashlib +import json +import os +import shutil +import uuid + +from nose import SkipTest + +from swift.common import direct_client +from swift.common.direct_client import DirectClientException +from swift.common.utils import ShardRange, parse_db_filename, get_db_files, \ + quorum_size, config_true_value, Timestamp +from swift.container.backend import ContainerBroker, UNSHARDED, SHARDING +from swift.common import utils +from swift.common.manager import Manager +from swiftclient import client, get_auth, ClientException + +from swift.proxy.controllers.obj import num_container_updates +from test import annotate_failure +from test.probe.brain import BrainSplitter +from test.probe.common import ReplProbeTest, get_server_number, \ + wait_for_server_to_hangup + + +MIN_SHARD_CONTAINER_THRESHOLD = 4 +MAX_SHARD_CONTAINER_THRESHOLD = 100 + + +class ShardCollector(object): + """ + Returns map of node to tuples of (headers, shard ranges) returned from node + """ + def __init__(self): + self.ranges = {} + + def __call__(self, cnode, cpart, account, container): + self.ranges[cnode['id']] = direct_client.direct_get_container( + cnode, cpart, account, container, + headers={'X-Backend-Record-Type': 'shard'}) + + +class BaseTestContainerSharding(ReplProbeTest): + + def _maybe_skip_test(self): + try: + cont_configs = [utils.readconf(p, 'container-sharder') + for p in self.configs['container-server'].values()] + except ValueError: + raise SkipTest('No [container-sharder] section found in ' + 'container-server configs') + + skip_reasons = [] + auto_shard = all([config_true_value(c.get('auto_shard', False)) + for c in cont_configs]) + if not auto_shard: + skip_reasons.append( + 'auto_shard must be true in all container_sharder configs') + + self.max_shard_size = max( + int(c.get('shard_container_threshold', '1000000')) + for c in cont_configs) + + if not (MIN_SHARD_CONTAINER_THRESHOLD <= self.max_shard_size + <= MAX_SHARD_CONTAINER_THRESHOLD): + skip_reasons.append( + 
'shard_container_threshold %d must be between %d and %d' % + (self.max_shard_size, MIN_SHARD_CONTAINER_THRESHOLD, + MAX_SHARD_CONTAINER_THRESHOLD)) + + def skip_check(reason_list, option, required): + values = set([int(c.get(option, required)) for c in cont_configs]) + if values != {required}: + reason_list.append('%s must be %s' % (option, required)) + + skip_check(skip_reasons, 'shard_scanner_batch_size', 10) + skip_check(skip_reasons, 'shard_batch_size', 2) + + if skip_reasons: + raise SkipTest(', '.join(skip_reasons)) + + def _load_rings_and_configs(self): + super(BaseTestContainerSharding, self)._load_rings_and_configs() + # perform checks for skipping test before starting services + self._maybe_skip_test() + + def _make_object_names(self, number): + return ['obj-%04d' % x for x in range(number)] + + def _setup_container_name(self): + self.container_name = 'container-%s' % uuid.uuid4() + + def setUp(self): + client.logger.setLevel(client.logging.WARNING) + client.requests.logging.getLogger().setLevel( + client.requests.logging.WARNING) + super(BaseTestContainerSharding, self).setUp() + _, self.admin_token = get_auth( + 'http://127.0.0.1:8080/auth/v1.0', 'admin:admin', 'admin') + self._setup_container_name() + self.brain = BrainSplitter(self.url, self.token, self.container_name, + None, 'container') + self.brain.put_container(policy_index=int(self.policy)) + self.sharders = Manager(['container-sharder']) + self.internal_client = self.make_internal_client() + + def stop_container_servers(self, node_numbers=None): + if node_numbers: + ipports = [] + server2ipport = {v: k for k, v in self.ipport2server.items()} + for number in self.brain.node_numbers[node_numbers]: + self.brain.servers.stop(number=number) + server = 'container%d' % number + ipports.append(server2ipport[server]) + else: + ipports = [k for k, v in self.ipport2server.items() + if v.startswith('container')] + self.brain.servers.stop() + for ipport in ipports: + wait_for_server_to_hangup(ipport) + + 
def put_objects(self, obj_names): + for obj in obj_names: + client.put_object(self.url, self.token, self.container_name, obj) + + def delete_objects(self, obj_names): + for obj in obj_names: + client.delete_object( + self.url, self.token, self.container_name, obj) + + def get_container_shard_ranges(self, account=None, container=None): + account = account if account else self.account + container = container if container else self.container_name + path = self.internal_client.make_path(account, container) + resp = self.internal_client.make_request( + 'GET', path + '?format=json', {'X-Backend-Record-Type': 'shard'}, + [200]) + return [ShardRange.from_dict(sr) for sr in json.loads(resp.body)] + + def direct_container_op(self, func, account=None, container=None, + expect_failure=False): + account = account if account else self.account + container = container if container else self.container_name + cpart, cnodes = self.container_ring.get_nodes(account, container) + unexpected_responses = [] + results = {} + for cnode in cnodes: + try: + results[cnode['id']] = func(cnode, cpart, account, container) + except DirectClientException as err: + if not expect_failure: + unexpected_responses.append((cnode, err)) + else: + if expect_failure: + unexpected_responses.append((cnode, 'success')) + if unexpected_responses: + self.fail('Unexpected responses: %s' % unexpected_responses) + return results + + def direct_get_container_shard_ranges(self, account=None, container=None, + expect_failure=False): + collector = ShardCollector() + self.direct_container_op( + collector, account, container, expect_failure) + return collector.ranges + + def direct_delete_container(self, account=None, container=None, + expect_failure=False): + self.direct_container_op(direct_client.direct_delete_container, + account, container, expect_failure) + + def direct_head_container(self, account=None, container=None, + expect_failure=False): + return self.direct_container_op(direct_client.direct_head_container, + 
account, container, expect_failure) + + def get_storage_dir(self, part, node, account=None, container=None): + account = account or self.brain.account + container = container or self.container_name + server_type, config_number = get_server_number( + (node['ip'], node['port']), self.ipport2server) + assert server_type == 'container' + repl_server = '%s-replicator' % server_type + conf = utils.readconf(self.configs[repl_server][config_number], + section_name=repl_server) + datadir = os.path.join(conf['devices'], node['device'], 'containers') + container_hash = utils.hash_path(account, container) + return (utils.storage_directory(datadir, part, container_hash), + container_hash) + + def get_broker(self, part, node, account=None, container=None): + container_dir, container_hash = self.get_storage_dir( + part, node, account=account, container=container) + db_file = os.path.join(container_dir, container_hash + '.db') + self.assertTrue(get_db_files(db_file)) # sanity check + return ContainerBroker(db_file) + + def categorize_container_dir_content(self, account=None, container=None): + account = account or self.brain.account + container = container or self.container_name + part, nodes = self.brain.ring.get_nodes(account, container) + storage_dirs = [ + self.get_storage_dir(part, node, account=account, + container=container)[0] + for node in nodes] + result = { + 'shard_dbs': [], + 'normal_dbs': [], + 'pendings': [], + 'locks': [], + 'other': [], + } + for storage_dir in storage_dirs: + for f in os.listdir(storage_dir): + path = os.path.join(storage_dir, f) + if path.endswith('.db'): + hash_, epoch, ext = parse_db_filename(path) + if epoch: + result['shard_dbs'].append(path) + else: + result['normal_dbs'].append(path) + elif path.endswith('.db.pending'): + result['pendings'].append(path) + elif path.endswith('/.lock'): + result['locks'].append(path) + else: + result['other'].append(path) + if result['other']: + self.fail('Found unexpected files in storage directory:\n %s' % 
+ '\n '.join(result['other'])) + return result + + def assertLengthEqual(self, obj, length): + obj_len = len(obj) + self.assertEqual(obj_len, length, 'len(%r) == %d, not %d' % ( + obj, obj_len, length)) + + def assert_dict_contains(self, expected_items, actual_dict): + ignored = set(expected_items) ^ set(actual_dict) + filtered_actual = dict((k, actual_dict[k]) + for k in actual_dict if k not in ignored) + self.assertEqual(expected_items, filtered_actual) + + def assert_shard_ranges_contiguous(self, expected_number, shard_ranges, + first_lower='', last_upper=''): + if shard_ranges and isinstance(shard_ranges[0], ShardRange): + actual_shard_ranges = sorted(shard_ranges) + else: + actual_shard_ranges = sorted([ShardRange.from_dict(d) + for d in shard_ranges]) + self.assertLengthEqual(actual_shard_ranges, expected_number) + if expected_number: + with annotate_failure('Ranges %s.' % actual_shard_ranges): + self.assertEqual(first_lower, actual_shard_ranges[0].lower_str) + for x, y in zip(actual_shard_ranges, actual_shard_ranges[1:]): + self.assertEqual(x.upper, y.lower) + self.assertEqual(last_upper, actual_shard_ranges[-1].upper_str) + + def assert_shard_range_equal(self, expected, actual, excludes=None): + excludes = excludes or [] + expected_dict = dict(expected) + actual_dict = dict(actual) + for k in excludes: + expected_dict.pop(k, None) + actual_dict.pop(k, None) + self.assertEqual(expected_dict, actual_dict) + + def assert_shard_range_lists_equal(self, expected, actual, excludes=None): + self.assertEqual(len(expected), len(actual)) + for expected, actual in zip(expected, actual): + self.assert_shard_range_equal(expected, actual, excludes=excludes) + + def assert_shard_range_state(self, expected_state, shard_ranges): + if shard_ranges and not isinstance(shard_ranges[0], ShardRange): + shard_ranges = [ShardRange.from_dict(data) + for data in shard_ranges] + self.assertEqual([expected_state] * len(shard_ranges), + [sr.state for sr in shard_ranges]) + + def 
assert_total_object_count(self, expected_object_count, shard_ranges): + actual = sum([sr['object_count'] for sr in shard_ranges]) + self.assertEqual(expected_object_count, actual) + + def assert_container_listing(self, expected_listing): + headers, actual_listing = client.get_container( + self.url, self.token, self.container_name) + self.assertIn('x-container-object-count', headers) + expected_obj_count = len(expected_listing) + self.assertEqual(expected_listing, [ + x['name'].encode('utf-8') for x in actual_listing]) + self.assertEqual(str(expected_obj_count), + headers['x-container-object-count']) + return headers, actual_listing + + def assert_container_object_count(self, expected_obj_count): + headers = client.head_container( + self.url, self.token, self.container_name) + self.assertIn('x-container-object-count', headers) + self.assertEqual(str(expected_obj_count), + headers['x-container-object-count']) + + def assert_container_post_ok(self, meta_value): + key = 'X-Container-Meta-Assert-Post-Works' + headers = {key: meta_value} + client.post_container( + self.url, self.token, self.container_name, headers=headers) + resp_headers = client.head_container( + self.url, self.token, self.container_name) + self.assertEqual(meta_value, resp_headers.get(key.lower())) + + def assert_container_post_fails(self, meta_value): + key = 'X-Container-Meta-Assert-Post-Works' + headers = {key: meta_value} + with self.assertRaises(ClientException) as cm: + client.post_container( + self.url, self.token, self.container_name, headers=headers) + self.assertEqual(404, cm.exception.http_status) + + def assert_container_delete_fails(self): + with self.assertRaises(ClientException) as cm: + client.delete_container(self.url, self.token, self.container_name) + self.assertEqual(409, cm.exception.http_status) + + def assert_container_not_found(self): + with self.assertRaises(ClientException) as cm: + client.get_container(self.url, self.token, self.container_name) + self.assertEqual(404, 
cm.exception.http_status) + # check for headers leaking out while deleted + resp_headers = cm.exception.http_response_headers + self.assertNotIn('X-Container-Object-Count', resp_headers) + self.assertNotIn('X-Container-Bytes-Used', resp_headers) + self.assertNotIn('X-Timestamp', resp_headers) + self.assertNotIn('X-PUT-Timestamp', resp_headers) + + def assert_container_has_shard_sysmeta(self): + node_headers = self.direct_head_container() + for node_id, headers in node_headers.items(): + with annotate_failure('%s in %s' % (node_id, node_headers.keys())): + for k, v in headers.items(): + if k.lower().startswith('x-container-sysmeta-shard'): + break + else: + self.fail('No shard sysmeta found in %s' % headers) + + def assert_container_state(self, node, expected_state, num_shard_ranges): + headers, shard_ranges = direct_client.direct_get_container( + node, self.brain.part, self.account, self.container_name, + headers={'X-Backend-Record-Type': 'shard'}) + self.assertEqual(num_shard_ranges, len(shard_ranges)) + self.assertIn('X-Backend-Sharding-State', headers) + self.assertEqual( + expected_state, headers['X-Backend-Sharding-State']) + return [ShardRange.from_dict(sr) for sr in shard_ranges] + + def get_part_and_node_numbers(self, shard_range): + """Return the partition and node numbers for a shard range.""" + part, nodes = self.brain.ring.get_nodes( + shard_range.account, shard_range.container) + return part, [n['id'] + 1 for n in nodes] + + def run_sharders(self, shard_ranges): + """Run the sharder on partitions for given shard ranges.""" + if not isinstance(shard_ranges, (list, tuple, set)): + shard_ranges = (shard_ranges,) + partitions = ','.join(str(self.get_part_and_node_numbers(sr)[0]) + for sr in shard_ranges) + self.sharders.once(additional_args='--partitions=%s' % partitions) + + def run_sharder_sequentially(self, shard_range=None): + """Run sharder node by node on partition for given shard range.""" + if shard_range: + part, node_numbers = 
self.get_part_and_node_numbers(shard_range) + else: + part, node_numbers = self.brain.part, self.brain.node_numbers + for node_number in node_numbers: + self.sharders.once(number=node_number, + additional_args='--partitions=%s' % part) + + +class TestContainerShardingNonUTF8(BaseTestContainerSharding): + def test_sharding_listing(self): + # verify parameterised listing of a container during sharding + all_obj_names = self._make_object_names(4 * self.max_shard_size) + obj_names = all_obj_names[::2] + self.put_objects(obj_names) + # choose some names approx in middle of each expected shard range + markers = [ + obj_names[i] for i in range(self.max_shard_size / 4, + 2 * self.max_shard_size, + self.max_shard_size / 2)] + + def check_listing(objects, **params): + qs = '&'.join(['%s=%s' % param for param in params.items()]) + headers, listing = client.get_container( + self.url, self.token, self.container_name, query_string=qs) + listing = [x['name'].encode('utf-8') for x in listing] + if params.get('reverse'): + marker = params.get('marker', ShardRange.MAX) + end_marker = params.get('end_marker', ShardRange.MIN) + expected = [o for o in objects if end_marker < o < marker] + expected.reverse() + else: + marker = params.get('marker', ShardRange.MIN) + end_marker = params.get('end_marker', ShardRange.MAX) + expected = [o for o in objects if marker < o < end_marker] + if 'limit' in params: + expected = expected[:params['limit']] + self.assertEqual(expected, listing) + + def check_listing_precondition_fails(**params): + qs = '&'.join(['%s=%s' % param for param in params.items()]) + with self.assertRaises(ClientException) as cm: + client.get_container( + self.url, self.token, self.container_name, query_string=qs) + self.assertEqual(412, cm.exception.http_status) + return cm.exception + + def do_listing_checks(objects): + check_listing(objects) + check_listing(objects, marker=markers[0], end_marker=markers[1]) + check_listing(objects, marker=markers[0], end_marker=markers[2]) + 
check_listing(objects, marker=markers[1], end_marker=markers[3]) + check_listing(objects, marker=markers[1], end_marker=markers[3], + limit=self.max_shard_size / 4) + check_listing(objects, marker=markers[1], end_marker=markers[3], + limit=self.max_shard_size / 4) + check_listing(objects, marker=markers[1], end_marker=markers[2], + limit=self.max_shard_size / 2) + check_listing(objects, marker=markers[1], end_marker=markers[1]) + check_listing(objects, reverse=True) + check_listing(objects, reverse=True, end_marker=markers[1]) + check_listing(objects, reverse=True, marker=markers[3], + end_marker=markers[1], limit=self.max_shard_size / 4) + check_listing(objects, reverse=True, marker=markers[3], + end_marker=markers[1], limit=0) + check_listing([], marker=markers[0], end_marker=markers[0]) + check_listing([], marker=markers[0], end_marker=markers[1], + reverse=True) + check_listing(objects, prefix='obj') + check_listing([], prefix='zzz') + # delimiter + headers, listing = client.get_container( + self.url, self.token, self.container_name, + query_string='delimiter=-') + self.assertEqual([{'subdir': 'obj-'}], listing) + + limit = self.cluster_info['swift']['container_listing_limit'] + exc = check_listing_precondition_fails(limit=limit + 1) + self.assertIn('Maximum limit', exc.http_response_content) + exc = check_listing_precondition_fails(delimiter='ab') + self.assertIn('Bad delimiter', exc.http_response_content) + + # sanity checks + do_listing_checks(obj_names) + + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + # First run the 'leader' in charge of scanning, which finds all shard + # ranges and cleaves first two + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + # Then run sharder on other nodes which will also cleave first two + # shard ranges + for n in self.brain.node_numbers[1:]: + self.sharders.once( + 
number=n, additional_args='--partitions=%s' % self.brain.part) + + # sanity check shard range states + for node in self.brain.nodes: + self.assert_container_state(node, 'sharding', 4) + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 4) + self.assert_shard_range_state(ShardRange.CLEAVED, shard_ranges[:2]) + self.assert_shard_range_state(ShardRange.CREATED, shard_ranges[2:]) + + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() # confirm no sysmeta deleted + self.assert_container_post_ok('sharding') + do_listing_checks(obj_names) + + # put some new objects spread through entire namespace + new_obj_names = all_obj_names[1::4] + self.put_objects(new_obj_names) + + # new objects that fell into the first two cleaved shard ranges are + # reported in listing, new objects in the yet-to-be-cleaved shard + # ranges are not yet included in listing + exp_obj_names = [o for o in obj_names + new_obj_names + if o <= shard_ranges[1].upper] + exp_obj_names += [o for o in obj_names + if o > shard_ranges[1].upper] + exp_obj_names.sort() + do_listing_checks(exp_obj_names) + + # run all the sharders again and the last two shard ranges get cleaved + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 4) + shard_ranges = self.get_container_shard_ranges() + self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges) + + exp_obj_names = obj_names + new_obj_names + exp_obj_names.sort() + do_listing_checks(exp_obj_names) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + + # delete original objects + self.delete_objects(obj_names) + do_listing_checks(new_obj_names) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + + +class 
TestContainerShardingUTF8(TestContainerShardingNonUTF8): + def _make_object_names(self, number): + # override default with names that include non-ascii chars + name_length = self.cluster_info['swift']['max_object_name_length'] + obj_names = [] + for x in range(number): + name = (u'obj-\u00e4\u00ea\u00ec\u00f2\u00fb-%04d' % x) + name = name.encode('utf8').ljust(name_length, 'o') + obj_names.append(name) + return obj_names + + def _setup_container_name(self): + # override default with max length name that includes non-ascii chars + super(TestContainerShardingUTF8, self)._setup_container_name() + name_length = self.cluster_info['swift']['max_container_name_length'] + cont_name = self.container_name + u'-\u00e4\u00ea\u00ec\u00f2\u00fb' + self.container_name = cont_name.encode('utf8').ljust(name_length, 'x') + + +class TestContainerSharding(BaseTestContainerSharding): + def _test_sharded_listing(self, run_replicators=False): + obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(obj_names) + + # Verify that we start out with normal DBs, no shards + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['normal_dbs'], 3) + self.assertLengthEqual(found['shard_dbs'], 0) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertLengthEqual(broker.get_shard_ranges(), 0) + + headers, pre_sharding_listing = client.get_container( + self.url, self.token, self.container_name) + self.assertEqual(obj_names, [x['name'].encode('utf-8') + for x in pre_sharding_listing]) # sanity + + # Shard it + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + pre_sharding_headers = client.head_container( + self.url, self.admin_token, self.container_name) + self.assertEqual('True', + pre_sharding_headers.get('x-container-sharding')) + + # Only run the one in
charge of scanning + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # Verify that we have one sharded db -- though the other normal DBs + # received the shard ranges that got defined + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + broker = ContainerBroker(found['shard_dbs'][0]) + # TODO: assert the shard db is on replica 0 + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + orig_root_shard_ranges = [dict(sr) for sr in broker.get_shard_ranges()] + self.assertLengthEqual(orig_root_shard_ranges, 2) + self.assert_total_object_count(len(obj_names), orig_root_shard_ranges) + self.assert_shard_ranges_contiguous(2, orig_root_shard_ranges) + self.assertEqual([ShardRange.ACTIVE, ShardRange.ACTIVE], + [sr['state'] for sr in orig_root_shard_ranges]) + self.direct_delete_container(expect_failure=True) + + self.assertLengthEqual(found['normal_dbs'], 2) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('unsharded', broker.get_db_state()) + # the sharded db had shard range meta_timestamps and state updated + # during cleaving, so we do not expect those to be equal on other + # nodes + self.assert_shard_range_lists_equal( + orig_root_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state', 'state_timestamp']) + + if run_replicators: + Manager(['container-replicator']).once() + # replication doesn't change the db file names + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + self.assertLengthEqual(found['normal_dbs'], 2) + + # Now that everyone has shard ranges, run *everyone* + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + + # Verify that we only have shard dbs now + found = self.categorize_container_dir_content() + 
self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + # Shards stayed the same + for db_file in found['shard_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + # Well, except for meta_timestamps, since the shards each reported + self.assert_shard_range_lists_equal( + orig_root_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state_timestamp']) + for orig, updated in zip(orig_root_shard_ranges, + broker.get_shard_ranges()): + self.assertGreaterEqual(updated.state_timestamp, + orig['state_timestamp']) + self.assertGreaterEqual(updated.meta_timestamp, + orig['meta_timestamp']) + + # Check that entire listing is available + headers, actual_listing = self.assert_container_listing(obj_names) + # ... and check some other container properties + self.assertEqual(headers['last-modified'], + pre_sharding_headers['last-modified']) + + # It even works in reverse! + headers, listing = client.get_container(self.url, self.token, + self.container_name, + query_string='reverse=on') + self.assertEqual(pre_sharding_listing[::-1], listing) + + # Now put some new objects into first shard, taking its count to + # 3 shard ranges' worth + more_obj_names = [ + 'beta%03d' % x for x in range(self.max_shard_size)] + self.put_objects(more_obj_names) + + # The listing includes new objects... 
+ headers, listing = self.assert_container_listing( + more_obj_names + obj_names) + self.assertEqual(pre_sharding_listing, listing[len(more_obj_names):]) + + # ...but root object count is out of date until the sharders run and + # update the root + self.assert_container_object_count(len(obj_names)) + + # run sharders on the shard to get root updated + shard_1 = ShardRange.from_dict(orig_root_shard_ranges[0]) + self.run_sharders(shard_1) + self.assert_container_object_count(len(more_obj_names + obj_names)) + + # we've added objects enough that we need to shard the first shard + # *again* into three new sub-shards, but nothing happens until the root + # leader identifies shard candidate... + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + self.assertLengthEqual(root_shards, 2) + with annotate_failure('node %s. ' % node): + self.assertEqual( + [ShardRange.ACTIVE] * 2, + [sr['state'] for sr in root_shards]) + # orig shards 0, 1 should be contiguous + self.assert_shard_ranges_contiguous(2, root_shards) + + # Now run the root leader to identify shard candidate...while one of + # the shard container servers is down + shard_1_part, shard_1_nodes = self.get_part_and_node_numbers(shard_1) + self.brain.servers.stop(number=shard_1_nodes[2]) + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # ... 
so third replica of first shard state is not moved to sharding + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + self.assertEqual( + [ShardRange.SHARDING, ShardRange.SHARDING, ShardRange.ACTIVE], + [ContainerBroker(db_file).get_own_shard_range().state + for db_file in found_for_shard['normal_dbs']]) + + # ...then run first cycle of first shard sharders in order, leader + # first, to get to predictable state where all nodes have cleaved 2 out + # of 3 ranges...starting with first two nodes + for node_number in shard_1_nodes[:2]: + self.sharders.once( + number=node_number, + additional_args='--partitions=%s' % shard_1_part) + + # ... first two replicas start sharding to sub-shards + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 2) + for db_file in found_for_shard['shard_dbs'][:2]: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. 
' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharding', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + shard_shards = broker.get_shard_ranges() + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED], + [sr.state for sr in shard_shards]) + self.assert_shard_ranges_contiguous( + 3, shard_shards, + first_lower=orig_root_shard_ranges[0]['lower'], + last_upper=orig_root_shard_ranges[0]['upper']) + + # but third replica still has no idea it should be sharding + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + self.assertEqual( + ShardRange.ACTIVE, + ContainerBroker( + found_for_shard['normal_dbs'][2]).get_own_shard_range().state) + + # ...but once sharder runs on third replica it will learn its state; + # note that any root replica on the stopped container server also won't + # know about the shards being in sharding state, so leave that server + # stopped for now so that shard fetches its state from an up-to-date + # root replica + self.sharders.once( + number=shard_1_nodes[2], + additional_args='--partitions=%s' % shard_1_part) + + # third replica is sharding but has no sub-shard ranges yet... 
+ found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 2) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + broker = ContainerBroker(found_for_shard['normal_dbs'][2]) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + self.assertFalse(broker.get_shard_ranges()) + + # ...until sub-shard ranges are replicated from another shard replica; + # there may also be a sub-shard replica missing so run replicators on + # all nodes to fix that if necessary + self.brain.servers.start(number=shard_1_nodes[2]) + self.replicators.once() + + # now run sharder again on third replica + self.sharders.once( + number=shard_1_nodes[2], + additional_args='--partitions=%s' % shard_1_part) + + # check original first shard range state and sub-shards - all replicas + # should now be in consistent state + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 3) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + for db_file in found_for_shard['shard_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. 
' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharding', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + shard_shards = broker.get_shard_ranges() + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED], + [sr.state for sr in shard_shards]) + self.assert_shard_ranges_contiguous( + 3, shard_shards, + first_lower=orig_root_shard_ranges[0]['lower'], + last_upper=orig_root_shard_ranges[0]['upper']) + + # check third sub-shard is in created state + sub_shard = shard_shards[2] + found_for_sub_shard = self.categorize_container_dir_content( + sub_shard.account, sub_shard.container) + self.assertFalse(found_for_sub_shard['shard_dbs']) + self.assertLengthEqual(found_for_sub_shard['normal_dbs'], 3) + for db_file in found_for_sub_shard['normal_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('sub shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertEqual( + ShardRange.CREATED, broker.get_own_shard_range().state) + self.assertFalse(broker.get_shard_ranges()) + + # check root shard ranges + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + self.assertLengthEqual(root_shards, 5) + with annotate_failure('node %s. 
' % node): + # shard ranges are sorted by upper, state, lower, so expect: + # sub-shards, orig shard 0, orig shard 1 + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED, ShardRange.SHARDING, + ShardRange.ACTIVE], + [sr['state'] for sr in root_shards]) + # sub-shards 0, 1, 2, orig shard 1 should be contiguous + self.assert_shard_ranges_contiguous( + 4, root_shards[:3] + root_shards[4:]) + # orig shards 0, 1 should be contiguous + self.assert_shard_ranges_contiguous(2, root_shards[3:]) + + self.assert_container_listing(more_obj_names + obj_names) + self.assert_container_object_count(len(more_obj_names + obj_names)) + + # add another object that lands in the first of the new sub-shards + self.put_objects(['alpha']) + + # TODO: assert that alpha is in the first new shard + self.assert_container_listing(['alpha'] + more_obj_names + obj_names) + # Run sharders again so things settle. + self.run_sharders(shard_1) + + # check original first shard range shards + for db_file in found_for_shard['shard_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + self.assertEqual( + [ShardRange.ACTIVE] * 3, + [sr.state for sr in broker.get_shard_ranges()]) + # check root shard ranges + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + # old first shard range should have been deleted + self.assertLengthEqual(root_shards, 4) + with annotate_failure('node %s. 
' % node): + self.assertEqual( + [ShardRange.ACTIVE] * 4, + [sr['state'] for sr in root_shards]) + self.assert_shard_ranges_contiguous(4, root_shards) + + headers, final_listing = self.assert_container_listing( + ['alpha'] + more_obj_names + obj_names) + + # check root + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + new_shard_ranges = None + for db_file in found['shard_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + if new_shard_ranges is None: + new_shard_ranges = broker.get_shard_ranges( + include_deleted=True) + self.assertLengthEqual(new_shard_ranges, 5) + # Second half is still there, and unchanged + self.assertIn( + dict(orig_root_shard_ranges[1], meta_timestamp=None, + state_timestamp=None), + [dict(sr, meta_timestamp=None, state_timestamp=None) + for sr in new_shard_ranges]) + # But the first half split in three, then deleted + by_name = {sr.name: sr for sr in new_shard_ranges} + self.assertIn(orig_root_shard_ranges[0]['name'], by_name) + old_shard_range = by_name.pop( + orig_root_shard_ranges[0]['name']) + self.assertTrue(old_shard_range.deleted) + self.assert_shard_ranges_contiguous(4, by_name.values()) + else: + # Everyone's on the same page. 
Well, except for + # meta_timestamps, since the shards each reported + other_shard_ranges = broker.get_shard_ranges( + include_deleted=True) + self.assert_shard_range_lists_equal( + new_shard_ranges, other_shard_ranges, + excludes=['meta_timestamp', 'state_timestamp']) + for orig, updated in zip(orig_root_shard_ranges, + other_shard_ranges): + self.assertGreaterEqual(updated.meta_timestamp, + orig['meta_timestamp']) + + self.assert_container_delete_fails() + + for obj in final_listing: + client.delete_object( + self.url, self.token, self.container_name, obj['name']) + + # the objects won't be listed anymore + self.assert_container_listing([]) + # but root container stats will not yet be aware of the deletions + self.assert_container_delete_fails() + + # One server was down while the shard sharded its first two sub-shards, + # so there may be undeleted handoff db(s) for sub-shard(s) that were + # not fully replicated; run replicators now to clean up so they no + # longer report bogus stats to root. + self.replicators.once() + + # Run sharder so that shard containers update the root. Do not run + # sharder on root container because that triggers shrinks which can + # cause root object count to temporarily be non-zero and prevent the + # final delete. + self.run_sharders(self.get_container_shard_ranges()) + # then root is empty and can be deleted + self.assert_container_listing([]) + self.assert_container_object_count(0) + client.delete_container(self.url, self.token, self.container_name) + + def test_sharded_listing_no_replicators(self): + self._test_sharded_listing() + + def test_sharded_listing_with_replicators(self): + self._test_sharded_listing(run_replicators=True) + + def test_async_pendings(self): + obj_names = self._make_object_names(self.max_shard_size * 2) + + # There are some updates *everyone* gets + self.put_objects(obj_names[::5]) + # But roll some outages so each container only get ~2/5 more object + # records i.e. 
total of 3/5 updates per container; and async pendings + # pile up + for i, n in enumerate(self.brain.node_numbers, start=1): + self.brain.servers.stop(number=n) + self.put_objects(obj_names[i::5]) + self.brain.servers.start(number=n) + + # But there are also 1/5 updates *no one* gets + self.brain.servers.stop() + self.put_objects(obj_names[4::5]) + self.brain.servers.start() + + # Shard it + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + headers = client.head_container(self.url, self.admin_token, + self.container_name) + self.assertEqual('True', headers.get('x-container-sharding')) + + # sanity check + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 0) + self.assertLengthEqual(found['normal_dbs'], 3) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # Only run the 'leader' in charge of scanning. + # Each container has ~2 * max * 3/5 objects + # which are distributed from obj000 to obj<2 * max - 1>, + # so expect 3 shard ranges to be found: the first two will be complete + # shards with max/2 objects and lower/upper bounds spaced by approx: + # (2 * max - 1)/(2 * max * 3/5) * (max/2) =~ 5/6 * max + # + # Note that during this shard cycle the leader replicates to other + # nodes so they will end up with ~2 * max * 4/5 objects. 
+ self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # Verify that we have one shard db -- though the other normal DBs + # received the shard ranges that got defined + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + node_index_zero_db = found['shard_dbs'][0] + broker = ContainerBroker(node_index_zero_db) + self.assertIs(True, broker.is_root_container()) + self.assertEqual(SHARDING, broker.get_db_state()) + expected_shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(expected_shard_ranges, 3) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED], + [sr.state for sr in expected_shard_ranges]) + + # Still have all three big DBs -- we've only cleaved 2 of the 3 shard + # ranges that got defined + self.assertLengthEqual(found['normal_dbs'], 3) + db_states = [] + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + db_states.append(broker.get_db_state()) + # the sharded db had shard range meta_timestamps updated during + # cleaving, so we do not expect those to be equal on other nodes + self.assert_shard_range_lists_equal( + expected_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state_timestamp', 'state']) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + self.assertEqual([SHARDING, UNSHARDED, UNSHARDED], sorted(db_states)) + + # Run the other sharders so we're all in (roughly) the same state + for n in self.brain.node_numbers[1:]: + self.sharders.once( + number=n, + additional_args='--partitions=%s' % self.brain.part) + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 3) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertEqual(SHARDING, broker.get_db_state()) + # 
no new rows + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # Run updaters to clear the async pendings + Manager(['object-updater']).once() + + # Our "big" dbs didn't take updates + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # TODO: confirm that the updates got redirected to the shards + + # The entire listing is not yet available - we have two cleaved shard + # ranges, complete with async updates, but for the remainder of the + # namespace only what landed in the original container + headers, listing = client.get_container(self.url, self.token, + self.container_name) + start_listing = [ + o for o in obj_names if o <= expected_shard_ranges[1].upper] + self.assertEqual( + [x['name'].encode('utf-8') for x in listing[:len(start_listing)]], + start_listing) + # we can't assert much about the remaining listing, other than that + # there should be something + self.assertTrue( + [x['name'].encode('utf-8') for x in listing[len(start_listing):]]) + # Object count is hard to reason about though! + # TODO: nail down what this *should* be and make sure all containers + # respond with it! Depending on what you're looking at, this + # could be 0, 1/2, 7/12 (!?), 3/5, 2/3, or 4/5 or all objects! + # Apparently, it may not even be present at all! 
+ # self.assertIn('x-container-object-count', headers) + # self.assertEqual(headers['x-container-object-count'], + # str(len(obj_names) - len(obj_names) // 6)) + + # TODO: Doesn't work in reverse, yet + # headers, listing = client.get_container(self.url, self.token, + # self.container_name, + # query_string='reverse=on') + # self.assertEqual([x['name'].encode('utf-8') for x in listing], + # obj_names[::-1]) + + # Run the sharders again to get everything to settle + self.sharders.once() + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + # now all shards have been cleaved we should get the complete listing + headers, listing = client.get_container(self.url, self.token, + self.container_name) + self.assertEqual([x['name'].encode('utf-8') for x in listing], + obj_names) + + def test_shrinking(self): + int_client = self.make_internal_client() + + def check_node_data(node_data, exp_hdrs, exp_obj_count, exp_shards): + hdrs, range_data = node_data + self.assert_dict_contains(exp_hdrs, hdrs) + self.assert_shard_ranges_contiguous(exp_shards, range_data) + self.assert_total_object_count(exp_obj_count, range_data) + + def check_shard_nodes_data(node_data, expected_state='unsharded', + expected_shards=0, exp_obj_count=0): + # checks that shard range is consistent on all nodes + root_path = '%s/%s' % (self.account, self.container_name) + exp_shard_hdrs = {'X-Container-Sysmeta-Shard-Root': root_path, + 'X-Backend-Sharding-State': expected_state} + object_counts = [] + bytes_used = [] + for node_id, node_data in node_data.items(): + with annotate_failure('Node id %s.' 
% node_id): + check_node_data( + node_data, exp_shard_hdrs, exp_obj_count, + expected_shards) + hdrs = node_data[0] + object_counts.append(int(hdrs['X-Container-Object-Count'])) + bytes_used.append(int(hdrs['X-Container-Bytes-Used'])) + if len(set(object_counts)) != 1: + self.fail('Inconsistent object counts: %s' % object_counts) + if len(set(bytes_used)) != 1: + self.fail('Inconsistent bytes used: %s' % bytes_used) + return object_counts[0], bytes_used[0] + + repeat = [0] + + def do_shard_then_shrink(): + repeat[0] += 1 + obj_names = ['obj-%s-%03d' % (repeat[0], x) + for x in range(self.max_shard_size)] + self.put_objects(obj_names) + # these two object names will fall at start of first shard range... + alpha = 'alpha-%s' % repeat[0] + beta = 'beta-%s' % repeat[0] + + # Enable sharding + client.post_container( + self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + + # sanity check + self.assert_container_listing(obj_names) + + # Only run the one in charge of scanning + self.sharders.once( + number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + + # nodes on which sharder has not run are still in unsharded state + # but have had shard ranges replicated to them + exp_obj_count = len(obj_names) + exp_hdrs = {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': str(exp_obj_count)} + node_id = self.brain.node_numbers[1] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + node_id = self.brain.node_numbers[2] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + + # only one that ran sharder is in sharded state + exp_hdrs['X-Backend-Sharding-State'] = 'sharded' + node_id = self.brain.node_numbers[0] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + + 
orig_range_data = root_nodes_data[node_id][1] + orig_shard_ranges = [ShardRange.from_dict(r) + for r in orig_range_data] + + # check first shard + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[0].account, orig_shard_ranges[0].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + total_shard_object_count = obj_count + + # check second shard + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + total_shard_object_count += obj_count + self.assertEqual(exp_obj_count, total_shard_object_count) + + # Now that everyone has shard ranges, run *everyone* + self.sharders.once( + additional_args='--partitions=%s' % self.brain.part) + + # all root container nodes should now be in sharded state + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' 
% node_id): + check_node_data(node_data, exp_hdrs, exp_obj_count, 2) + + # run updaters to update .sharded account; shard containers have + # not updated account since having objects replicated to them + self.updaters.once() + shard_cont_count, shard_obj_count = int_client.get_account_info( + orig_shard_ranges[0].account, [204]) + self.assertEqual(2 * repeat[0], shard_cont_count) + self.assertEqual(len(obj_names), shard_obj_count) + + # checking the listing also refreshes proxy container info cache so + # that the proxy becomes aware that container is sharded and will + # now look up the shard target for subsequent updates + self.assert_container_listing(obj_names) + + # delete objects from first shard range + first_shard_objects = [obj_name for obj_name in obj_names + if obj_name <= orig_shard_ranges[0].upper] + for obj in first_shard_objects: + client.delete_object( + self.url, self.token, self.container_name, obj) + with self.assertRaises(ClientException): + client.get_object( + self.url, self.token, self.container_name, obj) + + second_shard_objects = [obj_name for obj_name in obj_names + if obj_name > orig_shard_ranges[1].lower] + self.assert_container_listing(second_shard_objects) + + self.put_objects([alpha]) + second_shard_objects = [obj_name for obj_name in obj_names + if obj_name > orig_shard_ranges[1].lower] + self.assert_container_listing([alpha] + second_shard_objects) + + # while container servers are down, but proxy has container info in + # cache from recent listing, put another object; this update will + # lurk in async pending until the updaters run again + # TODO: because all the root container servers are down and + # therefore cannot respond to a GET for a redirect target, the + # object update will default to being targeted at the root + # container; can we provoke an object update that does get targeted + # to the shard, but fails to update shard, so that the async + # pending will first be directed to the shard when the updaters + # run? 
+ self.stop_container_servers() + self.put_objects([beta]) + self.brain.servers.start() + async_pendings = self.gather_async_pendings( + self.get_all_object_nodes()) + num_container_replicas = len(self.brain.nodes) + num_obj_replicas = self.policy.object_ring.replica_count + expected_num_updates = num_container_updates( + num_container_replicas, quorum_size(num_container_replicas), + num_obj_replicas, self.policy.quorum) + expected_num_pendings = min(expected_num_updates, num_obj_replicas) + # sanity check + with annotate_failure('policy %s. ' % self.policy): + self.assertLengthEqual(async_pendings, expected_num_pendings) + + # root object count is not updated... + self.assert_container_object_count(len(obj_names)) + self.assert_container_listing([alpha] + second_shard_objects) + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' % node_id): + check_node_data(node_data, exp_hdrs, exp_obj_count, 2) + range_data = node_data[1] + self.assert_shard_range_lists_equal( + orig_range_data, range_data, + excludes=['meta_timestamp', 'state_timestamp']) + + # ...until the sharders run and update root + self.run_sharders(orig_shard_ranges[0]) + exp_obj_count = len(second_shard_objects) + 1 + self.assert_container_object_count(exp_obj_count) + self.assert_container_listing([alpha] + second_shard_objects) + + # root sharder finds donor, acceptor pair and pushes changes + self.sharders.once( + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_listing([alpha] + second_shard_objects) + # run sharder on donor to shrink and replicate to acceptor + self.run_sharders(orig_shard_ranges[0]) + self.assert_container_listing([alpha] + second_shard_objects) + # run sharder on acceptor to update root with stats + self.run_sharders(orig_shard_ranges[1]) + self.assert_container_listing([alpha] + second_shard_objects) + 
self.assert_container_object_count(len(second_shard_objects) + 1) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + exp_hdrs['X-Container-Object-Count'] = str(exp_obj_count) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' % node_id): + # NB now only *one* shard range in root + check_node_data(node_data, exp_hdrs, exp_obj_count, 1) + + # the acceptor shard is intact.. + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + # all objects should now be in this shard + self.assertEqual(exp_obj_count, obj_count) + + # the donor shard is also still intact + # TODO: once we have figured out when these redundant donors are + # deleted, test for deletion/clean up + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[0].account, orig_shard_ranges[0].container) + # the donor's shard range will have the acceptor's projected stats + obj_count, bytes_used = check_shard_nodes_data( + shard_nodes_data, expected_state='sharded', expected_shards=1, + exp_obj_count=len(second_shard_objects) + 1) + # but the donor is empty and so reports zero stats + self.assertEqual(0, obj_count) + self.assertEqual(0, bytes_used) + + # delete all the second shard's objects apart from 'alpha' + for obj in second_shard_objects: + client.delete_object( + self.url, self.token, self.container_name, obj) + + self.assert_container_listing([alpha]) + + # run sharders so second range shrinks away, requires up to 3 + # cycles + self.sharders.once() # shard updates root stats + self.assert_container_listing([alpha]) + self.sharders.once() # root finds shrinkable shard + self.assert_container_listing([alpha]) + self.sharders.once() # shards shrink themselves + self.assert_container_listing([alpha]) + + # the second shard 
range has sharded and is empty + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + check_shard_nodes_data( + shard_nodes_data, expected_state='sharded', expected_shards=1, + exp_obj_count=1) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + exp_hdrs = {'X-Backend-Sharding-State': 'collapsed', + # just the alpha object + 'X-Container-Object-Count': '1'} + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' % node_id): + # NB now no shard ranges in root + check_node_data(node_data, exp_hdrs, 0, 0) + + # delete the alpha object + client.delete_object( + self.url, self.token, self.container_name, alpha) + # should now be able to delete the *apparently* empty container + client.delete_container(self.url, self.token, self.container_name) + self.assert_container_not_found() + self.direct_head_container(expect_failure=True) + + # and the container stays deleted even after sharders run and + # shards send updates + self.sharders.once() + self.assert_container_not_found() + self.direct_head_container(expect_failure=True) + + # now run updaters to deal with the async pending for the beta + # object + self.updaters.once() + # and the container is revived! + self.assert_container_listing([beta]) + + # finally, clear out the container + client.delete_object( + self.url, self.token, self.container_name, beta) + + do_shard_then_shrink() + # repeat from starting point of a collapsed and previously deleted + # container + do_shard_then_shrink() + + def _setup_replication_scenario(self, num_shards, extra_objs=('alpha',)): + # Get cluster to state where 2 replicas are sharding or sharded but 3rd + # replica is unsharded and has an object that the first 2 are missing. 
+ + # put objects while all servers are up + obj_names = self._make_object_names( + num_shards * self.max_shard_size / 2) + self.put_objects(obj_names) + + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + node_numbers = self.brain.node_numbers + + # run replicators first time to get sync points set + self.replicators.once() + + # stop the leader node and one other server + self.stop_container_servers(slice(0, 2)) + + # ...then put one more object in first shard range namespace + self.put_objects(extra_objs) + + # start leader and first other server, stop third server + for number in node_numbers[:2]: + self.brain.servers.start(number=number) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) # sanity check + + # shard the container - first two shard ranges are cleaved + for number in node_numbers[:2]: + self.sharders.once( + number=number, + additional_args='--partitions=%s' % self.brain.part) + + self.assert_container_listing(obj_names) # sanity check + return obj_names + + def test_replication_to_sharding_container(self): + # verify that replication from an unsharded replica to a sharding + # replica does not replicate rows but does replicate shard ranges + obj_names = self._setup_replication_scenario(3) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharding', 3) + + # bring third server back up, run replicator + node_numbers = self.brain.node_numbers + self.brain.servers.start(number=node_numbers[2]) + # sanity check... 
+ self.assert_container_state(self.brain.nodes[2], 'unsharded', 0) + self.replicators.once(number=node_numbers[2]) + # check db files unchanged + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 3) + + # the 'alpha' object is NOT replicated to the two sharded nodes + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertFalse(broker.get_objects()) + self.assert_container_state(node, 'sharding', 3) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) + + # all nodes now have shard ranges + self.brain.servers.start(number=node_numbers[2]) + node_data = self.direct_get_container_shard_ranges() + for node, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure(node): + self.assert_shard_ranges_contiguous(3, shard_ranges) + + # complete cleaving third shard range on first two nodes + self.brain.servers.stop(number=node_numbers[2]) + for number in node_numbers[:2]: + self.sharders.once( + number=number, + additional_args='--partitions=%s' % self.brain.part) + # ...and now they are in sharded state + self.assert_container_state(self.brain.nodes[0], 'sharded', 3) + self.assert_container_state(self.brain.nodes[1], 'sharded', 3) + # ...still no 'alpha' object in listing + self.assert_container_listing(obj_names) + + # run the sharder on the third server, alpha object is included in + # shards that it cleaves + self.brain.servers.start(number=node_numbers[2]) + self.assert_container_state(self.brain.nodes[2], 'unsharded', 3) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(self.brain.nodes[2], 'sharding', 3) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + 
self.assert_container_state(self.brain.nodes[2], 'sharded', 3) + self.assert_container_listing(['alpha'] + obj_names) + + def test_replication_to_sharded_container(self): + # verify that replication from an unsharded replica to a sharded + # replica does not replicate rows but does replicate shard ranges + obj_names = self._setup_replication_scenario(2) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharded', 2) + + # sanity check + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 1) + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + info = broker.get_info() + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertEqual(len(obj_names), info['object_count']) + self.assertFalse(broker.get_objects()) + + # bring third server back up, run replicator + node_numbers = self.brain.node_numbers + self.brain.servers.start(number=node_numbers[2]) + # sanity check... 
+ self.assert_container_state(self.brain.nodes[2], 'unsharded', 0) + self.replicators.once(number=node_numbers[2]) + # check db files unchanged + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 1) + + # the 'alpha' object is NOT replicated to the two sharded nodes + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertFalse(broker.get_objects()) + self.assert_container_state(node, 'sharded', 2) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) + + # all nodes now have shard ranges + self.brain.servers.start(number=node_numbers[2]) + node_data = self.direct_get_container_shard_ranges() + for node, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure(node): + self.assert_shard_ranges_contiguous(2, shard_ranges) + + # run the sharder on the third server, alpha object is included in + # shards that it cleaves + self.assert_container_state(self.brain.nodes[2], 'unsharded', 2) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(self.brain.nodes[2], 'sharded', 2) + self.assert_container_listing(['alpha'] + obj_names) + + def test_sharding_requires_sufficient_replication(self): + # verify that cleaving only progresses if each cleaved shard range is + # sufficiently replicated + + # put enough objects for 4 shard ranges + obj_names = self._make_object_names(2 * self.max_shard_size) + self.put_objects(obj_names) + + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + node_numbers = self.brain.node_numbers + leader_node = self.brain.nodes[0] + leader_num = node_numbers[0] + + # run replicators first time to get sync points set + self.replicators.once() + + # 
start sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop *all* container servers for third shard range + sr_part, sr_node_nums = self.get_part_and_node_numbers(shard_ranges[2]) + for node_num in sr_node_nums: + self.brain.servers.stop(number=node_num) + + # attempt to continue sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # no cleaving progress was made + for node_num in sr_node_nums: + self.brain.servers.start(number=node_num) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop two of the servers for third shard range, not including any + # server that happens to be the leader node + stopped = [] + for node_num in sr_node_nums: + if node_num != leader_num: + self.brain.servers.stop(number=node_num) + stopped.append(node_num) + if len(stopped) >= 2: + break + self.assertLengthEqual(stopped, 2) # sanity check + + # attempt to continue sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # no cleaving progress was made + for node_num in stopped: + self.brain.servers.start(number=node_num) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop just one of the servers for third shard range + stopped = [] + for node_num in sr_node_nums: + if node_num != leader_num: + self.brain.servers.stop(number=node_num) + stopped.append(node_num) + break + self.assertLengthEqual(stopped, 1) # 
sanity check + + # attempt to continue sharding the container + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # this time cleaving completed + self.brain.servers.start(number=stopped[0]) + shard_ranges = self.assert_container_state(leader_node, 'sharded', 4) + self.assertEqual([ShardRange.ACTIVE] * 4, + [sr.state for sr in shard_ranges]) + + def test_sharded_delete(self): + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + # root not yet updated with shard stats + self.assert_container_object_count(len(all_obj_names)) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_listing([]) + self.assert_container_post_ok('empty') + self.assert_container_object_count(0) + + # put a new object - update redirected to shard + self.put_objects(['alpha']) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) + + # before root learns about new object in shard, delete the container + client.delete_container(self.url, 
self.token, self.container_name) + self.assert_container_post_fails('deleted') + self.assert_container_not_found() + + # run the sharders to update root with shard stats + self.run_sharders(shard_ranges) + + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assert_container_delete_fails() + self.assert_container_post_ok('revived') + + def test_object_update_redirection(self): + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_object_count(0) + + # First, test a misplaced object moving from one shard to another. + # with one shard server down, put a new 'alpha' object... 
+ shard_part, shard_nodes = self.get_part_and_node_numbers( + shard_ranges[0]) + self.brain.servers.stop(number=shard_nodes[2]) + self.put_objects(['alpha']) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) + self.assertLengthEqual( + self.gather_async_pendings(self.get_all_object_nodes()), 1) + self.brain.servers.start(number=shard_nodes[2]) + + # run sharder on root to discover first shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node without the alpha object + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=shard_nodes[2]) + # root sees first shard has shrunk, only second shard range used for + # listing so alpha object not in listing + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + self.assert_container_listing([]) + self.assert_container_object_count(0) + + # run the updaters: the async pending update will be redirected from + # shrunk shard to second shard + self.updaters.once() + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) # root not yet updated + + # then run sharder on other shard nodes to complete shrinking + for number in shard_nodes[:2]: + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=number) + # and get root updated + self.run_sharders(shard_ranges[1]) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + + # Now we have just one active shard, test a misplaced object moving + # from that shard to the root. + # with one shard server down, delete 'alpha' and put a 'beta' object... 
+ shard_part, shard_nodes = self.get_part_and_node_numbers( + shard_ranges[1]) + self.brain.servers.stop(number=shard_nodes[2]) + self.delete_objects(['alpha']) + self.put_objects(['beta']) + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + self.assertLengthEqual( + self.gather_async_pendings(self.get_all_object_nodes()), 2) + self.brain.servers.start(number=shard_nodes[2]) + + # run sharder on root to discover second shrink candidate - root is not + # yet aware of the beta object + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node without the beta object, to shrink + # it to root - note this moves stale copy of alpha to the root db + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=shard_nodes[2]) + # now there are no active shards + self.assertFalse(self.get_container_shard_ranges()) + + # with other two shard servers down, listing won't find beta object + for number in shard_nodes[:2]: + self.brain.servers.stop(number=number) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + + # run the updaters: the async pending update will be redirected from + # shrunk shard to the root + self.updaters.once() + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + + def test_misplaced_object_movement(self): + def merge_object(shard_range, name, deleted=0): + # it's hard to get a test to put a misplaced object into a shard, + # so this hack forces an object record directly into a shard + # container db. Note: the actual object won't exist, we're just + # using this to test object records in container dbs. 
+ shard_part, shard_nodes = self.brain.ring.get_nodes( + shard_range.account, shard_range.container) + shard_broker = self.get_broker( + shard_part, shard_nodes[0], shard_range.account, + shard_range.container) + shard_broker.merge_items( + [{'name': name, 'created_at': Timestamp.now().internal, + 'size': 0, 'content_type': 'text/plain', + 'etag': hashlib.md5().hexdigest(), 'deleted': deleted}]) + return shard_nodes[0] + + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_object_count(0) + + # First, test a misplaced object moving from one shard to another. 
+ # run sharder on root to discover first shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on first shard range to shrink it + shard_part, shard_nodes_numbers = self.get_part_and_node_numbers( + shard_ranges[0]) + self.sharders.once(additional_args='--partitions=%s' % shard_part) + # force a misplaced object into the shrunken shard range to simulate + # a client put that was in flight when it started to shrink + misplaced_node = merge_object(shard_ranges[0], 'alpha', deleted=0) + # root sees first shard has shrunk, only second shard range used for + # listing so alpha object not in listing + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + self.assert_container_listing([]) + self.assert_container_object_count(0) + # until sharder runs on that node to move the misplaced object to the + # second shard range + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=misplaced_node['id'] + 1) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) # root not yet updated + + # run sharder to get root updated + self.run_sharders(shard_ranges[1]) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + + # Now we have just one active shard, test a misplaced object moving + # from that shard to the root. 
+ # run sharder on root to discover second shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node to shrink it to root - note this + # moves alpha to the root db + shard_part, shard_nodes_numbers = self.get_part_and_node_numbers( + shard_ranges[1]) + self.sharders.once(additional_args='--partitions=%s' % shard_part) + # now there are no active shards + self.assertFalse(self.get_container_shard_ranges()) + + # force some misplaced object updates into second shrunk shard range + merge_object(shard_ranges[1], 'alpha', deleted=1) + misplaced_node = merge_object(shard_ranges[1], 'beta', deleted=0) + # root is not yet aware of them + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + # until sharder runs on that node to move the misplaced object + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=misplaced_node['id'] + 1) + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + self.assert_container_delete_fails() + + def test_replication_to_sharded_container_from_unsharded_old_primary(self): + primary_ids = [n['id'] for n in self.brain.nodes] + handoff_node = next(n for n in self.brain.ring.devs + if n['id'] not in primary_ids) + + # start with two sharded replicas and one unsharded with extra object + obj_names = self._setup_replication_scenario(2) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharded', 2) + + # Fake a ring change - copy unsharded db which has no shard ranges to a + # handoff to create illusion of a new unpopulated primary node + node_numbers = self.brain.node_numbers + new_primary_node = self.brain.nodes[2] + new_primary_node_number = node_numbers[2] + new_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, new_primary_node) + old_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, handoff_node) + 
utils.mkdirs(os.path.dirname(old_primary_dir)) + os.rename(new_primary_dir, old_primary_dir) + + # make the cluster more or less "healthy" again + self.brain.servers.start(number=new_primary_node_number) + + # get a db on every node... + client.put_container(self.url, self.token, self.container_name) + self.assertTrue(os.path.exists(os.path.join( + new_primary_dir, container_hash + '.db'))) + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['normal_dbs'], 1) # "new" primary + self.assertLengthEqual(found['shard_dbs'], 2) # existing primaries + + # catastrophic failure! drive dies and is replaced on unchanged primary + failed_node = self.brain.nodes[0] + failed_dir, _container_hash = self.get_storage_dir( + self.brain.part, failed_node) + shutil.rmtree(failed_dir) + + # replicate the "old primary" to everybody except the "new primary" + self.brain.servers.stop(number=new_primary_node_number) + self.replicators.once(number=handoff_node['id'] + 1) + + # We're willing to rsync the retiring db to the failed primary. + # This may or may not have shard ranges, depending on the order in + # which we hit the primaries, but it definitely *doesn't* have an + # epoch in its name yet. All objects are replicated. + self.assertTrue(os.path.exists(os.path.join( + failed_dir, container_hash + '.db'))) + self.assertLengthEqual(os.listdir(failed_dir), 1) + broker = self.get_broker(self.brain.part, failed_node) + self.assertLengthEqual(broker.get_objects(), len(obj_names) + 1) + + # The other out-of-date primary is within usync range but objects are + # not replicated to it because the handoff db learns about shard ranges + broker = self.get_broker(self.brain.part, self.brain.nodes[1]) + self.assertLengthEqual(broker.get_objects(), 0) + + # Handoff db still exists and now has shard ranges! 
+ self.assertTrue(os.path.exists(os.path.join( + old_primary_dir, container_hash + '.db'))) + broker = self.get_broker(self.brain.part, handoff_node) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.assert_container_state(handoff_node, 'unsharded', 2) + + # Replicate again, this time *including* "new primary" + self.brain.servers.start(number=new_primary_node_number) + self.replicators.once(number=handoff_node['id'] + 1) + + # Ordinarily, we would have rsync_then_merge'd to "new primary" + # but instead we wait + broker = self.get_broker(self.brain.part, new_primary_node) + self.assertLengthEqual(broker.get_objects(), 0) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + + # so the next time the sharder comes along, it can push rows out + # and delete the big db + self.sharders.once(number=handoff_node['id'] + 1, + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(handoff_node, 'sharded', 2) + self.assertFalse(os.path.exists(os.path.join( + old_primary_dir, container_hash + '.db'))) + # the sharded db hangs around until replication confirms durability + # first attempt is not sufficiently successful + self.brain.servers.stop(number=node_numbers[0]) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertTrue(os.path.exists(old_primary_dir)) + self.assert_container_state(handoff_node, 'sharded', 2) + # second attempt is successful and handoff db is deleted + self.brain.servers.start(number=node_numbers[0]) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertFalse(os.path.exists(old_primary_dir)) + + # run all the sharders, get us into a consistent state + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + self.assert_container_listing(['alpha'] + obj_names) + + def test_replication_to_empty_new_primary_from_sharding_old_primary(self): + primary_ids = [n['id'] for n in self.brain.nodes] + handoff_node = 
next(n for n in self.brain.ring.devs + if n['id'] not in primary_ids) + num_shards = 3 + obj_names = self._make_object_names( + num_shards * self.max_shard_size / 2) + self.put_objects(obj_names) + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + + # run replicators first time to get sync points set + self.replicators.once() + # start sharding on only the leader node + leader_node = self.brain.nodes[0] + leader_node_number = self.brain.node_numbers[0] + self.sharders.once(number=leader_node_number) + self.assert_container_state(leader_node, 'sharding', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'unsharded', 3) + + # Fake a ring change - copy leader node db to a handoff to create + # illusion of a new unpopulated primary leader node + new_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, leader_node) + old_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, handoff_node) + utils.mkdirs(os.path.dirname(old_primary_dir)) + os.rename(new_primary_dir, old_primary_dir) + self.assert_container_state(handoff_node, 'sharding', 3) + + # run replicator on handoff node to create a fresh db on new primary + self.assertFalse(os.path.exists(new_primary_dir)) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertTrue(os.path.exists(new_primary_dir)) + self.assert_container_state(leader_node, 'sharded', 3) + broker = self.get_broker(self.brain.part, leader_node) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 3) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED], + [sr.state for sr in shard_ranges]) + + # db still exists on handoff + self.assertTrue(os.path.exists(old_primary_dir)) + self.assert_container_state(handoff_node, 'sharding', 3) + # continue sharding it... 
+ self.sharders.once(number=handoff_node['id'] + 1) + self.assert_container_state(leader_node, 'sharded', 3) + # now handoff is fully sharded the replicator will delete it + self.replicators.once(number=handoff_node['id'] + 1) + self.assertFalse(os.path.exists(old_primary_dir)) + + # all primaries now have active shard ranges but only one is in sharded + # state + self.assert_container_state(leader_node, 'sharded', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'unsharded', 3) + node_data = self.direct_get_container_shard_ranges() + for node_id, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure( + 'node id %s from %s' % (node_id, node_data.keys)): + self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges) + + # check handoff cleaved all objects before it was deleted - stop all + # but leader node so that listing is fetched from shards + for number in self.brain.node_numbers[1:3]: + self.brain.servers.stop(number=number) + + self.assert_container_listing(obj_names) + + for number in self.brain.node_numbers[1:3]: + self.brain.servers.start(number=number) + + self.sharders.once() + self.assert_container_state(leader_node, 'sharded', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'sharding', 3) + self.sharders.once() + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 3) + + self.assert_container_listing(obj_names) diff --git a/test/unit/__init__.py b/test/unit/__init__.py index 2e611806a4..278c55a4ca 100644 --- a/test/unit/__init__.py +++ b/test/unit/__init__.py @@ -1343,3 +1343,46 @@ def unlink_files(paths): except OSError as err: if err.errno != errno.ENOENT: raise + + +class FakeHTTPResponse(object): + + def __init__(self, resp): + self.resp = resp + + @property + def status(self): + return self.resp.status_int + + @property + def data(self): + return self.resp.body + + +def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None): + class 
FakeReplConnection(object): + + def __init__(self, node, partition, hash_, logger): + self.logger = logger + self.node = node + self.partition = partition + self.path = '/%s/%s/%s' % (node['device'], partition, hash_) + self.host = node['replication_ip'] + + def replicate(self, op, *sync_args): + print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args)) + resp = None + if errors and op in errors and errors[op]: + resp = errors[op].pop(0) + if not resp: + replicate_args = self.path.lstrip('/').split('/') + args = [op] + copy.deepcopy(list(sync_args)) + with mock_check_drive(isdir=not rpc.mount_check, + ismount=rpc.mount_check): + swob_response = rpc.dispatch(replicate_args, args) + resp = FakeHTTPResponse(swob_response) + if replicate_hook: + replicate_hook(op, *sync_args) + return resp + + return FakeReplConnection diff --git a/test/unit/cli/test_manage_shard_ranges.py b/test/unit/cli/test_manage_shard_ranges.py new file mode 100644 index 0000000000..8cefa5b19c --- /dev/null +++ b/test/unit/cli/test_manage_shard_ranges.py @@ -0,0 +1,362 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from __future__ import unicode_literals + +import json +import os +import unittest +import mock +from shutil import rmtree +from tempfile import mkdtemp + +from six.moves import cStringIO as StringIO + +from swift.cli.manage_shard_ranges import main +from swift.common import utils +from swift.common.utils import Timestamp, ShardRange +from swift.container.backend import ContainerBroker +from test.unit import mock_timestamp_now + + +class TestManageShardRanges(unittest.TestCase): + def setUp(self): + self.testdir = os.path.join(mkdtemp(), 'tmp_test_cli_find_shards') + utils.mkdirs(self.testdir) + rmtree(self.testdir) + self.shard_data = [ + {'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10}, + {'index': 1, 'lower': 'obj09', 'upper': 'obj19', + 'object_count': 10}, + {'index': 2, 'lower': 'obj19', 'upper': 'obj29', + 'object_count': 10}, + {'index': 3, 'lower': 'obj29', 'upper': 'obj39', + 'object_count': 10}, + {'index': 4, 'lower': 'obj39', 'upper': 'obj49', + 'object_count': 10}, + {'index': 5, 'lower': 'obj49', 'upper': 'obj59', + 'object_count': 10}, + {'index': 6, 'lower': 'obj59', 'upper': 'obj69', + 'object_count': 10}, + {'index': 7, 'lower': 'obj69', 'upper': 'obj79', + 'object_count': 10}, + {'index': 8, 'lower': 'obj79', 'upper': 'obj89', + 'object_count': 10}, + {'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10}, + ] + + def tearDown(self): + rmtree(os.path.dirname(self.testdir)) + + def assert_starts_with(self, value, prefix): + self.assertTrue(value.startswith(prefix), + "%r does not start with %r" % (value, prefix)) + + def assert_formatted_json(self, output, expected): + try: + loaded = json.loads(output) + except ValueError as err: + self.fail('Invalid JSON: %s\n%r' % (err, output)) + # Check this one first, for a prettier diff + self.assertEqual(loaded, expected) + formatted = json.dumps(expected, sort_keys=True, indent=2) + '\n' + self.assertEqual(output, formatted) + + def _make_broker(self, account='a', 
container='c', + device='sda', part=0): + datadir = os.path.join( + self.testdir, device, 'containers', str(part), 'ash', 'hash') + db_file = os.path.join(datadir, 'hash.db') + broker = ContainerBroker( + db_file, account=account, container=container) + broker.initialize() + return broker + + def test_find_shard_ranges(self): + db_file = os.path.join(self.testdir, 'hash.db') + broker = ContainerBroker(db_file) + broker.account = 'a' + broker.container = 'c' + broker.initialize() + ts = utils.Timestamp.now() + broker.merge_items([ + {'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0, + 'content_type': 'application/octet-stream', 'etag': 'not-really', + 'deleted': 0, 'storage_policy_index': 0, + 'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal} + for i in range(100)]) + + # Default uses a large enough value that sharding isn't required + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find']) + self.assert_formatted_json(out.getvalue(), []) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 0 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '100']) + self.assert_formatted_json(out.getvalue(), []) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 0 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '99']) + self.assert_formatted_json(out.getvalue(), [ + {'index': 0, 'lower': '', 'upper': 'obj98', 'object_count': 99}, + {'index': 1, 'lower': 'obj98', 'upper': '', 'object_count': 1}, + ]) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db 
broker for ') + self.assert_starts_with(err_lines[1], 'Found 2 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '10']) + self.assert_formatted_json(out.getvalue(), [ + {'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10}, + {'index': 1, 'lower': 'obj09', 'upper': 'obj19', + 'object_count': 10}, + {'index': 2, 'lower': 'obj19', 'upper': 'obj29', + 'object_count': 10}, + {'index': 3, 'lower': 'obj29', 'upper': 'obj39', + 'object_count': 10}, + {'index': 4, 'lower': 'obj39', 'upper': 'obj49', + 'object_count': 10}, + {'index': 5, 'lower': 'obj49', 'upper': 'obj59', + 'object_count': 10}, + {'index': 6, 'lower': 'obj59', 'upper': 'obj69', + 'object_count': 10}, + {'index': 7, 'lower': 'obj69', 'upper': 'obj79', + 'object_count': 10}, + {'index': 8, 'lower': 'obj79', 'upper': 'obj89', + 'object_count': 10}, + {'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10}, + ]) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 10 ranges in ') + + def test_info(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: None', + 'db_state = unsharded', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + retiring_db_id = broker.get_info()['id'] + broker.merge_shard_ranges(ShardRange('.shards/cc', Timestamp.now())) + epoch = Timestamp.now() + with mock_timestamp_now(epoch) as now: + broker.enable_sharding(epoch) + 
self.assertTrue(broker.set_sharding_state()) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now(now): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: {', + ' "bytes_used": 0, ', + ' "deleted": 0, ', + ' "epoch": "%s", ' % epoch.internal, + ' "lower": "", ', + ' "meta_timestamp": "%s", ' % now.internal, + ' "name": "a/c", ', + ' "object_count": 0, ', + ' "state": "sharding", ', + ' "state_timestamp": "%s", ' % now.internal, + ' "timestamp": "%s", ' % now.internal, + ' "upper": ""', + '}', + 'db_state = sharding', + 'Retiring db id: %s' % retiring_db_id, + 'Cleaving context: {', + ' "cleave_to_row": null, ', + ' "cleaving_done": false, ', + ' "cursor": "", ', + ' "last_cleave_to_row": null, ', + ' "max_row": -1, ', + ' "misplaced_done": false, ', + ' "ranges_done": 0, ', + ' "ranges_todo": 0, ', + ' "ref": "%s"' % retiring_db_id, + '}', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + self.assertTrue(broker.set_sharded_state()) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now(now): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: {', + ' "bytes_used": 0, ', + ' "deleted": 0, ', + ' "epoch": "%s", ' % epoch.internal, + ' "lower": "", ', + ' "meta_timestamp": "%s", ' % now.internal, + ' "name": "a/c", ', + ' "object_count": 0, ', + ' "state": "sharding", ', + ' "state_timestamp": "%s", ' % now.internal, + ' "timestamp": "%s", ' % now.internal, + ' "upper": ""', + '}', + 'db_state = sharded', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + 
err.getvalue().splitlines()) + + def test_replace(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + input_file = os.path.join(self.testdir, 'shards') + with open(input_file, 'wb') as fd: + json.dump(self.shard_data, fd) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'replace', input_file]) + expected = [ + 'No shard ranges found to delete.', + 'Injected 10 shard ranges.', + 'Run container-replicator to replicate them to other nodes.', + 'Use the enable sub-command to enable sharding.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self.assertEqual( + [(data['lower'], data['upper']) for data in self.shard_data], + [(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()]) + + def _assert_enabled(self, broker, epoch): + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(epoch, own_sr.epoch) + self.assertEqual(ShardRange.MIN, own_sr.lower) + self.assertEqual(ShardRange.MAX, own_sr.upper) + self.assertEqual( + 'True', broker.metadata['X-Container-Sysmeta-Sharding'][0]) + + def test_enable(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + # no shard ranges + out = StringIO() + err = StringIO() + with self.assertRaises(SystemExit): + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'enable']) + expected = ["WARNING: invalid shard ranges: ['No shard ranges.'].", + 'Aborting.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + # success + shard_ranges = [] + for data in self.shard_data: + path = ShardRange.make_path( + 
'.shards_a', 'c', 'c', Timestamp.now(), data['index']) + shard_ranges.append( + ShardRange(path, Timestamp.now(), data['lower'], + data['upper'], data['object_count'])) + broker.merge_shard_ranges(shard_ranges) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now() as now: + main([broker.db_file, 'enable']) + expected = [ + "Container moved to state 'sharding' with epoch %s." % + now.internal, + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + + # already enabled + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'enable']) + expected = [ + "Container already in state 'sharding' with epoch %s." % + now.internal, + 'No action required.', + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + + def test_find_replace_enable(self): + db_file = os.path.join(self.testdir, 'hash.db') + broker = ContainerBroker(db_file) + broker.account = 'a' + broker.container = 'c' + broker.initialize() + ts = utils.Timestamp.now() + broker.merge_items([ + {'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0, + 'content_type': 'application/octet-stream', 'etag': 'not-really', + 'deleted': 0, 'storage_policy_index': 0, + 'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal} + for i in range(100)]) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now() as now: + main([broker.db_file, 'find_and_replace', '10', '--enable']) + expected = [ + 'No shard ranges found to 
delete.', + 'Injected 10 shard ranges.', + 'Run container-replicator to replicate them to other nodes.', + "Container moved to state 'sharding' with epoch %s." % + now.internal, + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + self.assertEqual( + [(data['lower'], data['upper']) for data in self.shard_data], + [(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()]) diff --git a/test/unit/common/test_db_replicator.py b/test/unit/common/test_db_replicator.py index e4fdce8e91..21eedb9b7d 100644 --- a/test/unit/common/test_db_replicator.py +++ b/test/unit/common/test_db_replicator.py @@ -28,7 +28,6 @@ from tempfile import mkdtemp, NamedTemporaryFile import json import mock -from copy import deepcopy from mock import patch, call from six.moves import reload_module @@ -40,7 +39,7 @@ from swift.common.exceptions import DriveNotMounted from swift.common.swob import HTTPException from test import unit -from test.unit import FakeLogger +from test.unit import FakeLogger, attach_fake_replication_rpc from test.unit.common.test_db import ExampleBroker @@ -2054,49 +2053,6 @@ class TestReplToNode(unittest.TestCase): ]) -class FakeHTTPResponse(object): - - def __init__(self, resp): - self.resp = resp - - @property - def status(self): - return self.resp.status_int - - @property - def data(self): - return self.resp.body - - -def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None): - class FakeReplConnection(object): - - def __init__(self, node, partition, hash_, logger): - self.logger = logger - self.node = node - self.partition = partition - self.path = '/%s/%s/%s' % (node['device'], partition, hash_) - self.host = node['replication_ip'] - - def replicate(self, op, *sync_args): - print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args)) - resp = None - if errors and 
op in errors and errors[op]: - resp = errors[op].pop(0) - if not resp: - replicate_args = self.path.lstrip('/').split('/') - args = [op] + deepcopy(list(sync_args)) - with unit.mock_check_drive(isdir=not rpc.mount_check, - ismount=rpc.mount_check): - swob_response = rpc.dispatch(replicate_args, args) - resp = FakeHTTPResponse(swob_response) - if replicate_hook: - replicate_hook(op, *sync_args) - return resp - - return FakeReplConnection - - class ExampleReplicator(db_replicator.Replicator): server_type = 'fake' brokerclass = ExampleBroker diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index bfb83bf871..7abad33ec2 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -2766,6 +2766,53 @@ cluster_dfw1 = http://dfw1.host/v1/ else: self.assertEqual(expected, rv) + def test_config_float_value(self): + for args, expected in ( + ((99, None, None), 99.0), + ((99.01, None, None), 99.01), + (('99', None, None), 99.0), + (('99.01', None, None), 99.01), + ((99, 99, None), 99.0), + ((99.01, 99.01, None), 99.01), + (('99', 99, None), 99.0), + (('99.01', 99.01, None), 99.01), + ((99, None, 99), 99.0), + ((99.01, None, 99.01), 99.01), + (('99', None, 99), 99.0), + (('99.01', None, 99.01), 99.01), + ((-99, -99, -99), -99.0), + ((-99.01, -99.01, -99.01), -99.01), + (('-99', -99, -99), -99.0), + (('-99.01', -99.01, -99.01), -99.01),): + actual = utils.config_float_value(*args) + self.assertEqual(expected, actual) + + for val, minimum in ((99, 100), + ('99', 100), + (-99, -98), + ('-98.01', -98)): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, minimum=minimum) + self.assertIn('greater than %s' % minimum, cm.exception.args[0]) + self.assertNotIn('less than', cm.exception.args[0]) + + for val, maximum in ((99, 98), + ('99', 98), + (-99, -100), + ('-97.9', -98)): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, maximum=maximum) + self.assertIn('less than %s' % maximum, 
cm.exception.args[0]) + self.assertNotIn('greater than', cm.exception.args[0]) + + for val, minimum, maximum in ((99, 99, 98), + ('99', 100, 100), + (99, 98, 98),): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, minimum=minimum, maximum=maximum) + self.assertIn('greater than %s' % minimum, cm.exception.args[0]) + self.assertIn('less than %s' % maximum, cm.exception.args[0]) + def test_config_auto_int_value(self): expectations = { # (value, default) : expected, diff --git a/test/unit/container/test_backend.py b/test/unit/container/test_backend.py index 0069f812e1..79ede02901 100644 --- a/test/unit/container/test_backend.py +++ b/test/unit/container/test_backend.py @@ -2013,6 +2013,75 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['reported_object_count'], 2) self.assertEqual(info['reported_bytes_used'], 1123) + @with_tempdir + def test_remove_objects(self, tempdir): + objects = (('undeleted', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 0, 0), + ('other_policy', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 0, 1), + ('deleted', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 1, 0)) + object_names = [o[0] for o in objects] + + def get_rows(broker): + with broker.get() as conn: + cursor = conn.execute("SELECT * FROM object") + return [r[1] for r in cursor] + + def do_setup(): + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(Timestamp.now().internal, 0) + for obj in objects: + # ensure row order matches put order + broker.put_object(*obj) + broker._commit_puts() + + self.assertEqual(3, broker.get_max_row()) # sanity check + self.assertEqual(object_names, get_rows(broker)) # sanity check + return broker + + broker = do_setup() + broker.remove_objects('', '') + self.assertFalse(get_rows(broker)) + + broker = do_setup() + broker.remove_objects('deleted', '') + 
self.assertEqual([object_names[2]], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', 'deleted', max_row=2) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('deleted', 'un') + self.assertEqual([object_names[0], object_names[2]], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=-1) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=0) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=1) + self.assertEqual(object_names[1:], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=2) + self.assertEqual(object_names[2:], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=3) + self.assertFalse(get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=99) + self.assertFalse(get_rows(broker)) + def test_get_objects(self): broker = ContainerBroker(':memory:', account='a', container='c') broker.initialize(Timestamp('1').internal, 0) diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py new file mode 100644 index 0000000000..353d980bbf --- /dev/null +++ b/test/unit/container/test_sharder.py @@ -0,0 +1,4580 @@ +# Copyright (c) 2010-2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import hashlib +import json +import random + +import eventlet +import os +import shutil +from contextlib import contextmanager +from tempfile import mkdtemp + +import mock +import unittest + +from collections import defaultdict + +import time + +from copy import deepcopy + +from swift.common import internal_client +from swift.container import replicator +from swift.container.backend import ContainerBroker, UNSHARDED, SHARDING, \ + SHARDED, DATADIR +from swift.container.sharder import ContainerSharder, sharding_enabled, \ + CleavingContext, DEFAULT_SHARD_SHRINK_POINT, \ + DEFAULT_SHARD_CONTAINER_THRESHOLD +from swift.common.utils import ShardRange, Timestamp, hash_path, \ + encode_timestamps, parse_db_filename, quorum_size, Everything +from test import annotate_failure + +from test.unit import FakeLogger, debug_logger, FakeRing, \ + make_timestamp_iter, unlink_files, mocked_http_conn, mock_timestamp_now, \ + attach_fake_replication_rpc + + +class BaseTestSharder(unittest.TestCase): + def setUp(self): + self.tempdir = mkdtemp() + self.ts_iter = make_timestamp_iter() + + def tearDown(self): + shutil.rmtree(self.tempdir, ignore_errors=True) + + def _assert_shard_ranges_equal(self, expected, actual): + self.assertEqual([dict(sr) for sr in expected], + [dict(sr) for sr in actual]) + + def _make_broker(self, account='a', container='c', epoch=None, + device='sda', part=0, hash_=None): + hash_ = hash_ or hashlib.md5(container).hexdigest() + datadir = os.path.join( + self.tempdir, device, 'containers', str(part), hash_[-3:], hash_) + if epoch: + filename = '%s_%s.db' % (hash, epoch) + else: + filename = hash_ + '.db' + db_file = os.path.join(datadir, filename) + broker = ContainerBroker( + db_file, account=account, container=container, + logger=debug_logger()) + broker.initialize() + return broker + + def _make_sharding_broker(self, account='a', container='c', + shard_bounds=(('', 'middle'), ('middle', ''))): + broker = self._make_broker(account=account, 
container=container) + broker.set_sharding_sysmeta('Root', 'a/c') + old_db_id = broker.get_info()['id'] + broker.enable_sharding(next(self.ts_iter)) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CLEAVED) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + broker = ContainerBroker(broker.db_file, account='a', container='c') + self.assertNotEqual(old_db_id, broker.get_info()['id']) # sanity check + return broker + + def _make_shard_ranges(self, bounds, state=None, object_count=0): + return [ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), + lower, upper, state=state, + object_count=object_count) + for lower, upper in bounds] + + def ts_encoded(self): + # make a unique timestamp string with multiple timestamps encoded; + # use different deltas between component timestamps + timestamps = [next(self.ts_iter) for i in range(4)] + return encode_timestamps( + timestamps[0], timestamps[1], timestamps[3]) + + +class TestSharder(BaseTestSharder): + def test_init(self): + def do_test(conf, expected): + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient') \ + as mock_ic: + with mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + sharder = ContainerSharder(conf) + mock_ring.assert_called_once_with( + '/etc/swift', ring_name='container') + self.assertEqual( + 'container-sharder', sharder.logger.logger.name) + for k, v in expected.items(): + self.assertTrue(hasattr(sharder, k), 'Missing attr %s' % k) + self.assertEqual(v, getattr(sharder, k), + 'Incorrect value: expected %s=%s but got %s' % + (k, v, getattr(sharder, k))) + return mock_ic + + expected = { + 'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201, + 'per_diff': 1000, 'max_diffs': 100, 'interval': 30, + 'cleave_row_batch_size': 10000, + 'node_timeout': 10, 'conn_timeout': 5, + 'rsync_compress': False, + 
'rsync_module': '{replication_ip}::container', + 'reclaim_age': 86400 * 7, + 'shard_shrink_point': 0.25, + 'shrink_merge_point': 0.75, + 'shard_container_threshold': 10000000, + 'split_size': 5000000, + 'cleave_batch_size': 2, + 'scanner_batch_size': 10, + 'rcache': '/var/cache/swift/container.recon', + 'shards_account_prefix': '.shards_', + 'auto_shard': False, + 'recon_candidates_limit': 5, + 'shard_replication_quorum': 2, + 'existing_shard_replication_quorum': 2 + } + mock_ic = do_test({}, expected) + mock_ic.assert_called_once_with( + '/etc/swift/internal-client.conf', 'Swift Container Sharder', 3, + allow_modify_pipeline=False) + + conf = { + 'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010, + 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'cleave_row_batch_size': 3000, + 'node_timeout': 20, 'conn_timeout': 1, + 'rsync_compress': True, + 'rsync_module': '{replication_ip}::container_sda/', + 'reclaim_age': 86400 * 14, + 'shard_shrink_point': 35, + 'shard_shrink_merge_point': 85, + 'shard_container_threshold': 20000000, + 'cleave_batch_size': 4, + 'shard_scanner_batch_size': 8, + 'request_tries': 2, + 'internal_client_conf_path': '/etc/swift/my-sharder-ic.conf', + 'recon_cache_path': '/var/cache/swift-alt', + 'auto_create_account_prefix': '...', + 'auto_shard': 'yes', + 'recon_candidates_limit': 10, + 'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0 + } + expected = { + 'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010, + 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'cleave_row_batch_size': 3000, + 'node_timeout': 20, 'conn_timeout': 1, + 'rsync_compress': True, + 'rsync_module': '{replication_ip}::container_sda', + 'reclaim_age': 86400 * 14, + 'shard_shrink_point': 0.35, + 'shrink_merge_point': 0.85, + 'shard_container_threshold': 20000000, + 'split_size': 10000000, + 'cleave_batch_size': 4, + 'scanner_batch_size': 8, + 'rcache': '/var/cache/swift-alt/container.recon', + 
'shards_account_prefix': '...shards_', + 'auto_shard': True, + 'recon_candidates_limit': 10, + 'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0 + } + mock_ic = do_test(conf, expected) + mock_ic.assert_called_once_with( + '/etc/swift/my-sharder-ic.conf', 'Swift Container Sharder', 2, + allow_modify_pipeline=False) + + expected.update({'shard_replication_quorum': 3, + 'existing_shard_replication_quorum': 3}) + conf.update({'shard_replication_quorum': 4, + 'existing_shard_replication_quorum': 4}) + do_test(conf, expected) + + with self.assertRaises(ValueError) as cm: + do_test({'shard_shrink_point': 101}, {}) + self.assertIn( + 'greater than 0, less than 100, not "101"', cm.exception.message) + self.assertIn('shard_shrink_point', cm.exception.message) + + with self.assertRaises(ValueError) as cm: + do_test({'shard_shrink_merge_point': 101}, {}) + self.assertIn( + 'greater than 0, less than 100, not "101"', cm.exception.message) + self.assertIn('shard_shrink_merge_point', cm.exception.message) + + def test_init_internal_client_conf_loading_error(self): + with mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + with self.assertRaises(SystemExit) as cm: + ContainerSharder( + {'internal_client_conf_path': + os.path.join(self.tempdir, 'nonexistent')}) + self.assertIn('Unable to load internal client', str(cm.exception)) + + with mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient', + side_effect=Exception('kaboom')): + with self.assertRaises(Exception) as cm: + ContainerSharder({}) + self.assertIn('kaboom', str(cm.exception)) + + def _assert_stats(self, expected, sharder, category): + # assertEqual doesn't work with a defaultdict + stats = 
sharder.stats['sharding'][category] + for k, v in expected.items(): + actual = stats[k] + self.assertEqual( + v, actual, 'Expected %s but got %s for %s in %s' % + (v, actual, k, stats)) + return stats + + def _assert_recon_stats(self, expected, sharder, category): + with open(sharder.rcache, 'rb') as fd: + recon = json.load(fd) + stats = recon['sharding_stats']['sharding'].get(category) + self.assertEqual(expected, stats) + + def test_increment_stats(self): + with self._mock_sharder() as sharder: + sharder._increment_stat('visited', 'success') + sharder._increment_stat('visited', 'success') + sharder._increment_stat('visited', 'failure') + sharder._increment_stat('visited', 'completed') + sharder._increment_stat('cleaved', 'success') + sharder._increment_stat('scanned', 'found', step=4) + expected = {'success': 2, + 'failure': 1, + 'completed': 1} + self._assert_stats(expected, sharder, 'visited') + self._assert_stats({'success': 1}, sharder, 'cleaved') + self._assert_stats({'found': 4}, sharder, 'scanned') + + def test_increment_stats_with_statsd(self): + with self._mock_sharder() as sharder: + sharder._increment_stat('visited', 'success', statsd=True) + sharder._increment_stat('visited', 'success', statsd=True) + sharder._increment_stat('visited', 'failure', statsd=True) + sharder._increment_stat('visited', 'failure', statsd=False) + sharder._increment_stat('visited', 'completed') + expected = {'success': 2, + 'failure': 2, + 'completed': 1} + self._assert_stats(expected, sharder, 'visited') + counts = sharder.logger.get_increment_counts() + self.assertEqual(2, counts.get('visited_success')) + self.assertEqual(1, counts.get('visited_failure')) + self.assertIsNone(counts.get('visited_completed')) + + def test_run_forever(self): + conf = {'recon_cache_path': self.tempdir, + 'devices': self.tempdir} + with self._mock_sharder(conf) as sharder: + sharder._check_node = lambda *args: True + sharder.logger.clear() + brokers = [] + for container in ('c1', 'c2'): + broker 
= self._make_broker( + container=container, hash_=container + 'hash', + device=sharder.ring.devs[0]['device'], part=0) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + brokers.append(broker) + + fake_stats = { + 'scanned': {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': 99, 'max_time': 123}, + 'created': {'attempted': 1, 'success': 1, 'failure': 1}, + 'cleaved': {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': 0.01, 'max_time': 1.3}, + 'misplaced': {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 1, 'unplaced': 0}, + 'audit_root': {'attempted': 5, 'success': 4, 'failure': 1}, + 'audit_shard': {'attempted': 2, 'success': 2, 'failure': 0}, + } + # NB these are time increments not absolute times... + fake_periods = [1, 2, 3, 3600, 4, 15, 15, 0] + fake_periods_iter = iter(fake_periods) + recon_data = [] + fake_process_broker_calls = [] + + def mock_dump_recon_cache(data, *args): + recon_data.append(deepcopy(data)) + + with mock.patch('swift.container.sharder.time.time') as fake_time: + def fake_process_broker(broker, *args, **kwargs): + # increment time and inject some fake stats + fake_process_broker_calls.append((broker, args, kwargs)) + try: + fake_time.return_value += next(fake_periods_iter) + except StopIteration: + # bail out + fake_time.side_effect = Exception('Test over') + sharder.stats['sharding'].update(fake_stats) + + with mock.patch( + 'swift.container.sharder.time.sleep') as mock_sleep: + with mock.patch( + 'swift.container.sharder.is_sharding_candidate', + return_value=True): + with mock.patch( + 'swift.container.sharder.dump_recon_cache', + mock_dump_recon_cache): + fake_time.return_value = next(fake_periods_iter) + sharder._is_sharding_candidate = lambda x: True + sharder._process_broker = fake_process_broker + with self.assertRaises(Exception) as cm: + sharder.run_forever() + + self.assertEqual('Test over', cm.exception.message) + # four cycles 
are started, two brokers visited per cycle, but + # fourth never completes + self.assertEqual(8, len(fake_process_broker_calls)) + # expect initial random sleep then one sleep between first and + # second pass + self.assertEqual(2, mock_sleep.call_count) + self.assertLessEqual(mock_sleep.call_args_list[0][0][0], 30) + self.assertLessEqual(mock_sleep.call_args_list[1][0][0], + 30 - fake_periods[0]) + + lines = sharder.logger.get_lines_for_level('info') + categories = ('visited', 'scanned', 'created', 'cleaved', + 'misplaced', 'audit_root', 'audit_shard') + + def check_categories(start_time): + for category in categories: + line = lines.pop(0) + self.assertIn('Since %s' % time.ctime(start_time), line) + self.assertIn(category, line) + for k, v in fake_stats.get(category, {}).items(): + self.assertIn('%s:%s' % (k, v), line) + + def check_logs(cycle_time, start_time, + expect_periodic_stats=False): + self.assertIn('Container sharder cycle starting', lines.pop(0)) + check_categories(start_time) + if expect_periodic_stats: + check_categories(start_time) + self.assertIn('Container sharder cycle completed: %.02fs' % + cycle_time, lines.pop(0)) + + check_logs(sum(fake_periods[1:3]), fake_periods[0]) + check_logs(sum(fake_periods[3:5]), sum(fake_periods[:3]), + expect_periodic_stats=True) + check_logs(sum(fake_periods[5:7]), sum(fake_periods[:5])) + # final cycle start but then exception pops to terminate test + self.assertIn('Container sharder cycle starting', lines.pop(0)) + self.assertFalse(lines) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn( + 'Unhandled exception while dumping progress', lines[0]) + self.assertIn('Test over', lines[0]) + + def check_recon(data, time, last, expected_stats): + self.assertEqual(time, data['sharding_time']) + self.assertEqual(last, data['sharding_last']) + self.assertEqual( + expected_stats, dict(data['sharding_stats']['sharding'])) + + def stats_for_candidate(broker): + return {'object_count': 0, + 'account': 
broker.account, + 'meta_timestamp': mock.ANY, + 'container': broker.container, + 'file_size': os.stat(broker.db_file).st_size, + 'path': broker.db_file, + 'root': broker.path, + 'node_index': 0} + + self.assertEqual(4, len(recon_data)) + # stats report at end of first cycle + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[:2]] + } + }) + check_recon(recon_data[0], sum(fake_periods[1:3]), + sum(fake_periods[:3]), fake_stats) + # periodic stats report after first broker has been visited during + # second cycle - one candidate identified so far this cycle + fake_stats.update({'visited': {'attempted': 1, 'skipped': 0, + 'success': 1, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 1, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[2:3]] + } + }) + check_recon(recon_data[1], fake_periods[3], + sum(fake_periods[:4]), fake_stats) + # stats report at end of second cycle - both candidates reported + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[2:4]] + } + }) + check_recon(recon_data[2], sum(fake_periods[3:5]), + sum(fake_periods[:5]), fake_stats) + # stats report at end of third cycle + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[4:6]] + } + }) + check_recon(recon_data[3], sum(fake_periods[5:7]), + sum(fake_periods[:7]), fake_stats) + + def test_one_shard_cycle(self): + conf = 
{'recon_cache_path': self.tempdir, + 'devices': self.tempdir, + 'shard_container_threshold': 9} + with self._mock_sharder(conf) as sharder: + sharder._check_node = lambda *args: True + sharder.reported = time.time() + sharder.logger = debug_logger() + brokers = [] + device_ids = set(range(3)) + for device_id in device_ids: + brokers.append(self._make_broker( + container='c%s' % device_id, hash_='c%shash' % device_id, + device=sharder.ring.devs[device_id]['device'], part=0)) + # enable a/c2 and a/c3 for sharding + for broker in brokers[1:]: + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + # make a/c2 a candidate for sharding + for i in range(10): + brokers[1].put_object('o%s' % i, next(self.ts_iter).internal, + 0, 'text/plain', 'etag', 0) + + # check only sharding enabled containers are processed + with mock.patch.object( + sharder, '_process_broker' + ) as mock_process_broker: + sharder._local_device_ids = {'stale_node_id'} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(2, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c1', 'a/c2'}, set(processed_paths)) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + expected_stats = {'attempted': 2, 'success': 2, 'failure': 0, + 'skipped': 1, 'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + expected_candidate_stats = { + 'found': 1, + 'top': [{'object_count': 10, 'account': 'a', 'container': 'c1', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[1].db_file, 'root': 'a/c1', + 'node_index': 1}]} + self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + self._assert_recon_stats(None, sharder, 'sharding_progress') + + # enable and progress container a/c1 by giving it shard 
ranges + now = next(self.ts_iter) + brokers[0].merge_shard_ranges( + [ShardRange('a/c0', now, '', '', state=ShardRange.SHARDING), + ShardRange('.s_a/1', now, '', 'b', state=ShardRange.ACTIVE), + ShardRange('.s_a/2', now, 'b', 'c', state=ShardRange.CLEAVED), + ShardRange('.s_a/3', now, 'c', 'd', state=ShardRange.CREATED), + ShardRange('.s_a/4', now, 'd', 'e', state=ShardRange.CREATED), + ShardRange('.s_a/5', now, 'e', '', state=ShardRange.FOUND)]) + brokers[1].merge_shard_ranges( + [ShardRange('a/c1', now, '', '', state=ShardRange.SHARDING), + ShardRange('.s_a/6', now, '', 'b', state=ShardRange.ACTIVE), + ShardRange('.s_a/7', now, 'b', 'c', state=ShardRange.ACTIVE), + ShardRange('.s_a/8', now, 'c', 'd', state=ShardRange.CLEAVED), + ShardRange('.s_a/9', now, 'd', 'e', state=ShardRange.CREATED), + ShardRange('.s_a/0', now, 'e', '', state=ShardRange.CREATED)]) + for i in range(11): + brokers[2].put_object('o%s' % i, next(self.ts_iter).internal, + 0, 'text/plain', 'etag', 0) + + def mock_processing(broker, node, part): + if broker.path == 'a/c1': + raise Exception('kapow!') + elif broker.path not in ('a/c0', 'a/c2'): + raise BaseException("I don't know how to handle a broker " + "for %s" % broker.path) + + # check exceptions are handled + with mock.patch.object( + sharder, '_process_broker', side_effect=mock_processing + ) as mock_process_broker: + sharder._local_device_ids = {'stale_node_id'} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(3, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c0', 'a/c1', 'a/c2'}, set(processed_paths)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('Unhandled exception while processing', lines[0]) + self.assertFalse(lines[1:]) + sharder.logger.clear() + expected_stats = {'attempted': 3, 'success': 2, 'failure': 1, + 'skipped': 0, 
'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + expected_candidate_stats = { + 'found': 1, + 'top': [{'object_count': 11, 'account': 'a', 'container': 'c2', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[2].db_file, 'root': 'a/c2', + 'node_index': 2}]} + self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + expected_in_progress_stats = { + 'all': [{'object_count': 0, 'account': 'a', 'container': 'c0', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[0].db_file).st_size, + 'path': brokers[0].db_file, 'root': 'a/c0', + 'node_index': 0, + 'found': 1, 'created': 2, 'cleaved': 1, 'active': 1, + 'state': 'sharding', 'db_state': 'unsharded', + 'error': None}, + {'object_count': 10, 'account': 'a', 'container': 'c1', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[1].db_file, 'root': 'a/c1', + 'node_index': 1, + 'found': 0, 'created': 2, 'cleaved': 1, 'active': 2, + 'state': 'sharding', 'db_state': 'unsharded', + 'error': 'kapow!'}]} + self._assert_stats( + expected_in_progress_stats, sharder, 'sharding_in_progress') + + # check that candidates and in progress stats don't stick in recon + own_shard_range = brokers[0].get_own_shard_range() + own_shard_range.state = ShardRange.ACTIVE + brokers[0].merge_shard_ranges([own_shard_range]) + for i in range(10): + brokers[1].delete_object( + 'o%s' % i, next(self.ts_iter).internal) + with mock.patch.object( + sharder, '_process_broker' + ) as mock_process_broker: + sharder._local_device_ids = {999} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(3, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c0', 'a/c1', 'a/c2'}, set(processed_paths)) + 
self.assertFalse(sharder.logger.get_lines_for_level('error')) + expected_stats = {'attempted': 3, 'success': 3, 'failure': 0, + 'skipped': 0, 'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + self._assert_recon_stats(None, sharder, 'sharding_progress') + + @contextmanager + def _mock_sharder(self, conf=None, replicas=3): + conf = conf or {} + conf['devices'] = self.tempdir + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient'): + with mock.patch( + 'swift.common.db_replicator.ring.Ring', + lambda *args, **kwargs: FakeRing(replicas=replicas)): + sharder = ContainerSharder(conf, logger=FakeLogger()) + sharder._local_device_ids = {0, 1, 2} + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True] * sharder.ring.replica_count)) + yield sharder + + def _get_raw_object_records(self, broker): + # use list_objects_iter with no-op transform_func to get back actual + # un-transformed rows with encoded timestamps + return [list(obj) for obj in broker.list_objects_iter( + 10, '', '', '', '', include_deleted=None, all_policies=True, + transform_func=lambda record: record)] + + def _check_objects(self, expected_objs, shard_db): + shard_broker = ContainerBroker(shard_db) + shard_objs = self._get_raw_object_records(shard_broker) + expected_objs = [list(obj) for obj in expected_objs] + self.assertEqual(expected_objs, shard_objs) + + def _check_shard_range(self, expected, actual): + expected_dict = dict(expected) + actual_dict = dict(actual) + self.assertGreater(actual_dict.pop('meta_timestamp'), + expected_dict.pop('meta_timestamp')) + self.assertEqual(expected_dict, actual_dict) + + def test_fetch_shard_ranges_unexpected_response(self): + broker = self._make_broker() + exc = internal_client.UnexpectedResponse( + 'Unexpected response: 404', None) + with self._mock_sharder() as sharder: + 
sharder.int_client.make_request.side_effect = exc + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Unexpected response: 404', lines[0]) + self.assertFalse(lines[1:]) + + def test_fetch_shard_ranges_bad_record_type(self): + def do_test(mock_resp_headers): + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers)) + sharder.int_client.make_request = mock_make_request + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('unexpected record type', lines[0]) + self.assertFalse(lines[1:]) + + broker = self._make_broker() + do_test({}) + do_test({'x-backend-record-type': 'object'}) + do_test({'x-backend-record-type': 'disco'}) + + def test_fetch_shard_ranges_bad_data(self): + def do_test(mock_resp_body): + mock_resp_headers = {'x-backend-record-type': 'shard'} + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers, + body=mock_resp_body)) + sharder.int_client.make_request = mock_make_request + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('invalid data', lines[0]) + self.assertFalse(lines[1:]) + + broker = self._make_broker() + do_test({}) + do_test('') + do_test(json.dumps({})) + do_test(json.dumps([{'account': 'a', 'container': 'c'}])) + + def test_fetch_shard_ranges_ok(self): + def do_test(mock_resp_body, params): + mock_resp_headers = {'x-backend-record-type': 'shard'} + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers, + body=mock_resp_body)) + sharder.int_client.make_request = mock_make_request + mock_make_path = mock.MagicMock(return_value='/v1/a/c') + sharder.int_client.make_path = mock_make_path + 
actual = sharder._fetch_shard_ranges(broker, params=params) + sharder.int_client.make_path.assert_called_once_with('a', 'c') + self.assertFalse(sharder.logger.get_lines_for_level('error')) + return actual, mock_make_request + + expected_headers = {'X-Backend-Record-Type': 'shard', + 'X-Backend-Include-Deleted': 'False', + 'X-Backend-Override-Deleted': 'true'} + broker = self._make_broker() + shard_ranges = self._make_shard_ranges((('', 'm'), ('m', ''))) + + params = {'format': 'json'} + actual, mock_call = do_test(json.dumps([dict(shard_ranges[0])]), + params={}) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + self._assert_shard_ranges_equal([shard_ranges[0]], actual) + + params = {'format': 'json', 'includes': 'thing'} + actual, mock_call = do_test( + json.dumps([dict(sr) for sr in shard_ranges]), params=params) + self._assert_shard_ranges_equal(shard_ranges, actual) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + params = {'format': 'json', 'end_marker': 'there', 'marker': 'here'} + actual, mock_call = do_test(json.dumps([]), params=params) + self._assert_shard_ranges_equal([], actual) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + def _check_cleave_root(self, conf=None): + broker = self._make_broker() + objects = [ + # shard 0 + ('a', self.ts_encoded(), 10, 'text/plain', 'etag_a', 0, 0), + ('here', self.ts_encoded(), 10, 'text/plain', 'etag_here', 0, 0), + # shard 1 + ('m', self.ts_encoded(), 1, 'text/plain', 'etag_m', 0, 0), + ('n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0), + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + # shard 2 + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + # shard 3 + ('x', self.ts_encoded(), 0, '', '', 1, 0), # deleted + ('y', self.ts_encoded(), 1000, 
'text/plain', 'etag_y', 0, 0), + # shard 4 + ('yyyy', self.ts_encoded(), 14, 'text/plain', 'etag_yyyy', 0, 0), + ] + for obj in objects: + broker.put_object(*obj) + initial_root_info = broker.get_info() + broker.enable_sharding(Timestamp.now()) + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + shard_ranges = self._make_shard_ranges(shard_bounds) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # used to accumulate stats from sharded dbs + total_shard_stats = {'object_count': 0, 'bytes_used': 0} + # run cleave - no shard ranges, nothing happens + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(0, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + self.assertEqual(UNSHARDED, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + for db in expected_shard_dbs: + with annotate_failure(db): + self.assertFalse(os.path.exists(db)) + + # run cleave - all shard ranges in found state, nothing happens + broker.merge_shard_ranges(shard_ranges[:4]) + self.assertTrue(broker.set_sharding_state()) + + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(0, 
context.ranges_done) + self.assertEqual(4, context.ranges_todo) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + for db in expected_shard_dbs: + with annotate_failure(db): + self.assertFalse(os.path.exists(db)) + for shard_range in broker.get_shard_ranges(): + with annotate_failure(shard_range): + self.assertEqual(ShardRange.FOUND, shard_range.state) + + # move first shard range to created state, first shard range is cleaved + shard_ranges[0].update_state(ShardRange.CREATED) + broker.merge_shard_ranges(shard_ranges[:1]) + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + # update expected state and metadata, check cleaved shard range + shard_ranges[0].bytes_used = 20 + shard_ranges[0].object_count = 2 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:2], expected_shard_dbs[0]) + # other shard ranges should be unchanged + for i in range(1, len(shard_ranges)): + with annotate_failure(i): + 
self.assertFalse(os.path.exists(expected_shard_dbs[i])) + for i in range(1, len(updated_shard_ranges)): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_ranges[i])) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('here', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(1, context.ranges_done) + self.assertEqual(3, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # move more shard ranges to created state + for i in range(1, 4): + shard_ranges[i].update_state(ShardRange.CREATED) + broker.merge_shard_ranges(shard_ranges[1:4]) + + # replication of next shard range is not sufficiently successful + with self._mock_sharder(conf=conf) as sharder: + quorum = quorum_size(sharder.ring.replica_count) + successes = [True] * (quorum - 1) + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses = successes + fails + random.shuffle(responses) + sharder._replicate_object = mock.MagicMock( + side_effect=((False, responses),)) + self.assertFalse(sharder._cleave(broker)) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + + # cleaving state is unchanged + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + for i in range(1, len(updated_shard_ranges)): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_ranges[i])) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('here', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(1, context.ranges_done) + self.assertEqual(3, context.ranges_todo) + + # try again, this time replication is sufficiently successful + with 
self._mock_sharder(conf=conf) as sharder: + successes = [True] * quorum + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses1 = successes + fails + responses2 = fails + successes + sharder._replicate_object = mock.MagicMock( + side_effect=((False, responses1), (False, responses2))) + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 2, 'success': 2, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[1:3]] + ) + for db in expected_shard_dbs[1:3]: + shard_broker = ContainerBroker(db) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + + # only 2 are cleaved per batch + # update expected state and metadata, check cleaved shard ranges + shard_ranges[1].bytes_used = 6 + shard_ranges[1].object_count = 3 + shard_ranges[1].state = ShardRange.CLEAVED + shard_ranges[2].bytes_used = 100 + shard_ranges[2].object_count = 1 + shard_ranges[2].state = ShardRange.CLEAVED + for i in range(0, 3): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + self._check_objects(objects[2:5], expected_shard_dbs[1]) + self._check_objects(objects[5:6], expected_shard_dbs[2]) + # other shard ranges should be unchanged + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + for i, db in 
enumerate(expected_shard_dbs[3:], 3): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + for i, updated_shard_range in enumerate(updated_shard_ranges[3:], 3): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_range)) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('where', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(3, context.ranges_done) + self.assertEqual(1, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # run cleave again - should process the fourth range + with self._mock_sharder(conf=conf) as sharder: + sharder.logger = debug_logger() + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + shard_broker = ContainerBroker(expected_shard_dbs[3]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + + shard_ranges[3].bytes_used = 1000 + shard_ranges[3].object_count = 1 + shard_ranges[3].state = ShardRange.CLEAVED + for i in range(0, 4): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + # NB includes the 
deleted object + self._check_objects(objects[6:8], expected_shard_dbs[3]) + # other shard ranges should be unchanged + for i, db in enumerate(expected_shard_dbs[:3]): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + for i, updated_shard_range in enumerate(updated_shard_ranges[4:], 4): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_range)) + + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('yonder', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(4, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # run cleave - should be a no-op, all existing ranges have been cleaved + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + + # add final shard range - move this to ACTIVE state and update stats to + # simulate another replica having cleaved it and replicated its state + shard_ranges[4].update_state(ShardRange.ACTIVE) + shard_ranges[4].update_meta(2, 15) + broker.merge_shard_ranges(shard_ranges[4:]) + + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[4], 0) + shard_broker = 
ContainerBroker(expected_shard_dbs[4]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.ACTIVE, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(5, len(updated_shard_ranges)) + # NB stats of the ACTIVE shard range should not be reset by cleaving + for i in range(0, 4): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + self.assertEqual(dict(shard_ranges[4]), dict(updated_shard_ranges[4])) + + # object copied to shard + self._check_objects(objects[8:], expected_shard_dbs[4]) + # other shard ranges should be unchanged + for i, db in enumerate(expected_shard_dbs[:4]): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + + self.assertEqual(initial_root_info['object_count'], + total_shard_stats['object_count']) + self.assertEqual(initial_root_info['bytes_used'], + total_shard_stats['bytes_used']) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(5, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + sharder._replicate_object.assert_not_called() + + self.assertTrue(broker.set_sharded_state()) + # run cleave - should be a no-op + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + + sharder._replicate_object.assert_not_called() + + def test_cleave_root(self): + self._check_cleave_root() + + def test_cleave_root_listing_limit_one(self): + # force yield_objects to update its marker and call to the broker's + # 
get_objects() for each shard range, to check the marker moves on + self._check_cleave_root(conf={'cleave_row_batch_size': 1}) + + def test_cleave_root_ranges_change(self): + # verify that objects are not missed if shard ranges change between + # cleaving batches + broker = self._make_broker() + objects = [ + ('a', self.ts_encoded(), 10, 'text/plain', 'etag_a', 0, 0), + ('b', self.ts_encoded(), 10, 'text/plain', 'etag_b', 0, 0), + ('c', self.ts_encoded(), 1, 'text/plain', 'etag_c', 0, 0), + ('d', self.ts_encoded(), 2, 'text/plain', 'etag_d', 0, 0), + ('e', self.ts_encoded(), 3, 'text/plain', 'etag_e', 0, 0), + ('f', self.ts_encoded(), 100, 'text/plain', 'etag_f', 0, 0), + ('x', self.ts_encoded(), 0, '', '', 1, 0), # deleted + ('z', self.ts_encoded(), 1000, 'text/plain', 'etag_z', 0, 0) + ] + for obj in objects: + broker.put_object(*obj) + broker.enable_sharding(Timestamp.now()) + + shard_bounds = (('', 'd'), ('d', 'x'), ('x', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + broker.merge_shard_ranges(shard_ranges[:3]) + self.assertTrue(broker.set_sharding_state()) + + # run cleave - first batch is cleaved + with self._mock_sharder() as sharder: + self.assertFalse(sharder._cleave(broker)) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(8, context.cleave_to_row) + self.assertEqual(8, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[:2]] + ) + + updated_shard_ranges = broker.get_shard_ranges() + 
self.assertEqual(3, len(updated_shard_ranges)) + + # first 2 shard ranges should have updated object count, bytes used and + # meta_timestamp + shard_ranges[0].bytes_used = 23 + shard_ranges[0].object_count = 4 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + shard_ranges[1].bytes_used = 103 + shard_ranges[1].object_count = 2 + shard_ranges[1].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(objects[:4], expected_shard_dbs[0]) + self._check_objects(objects[4:7], expected_shard_dbs[1]) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # third shard range should be unchanged - not yet cleaved + self.assertEqual(dict(shard_ranges[2]), + dict(updated_shard_ranges[2])) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(8, context.cleave_to_row) + self.assertEqual(8, context.max_row) + + # now change the shard ranges so that third consumes second + shard_ranges[1].set_deleted() + shard_ranges[2].lower = 'd' + shard_ranges[2].timestamp = Timestamp.now() + + broker.merge_shard_ranges(shard_ranges[1:3]) + + # run cleave - should process the extended third (final) range + with self._mock_sharder() as sharder: + self.assertTrue(sharder._cleave(broker)) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[2], 0) + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + # third shard range should now have updated object count, bytes used, + # including objects previously in the second shard range + shard_ranges[2].bytes_used = 1103 + shard_ranges[2].object_count = 3 + shard_ranges[2].state = 
ShardRange.CLEAVED + self._check_shard_range(shard_ranges[2], updated_shard_ranges[1]) + self._check_objects(objects[4:8], expected_shard_dbs[2]) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual(str(shard_ranges[2].upper), context.cursor) + self.assertEqual(8, context.cleave_to_row) + self.assertEqual(8, context.max_row) + + def test_cleave_shard(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + own_shard_range = ShardRange( + broker.path, Timestamp.now(), 'here', 'where', + state=ShardRange.SHARDING, epoch=Timestamp.now()) + broker.merge_shard_ranges([own_shard_range]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertFalse(broker.is_root_container()) # sanity check + + objects = [ + ('m', self.ts_encoded(), 1, 'text/plain', 'etag_m', 0, 0), + ('n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0), + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + ] + misplaced_objects = [ + ('a', self.ts_encoded(), 1, 'text/plain', 'etag_a', 0, 0), + ('z', self.ts_encoded(), 100, 'text/plain', 'etag_z', 1, 0), + ] + for obj in objects + misplaced_objects: + broker.put_object(*obj) + + shard_bounds = (('here', 'there'), + ('there', 'where')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + misplaced_bounds = (('', 'here'), + ('where', '')) + misplaced_ranges = self._make_shard_ranges( + misplaced_bounds, state=ShardRange.ACTIVE) + misplaced_dbs = [] + for shard_range in misplaced_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + 
misplaced_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + + # run cleave - first range is cleaved but move of misplaced objects is + # not successful + sharder_conf = {'cleave_batch_size': 1} + with self._mock_sharder(sharder_conf) as sharder: + with mock.patch.object( + sharder, '_make_shard_range_fetcher', + return_value=lambda: iter(misplaced_ranges)): + # cause misplaced objects replication to not succeed + quorum = quorum_size(sharder.ring.replica_count) + successes = [True] * (quorum - 1) + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses = successes + fails + random.shuffle(responses) + bad_result = (False, responses) + ok_result = (True, [True] * sharder.ring.replica_count) + sharder._replicate_object = mock.MagicMock( + # result for misplaced, misplaced, cleave + side_effect=(bad_result, ok_result, ok_result)) + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertFalse(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[0].upper), context.cursor) + self.assertEqual(6, context.cleave_to_row) + self.assertEqual(6, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, misplaced_dbs[0], 0), + mock.call(0, misplaced_dbs[1], 0), + mock.call(0, expected_shard_dbs[0], 0)]) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + # NB cleaving a shard, state goes to CLEAVED not ACTIVE + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + + # first shard range should have updated object count, bytes used and + # meta_timestamp + shard_ranges[0].bytes_used = 
6 + shard_ranges[0].object_count = 3 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:3], expected_shard_dbs[0]) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self._check_objects(misplaced_objects[:1], misplaced_dbs[0]) + self._check_objects(misplaced_objects[1:], misplaced_dbs[1]) + unlink_files(expected_shard_dbs) + unlink_files(misplaced_dbs) + + # run cleave - second (final) range is cleaved; move this range to + # CLEAVED state and update stats to simulate another replica having + # cleaved it and replicated its state + shard_ranges[1].update_state(ShardRange.CLEAVED) + shard_ranges[1].update_meta(2, 15) + broker.merge_shard_ranges(shard_ranges[1:2]) + with self._mock_sharder(sharder_conf) as sharder: + with mock.patch.object( + sharder, '_make_shard_range_fetcher', + return_value=lambda: iter(misplaced_ranges)): + self.assertTrue(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(6, context.cleave_to_row) + self.assertEqual(6, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, misplaced_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + + # second shard range should keep the object count, bytes used and + # meta_timestamp merged above; cleaving must not reset them + self.assertEqual(dict(shard_ranges[1]), dict(updated_shard_ranges[1])) + self._check_objects(objects[3:], expected_shard_dbs[1]) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + 
self._check_objects(misplaced_objects[:1], misplaced_dbs[0]) + self.assertFalse(os.path.exists(misplaced_dbs[1])) + + def test_cleave_shard_shrinking(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + own_shard_range = ShardRange( + broker.path, next(self.ts_iter), 'here', 'where', + state=ShardRange.SHRINKING, epoch=next(self.ts_iter)) + broker.merge_shard_ranges([own_shard_range]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertFalse(broker.is_root_container()) # sanity check + + objects = [ + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + ] + for obj in objects: + broker.put_object(*obj) + acceptor_epoch = next(self.ts_iter) + acceptor = ShardRange('.shards_a/acceptor', Timestamp.now(), + 'here', 'yonder', '1000', '11111', + state=ShardRange.ACTIVE, epoch=acceptor_epoch) + db_hash = hash_path(acceptor.account, acceptor.container) + # NB expected cleave db includes acceptor epoch + expected_shard_db = os.path.join( + self.tempdir, 'sda', 'containers', '0', db_hash[-3:], db_hash, + '%s_%s.db' % (db_hash, acceptor_epoch.internal)) + + broker.merge_shard_ranges([acceptor]) + broker.set_sharding_state() + + # run cleave + with self._mock_sharder() as sharder: + self.assertTrue(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual(str(acceptor.upper), context.cursor) + self.assertEqual(2, context.cleave_to_row) + self.assertEqual(2, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_db, 0)]) + shard_broker = ContainerBroker(expected_shard_db) + # NB when cleaving a shard container to a larger acceptor namespace + # then expect the shard broker's own shard range to reflect that of the + # acceptor shard range rather than being 
set to CLEAVED. + self.assertEqual( + ShardRange.ACTIVE, shard_broker.get_own_shard_range().state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(1, len(updated_shard_ranges)) + self.assertEqual(dict(acceptor), dict(updated_shard_ranges[0])) + + # shard range should have unmodified acceptor, bytes used and + # meta_timestamp + self._check_objects(objects, expected_shard_db) + + def test_cleave_repeated(self): + # verify that if new objects are merged into retiring db after cleaving + # started then cleaving will repeat but only new objects are cleaved + # in the repeated cleaving pass + broker = self._make_broker() + objects = [ + ('obj%03d' % i, next(self.ts_iter), 1, 'text/plain', 'etag', 0, 0) + for i in range(10) + ] + new_objects = [ + (name, next(self.ts_iter), 1, 'text/plain', 'etag', 0, 0) + for name in ('alpha', 'zeta') + ] + for obj in objects: + broker.put_object(*obj) + broker._commit_puts() + broker.enable_sharding(Timestamp.now()) + shard_bounds = (('', 'obj004'), ('obj004', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + old_broker = broker.get_brokers()[0] + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + + calls = [] + key = ('name', 'created_at', 'size', 'content_type', 'etag', 'deleted') + + def mock_replicate_object(part, db, node_id): + # merge new objects between cleave of first and second shard ranges + if not calls: + old_broker.merge_items( + [dict(zip(key, obj)) for obj in new_objects]) + calls.append((part, db, node_id)) + return True, [True, True, True] + + with self._mock_sharder() as sharder: + 
sharder._audit_container = mock.MagicMock() + sharder._replicate_object = mock_replicate_object + sharder._process_broker(broker, node, 99) + + # sanity check - the new objects merged into the old db + self.assertFalse(broker.get_objects()) + self.assertEqual(12, len(old_broker.get_objects())) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual([(0, expected_shard_dbs[0], 0), + (0, expected_shard_dbs[1], 0)], calls) + + # check shard ranges were updated to CLEAVED + updated_shard_ranges = broker.get_shard_ranges() + # 'alpha' was not in table when first shard was cleaved + shard_ranges[0].bytes_used = 5 + shard_ranges[0].object_count = 5 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:5], expected_shard_dbs[0]) + # 'zeta' was in table when second shard was cleaved + shard_ranges[1].bytes_used = 6 + shard_ranges[1].object_count = 6 + shard_ranges[1].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(objects[5:] + new_objects[1:], + expected_shard_dbs[1]) + + context = CleavingContext.load(broker) + self.assertFalse(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(10, context.cleave_to_row) + self.assertEqual(12, context.max_row) # note that max row increased + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Repeat cleaving required', lines[0]) + self.assertFalse(lines[1:]) + unlink_files(expected_shard_dbs) + + # repeat the cleaving - the newer objects get cleaved + with self._mock_sharder() as sharder: + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + # this time the sharding completed + self.assertEqual(SHARDED, broker.get_db_state()) + 
self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + + # shard ranges are now ACTIVE - stats not updated by cleaving + updated_shard_ranges = broker.get_shard_ranges() + shard_ranges[0].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(new_objects[:1], expected_shard_dbs[0]) + # both new objects are included in repeat cleaving but no older objects + shard_ranges[1].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(new_objects[1:], expected_shard_dbs[1]) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + def test_cleave_multiple_storage_policies(self): + # verify that objects in all storage policies are cleaved + broker = self._make_broker() + # add objects in multiple policies + objects = [{'name': 'obj_%03d' % i, + 'created_at': Timestamp.now().normal, + 'content_type': 'text/plain', + 'etag': 'etag_%d' % i, + 'size': 1024 * i, + 'deleted': i % 2, + 'storage_policy_index': i % 2, + } for i in range(1, 8)] + # merge_items mutates items + broker.merge_items([dict(obj) for obj in objects]) + broker.enable_sharding(Timestamp.now()) + shard_ranges = self._make_shard_ranges( + (('', 'obj_004'), ('obj_004', '')), state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + + with self._mock_sharder() as sharder: + sharder._audit_container = mock.MagicMock() + 
sharder._process_broker(broker, node, 99) + + # check shard ranges were updated to ACTIVE + self.assertEqual([ShardRange.ACTIVE] * 2, + [sr.state for sr in broker.get_shard_ranges()]) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + actual_objects = shard_broker.get_objects() + self.assertEqual(objects[:4], actual_objects) + + shard_broker = ContainerBroker(expected_shard_dbs[1]) + actual_objects = shard_broker.get_objects() + self.assertEqual(objects[4:], actual_objects) + + def test_cleave_insufficient_replication(self): + # verify that if replication of a cleaved shard range fails then rows + # are not merged again to the existing shard db + broker = self._make_broker() + retiring_db_id = broker.get_info()['id'] + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + broker.merge_items([dict(obj) for obj in objects]) + broker._commit_puts() + broker.enable_sharding(Timestamp.now()) + shard_bounds = (('', 'obj004'), ('obj004', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + new_object = {'name': 'alpha', 'created_at': next(self.ts_iter), + 'size': 0, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + broker.merge_items([dict(new_object)]) + + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + orig_merge_items = ContainerBroker.merge_items + + def mock_merge_items(broker, items): + merge_items_calls.append((broker.path, + # merge mutates item so make a copy + 
[dict(item) for item in items])) + orig_merge_items(broker, items) + + # first shard range cleaved but fails to replicate + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(False, [False, False, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + # first shard range cleaved to shard broker + self.assertEqual([(shard_ranges[0].name, objects[:5])], + merge_items_calls) + # replication of first shard range fails - no more shards attempted + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + # shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[:5], shard_broker.get_objects()) + + # first shard range replicates ok, no new merges required, second is + # cleaved but fails to replicate + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items), self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, True, True]), + (False, [False, False, True])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + + broker_shard_ranges = broker.get_shard_ranges() + shard_ranges[0].object_count = 5 + shard_ranges[0].bytes_used = sum(obj['size'] for obj in objects[:5]) + shard_ranges[0].state 
= ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], broker_shard_ranges[0]) + # second shard range still in created state + self._assert_shard_ranges_equal([shard_ranges[1]], + [broker_shard_ranges[1]]) + # only second shard range rows were merged to shard db + self.assertEqual([(shard_ranges[1].name, objects[5:])], + merge_items_calls) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + # shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + # repeat - second shard range cleaves fully because its previously + # cleaved shard db no longer exists + unlink_files(expected_shard_dbs) + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(True, [True, True, True]), # misplaced obj + (False, [False, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + + broker_shard_ranges = broker.get_shard_ranges() + shard_ranges[1].object_count = 5 + shard_ranges[1].bytes_used = sum(obj['size'] for obj in objects[5:]) + shard_ranges[1].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[1], broker_shard_ranges[1]) + # second shard range rows were merged to shard db again + self.assertEqual([(shard_ranges[0].name, [new_object]), + (shard_ranges[1].name, objects[5:])], + merge_items_calls) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, 
expected_shard_dbs[1], 0)]) + # first shard broker was created by misplaced object - no sync point + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertFalse(shard_broker.get_syncs()) + self.assertEqual([new_object], shard_broker.get_objects()) + # second shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + def test_shard_replication_quorum_failures(self): + broker = self._make_broker() + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + broker.merge_items([dict(obj) for obj in objects]) + broker._commit_puts() + shard_bounds = (('', 'obj002'), ('obj002', 'obj004'), + ('obj004', 'obj006'), ('obj006', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.enable_sharding(Timestamp.now()) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + with self._mock_sharder({'shard_replication_quorum': 3}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, True, True]), + (False, [False, False, True])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + # replication of first shard range fails - no more shards attempted + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + 
broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + self.assertEqual([ShardRange.CREATED] * 4, + [sr.state for sr in broker.get_shard_ranges()]) + + # and again with a chilled out quorum, so cleaving moves onto second + # shard range which fails to reach even chilled quorum + with self._mock_sharder({'shard_replication_quorum': 1}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, True]), + (False, [False, False, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual(sharder._replicate_object.call_args_list, [ + mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0), + ]) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CREATED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # now pretend another node successfully cleaved the second shard range, + # but this node still fails to replicate so still cannot move on + shard_ranges[1].update_state(ShardRange.CLEAVED) + broker.merge_shard_ranges(shard_ranges[1]) + with self._mock_sharder({'shard_replication_quorum': 1}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # until a super-chilled quorum is used - but even then there 
must have + # been an attempt to replicate + with self._mock_sharder( + {'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [])]) # maybe shard db was deleted + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # next pass - the second shard replication is attempted and fails, but + # that's ok because another node has cleaved it and + # existing_shard_replication_quorum is zero + with self._mock_sharder( + {'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, False]), + (False, [False, True, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual(sharder._replicate_object.call_args_list, [ + mock.call(0, expected_shard_dbs[1], 0), + mock.call(0, expected_shard_dbs[2], 0), + ]) + self.assertEqual([ShardRange.CLEAVED] * 3 + [ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + self.assertEqual(1, sharder.shard_replication_quorum) + self.assertEqual(0, sharder.existing_shard_replication_quorum) + + # crazy replication quorums will be capped to replica_count + with self._mock_sharder( + {'shard_replication_quorum': 99, + 'existing_shard_replication_quorum': 99}) as sharder: + sharder._replicate_object = mock.MagicMock( + 
side_effect=[(False, [False, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + self.assertEqual([ShardRange.CLEAVED] * 3 + [ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + self.assertEqual(3, sharder.shard_replication_quorum) + self.assertEqual(3, sharder.existing_shard_replication_quorum) + + # ...and progress is still made if replication fully succeeds + with self._mock_sharder( + {'shard_replication_quorum': 99, + 'existing_shard_replication_quorum': 99}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(True, [True, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + self.assertEqual([ShardRange.ACTIVE] * 4, + [sr.state for sr in broker.get_shard_ranges()]) + warnings = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'shard_replication_quorum of 99 exceeds replica count', + warnings[0]) + self.assertIn( + 'existing_shard_replication_quorum of 99 exceeds replica count', + warnings[1]) + self.assertEqual(3, sharder.shard_replication_quorum) + self.assertEqual(3, sharder.existing_shard_replication_quorum) + + def test_cleave_to_existing_shard_db(self): + # verify that when cleaving to an already existing shard db + def replicate(node, from_broker, part): + # short circuit replication + rpc = replicator.ContainerReplicatorRpc( + self.tempdir, DATADIR, ContainerBroker, mount_check=False) + + fake_repl_connection = 
attach_fake_replication_rpc(rpc) + with mock.patch('swift.common.db_replicator.ReplConnection', + fake_repl_connection): + with mock.patch('swift.common.db_replicator.ring.Ring', + lambda *args, **kwargs: FakeRing()): + daemon = replicator.ContainerReplicator({}) + info = from_broker.get_replication_info() + success = daemon._repl_to_node( + node, from_broker, part, info) + self.assertTrue(success) + + orig_merge_items = ContainerBroker.merge_items + + def mock_merge_items(broker, items): + # capture merge_items calls + merge_items_calls.append((broker.path, + # merge mutates item so make a copy + [dict(item) for item in items])) + orig_merge_items(broker, items) + + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + # local db gets 4 objects + local_broker = self._make_broker() + local_broker.merge_items([dict(obj) for obj in objects[2:6]]) + local_broker._commit_puts() + local_retiring_db_id = local_broker.get_info()['id'] + + # remote db gets 5 objects + remote_broker = self._make_broker(device='sdb') + remote_broker.merge_items([dict(obj) for obj in objects[2:7]]) + remote_broker._commit_puts() + remote_retiring_db_id = remote_broker.get_info()['id'] + + local_node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda', + 'id': '2', 'index': 0, 'replication_ip': '1.2.3.4', + 'replication_port': 6040} + remote_node = {'ip': '1.2.3.5', 'port': 6040, 'device': 'sdb', + 'id': '3', 'index': 1, 'replication_ip': '1.2.3.5', + 'replication_port': 6040} + + # remote db replicates to local, bringing local db's total to 5 objects + self.assertNotEqual(local_broker.get_objects(), + remote_broker.get_objects()) + replicate(local_node, remote_broker, 0) + self.assertEqual(local_broker.get_objects(), + remote_broker.get_objects()) + + # local db gets 2 new objects, bringing its total to 7 + local_broker.merge_items([dict(obj) for obj 
in objects[1:2]]) + local_broker.merge_items([dict(obj) for obj in objects[7:8]]) + + # local db gets shard ranges + own_shard_range = local_broker.get_own_shard_range() + now = Timestamp.now() + own_shard_range.update_state(ShardRange.SHARDING, state_timestamp=now) + own_shard_range.epoch = now + shard_ranges = self._make_shard_ranges( + (('', 'obj004'), ('obj004', '')), state=ShardRange.CREATED) + local_broker.merge_shard_ranges([own_shard_range] + shard_ranges) + self.assertTrue(local_broker.set_sharding_state()) + + # local db shards + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True, True, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(local_broker, local_node, 0) + + # all objects merged from local to shard ranges + self.assertEqual([(shard_ranges[0].name, objects[1:5]), + (shard_ranges[1].name, objects[5:8])], + merge_items_calls) + + # shard brokers have sync points + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 5}], + shard_broker.get_syncs()) + self.assertEqual(objects[1:5], shard_broker.get_objects()) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 5}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:8], shard_broker.get_objects()) + + # local db replicates to remote, so remote now has shard ranges + # note: no 
objects replicated because local is sharded + self.assertFalse(remote_broker.get_shard_ranges()) + replicate(remote_node, local_broker, 0) + self._assert_shard_ranges_equal(local_broker.get_shard_ranges(), + remote_broker.get_shard_ranges()) + + # remote db gets 3 new objects, bringing its total to 8 + remote_broker.merge_items([dict(obj) for obj in objects[:1]]) + remote_broker.merge_items([dict(obj) for obj in objects[8:]]) + + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True, True, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(remote_broker, remote_node, 0) + + # shard brokers have sync points for the remote db so only new objects + # are merged from remote broker to shard brokers + self.assertEqual([(shard_ranges[0].name, objects[:1]), + (shard_ranges[1].name, objects[8:])], + merge_items_calls) + # sync points are updated + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 8}], + shard_broker.get_syncs()) + self.assertEqual(objects[:5], shard_broker.get_objects()) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 8}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + def _check_complete_sharding(self, account, container, shard_bounds): + broker = self._make_sharding_broker( + account=account, container=container, shard_bounds=shard_bounds) + obj = {'name': 'obj', 'created_at': next(self.ts_iter).internal, + 'size': 14, 'content_type': 'text/plain', 'etag': 'an etag', + 'deleted': 0} + broker.get_brokers()[0].merge_items([obj]) + 
self.assertEqual(2, len(broker.db_files)) # sanity check + + def check_not_complete(): + with self._mock_sharder() as sharder: + self.assertFalse(sharder._complete_sharding(broker)) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Repeat cleaving required for %r' % broker.db_files[0], + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + context = CleavingContext.load(broker) + self.assertFalse(context.cleaving_done) + self.assertFalse(context.misplaced_done) + self.assertEqual('', context.cursor) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + for shard_range in broker.get_shard_ranges(): + self.assertEqual(ShardRange.CLEAVED, shard_range.state) + self.assertEqual(SHARDING, broker.get_db_state()) + + # no cleave context progress + check_not_complete() + + # cleaving_done is False + context = CleavingContext.load(broker) + self.assertEqual(1, context.max_row) + context.cleave_to_row = 1 # pretend all rows have been cleaved + context.cleaving_done = False + context.misplaced_done = True + context.store(broker) + check_not_complete() + + # misplaced_done is False + context.misplaced_done = False + context.cleaving_done = True + context.store(broker) + check_not_complete() + + # modified db max row + old_broker = broker.get_brokers()[0] + obj = {'name': 'obj', 'created_at': next(self.ts_iter).internal, + 'size': 14, 'content_type': 'text/plain', 'etag': 'an etag', + 'deleted': 1} + old_broker.merge_items([obj]) + self.assertGreater(old_broker.get_max_row(), context.max_row) + context.misplaced_done = True + context.cleaving_done = True + context.store(broker) + check_not_complete() + + # db id changes + broker.get_brokers()[0].newid('fake_remote_id') + context.cleave_to_row = 2 # pretend all rows have been cleaved, again + context.store(broker) + check_not_complete() + + # context ok + context = CleavingContext.load(broker) + context.cleave_to_row = context.max_row + 
context.misplaced_done = True + context.cleaving_done = True + context.store(broker) + with self._mock_sharder() as sharder: + self.assertTrue(sharder._complete_sharding(broker)) + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + for shard_range in broker.get_shard_ranges(): + self.assertEqual(ShardRange.ACTIVE, shard_range.state) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertFalse(warning_lines) + sharder.logger.clear() + return broker + + def test_complete_sharding_root(self): + broker = self._check_complete_sharding( + 'a', 'c', (('', 'mid'), ('mid', ''))) + self.assertEqual(0, broker.get_own_shard_range().deleted) + + def test_complete_sharding_shard(self): + broker = self._check_complete_sharding( + '.shards_', 'shard_c', (('l', 'mid'), ('mid', 'u'))) + self.assertEqual(1, broker.get_own_shard_range().deleted) + + def test_identify_sharding_candidate(self): + brokers = [self._make_broker(container='c%03d' % i) for i in range(6)] + for broker in brokers: + broker.set_sharding_sysmeta('Root', 'a/c') + node = {'index': 2} + # containers are all empty + with self._mock_sharder() as sharder: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + expected_stats = {} + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + + objects = [ + ['obj%3d' % i, next(self.ts_iter).internal, i, 'text/plain', + 'etag%s' % i, 0] for i in range(160)] + + # one container has 100 objects, which is below the sharding threshold + for obj in objects[:100]: + brokers[0].put_object(*obj) + conf = {'recon_cache_path': self.tempdir} + with self._mock_sharder(conf=conf) as sharder: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + self.assertFalse(sharder.sharding_candidates) + expected_recon = { + 'found': 0, + 'top': []} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 
'sharding_candidates') + + # reduce the sharding threshold and the container is reported + conf = {'shard_container_threshold': 100, + 'recon_cache_path': self.tempdir} + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now() as now: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_0 = {'path': brokers[0].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c000', + 'root': 'a/c', + 'object_count': 100, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[0].db_file).st_size} + self.assertEqual([stats_0], sharder.sharding_candidates) + expected_recon = { + 'found': 1, + 'top': [stats_0]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # repeat with handoff node and db_file error + with self._mock_sharder(conf=conf) as sharder: + with mock.patch('os.stat', side_effect=OSError('test error')): + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, {}) + stats_0_b = {'path': brokers[0].db_file, + 'node_index': None, + 'account': 'a', + 'container': 'c000', + 'root': 'a/c', + 'object_count': 100, + 'meta_timestamp': now.internal, + 'file_size': None} + self.assertEqual([stats_0_b], sharder.sharding_candidates) + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + expected_recon = { + 'found': 1, + 'top': [stats_0_b]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # load up another container, but not to threshold for sharding, and + # verify it is never a candidate for sharding + for obj in objects[:50]: + brokers[2].put_object(*obj) + own_sr = brokers[2].get_own_shard_range() + for state in ShardRange.STATES: + own_sr.update_state(state, state_timestamp=Timestamp.now()) + brokers[2].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in 
brokers: + sharder._identify_sharding_candidate(broker, node) + with annotate_failure(state): + self.assertEqual([stats_0], sharder.sharding_candidates) + + # reduce the threshold and the second container is included + conf = {'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir} + own_sr.update_state(ShardRange.ACTIVE, state_timestamp=Timestamp.now()) + brokers[2].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_2 = {'path': brokers[2].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c002', + 'root': 'a/c', + 'object_count': 50, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[2].db_file).st_size} + self.assertEqual([stats_0, stats_2], sharder.sharding_candidates) + expected_recon = { + 'found': 2, + 'top': [stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # a broker not in active state is not included + own_sr = brokers[0].get_own_shard_range() + for state in ShardRange.STATES: + if state == ShardRange.ACTIVE: + continue + own_sr.update_state(state, state_timestamp=Timestamp.now()) + brokers[0].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + with annotate_failure(state): + self.assertEqual([stats_2], sharder.sharding_candidates) + + own_sr.update_state(ShardRange.ACTIVE, state_timestamp=Timestamp.now()) + brokers[0].merge_shard_ranges([own_sr]) + + # load up a third container with 150 objects + for obj in objects[:150]: + brokers[5].put_object(*obj) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_5 = {'path': brokers[5].db_file, + 
'node_index': 2, + 'account': 'a', + 'container': 'c005', + 'root': 'a/c', + 'object_count': 150, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[5].db_file).st_size} + self.assertEqual([stats_0, stats_2, stats_5], + sharder.sharding_candidates) + # note recon top list is sorted by size + expected_recon = { + 'found': 3, + 'top': [stats_5, stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # restrict the number of reported candidates + conf = {'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir, + 'recon_candidates_limit': 2} + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + self.assertEqual([stats_0, stats_2, stats_5], + sharder.sharding_candidates) + expected_recon = { + 'found': 3, + 'top': [stats_5, stats_0]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # unrestrict the number of reported candidates + conf = {'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir, + 'recon_candidates_limit': -1} + for i, broker in enumerate([brokers[1]] + brokers[3:5]): + for obj in objects[:(151 + i)]: + broker.put_object(*obj) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + + stats_4 = {'path': brokers[4].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c004', + 'root': 'a/c', + 'object_count': 153, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[4].db_file).st_size} + stats_3 = {'path': brokers[3].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c003', + 'root': 'a/c', + 'object_count': 152, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[3].db_file).st_size} + stats_1 = {'path': brokers[1].db_file, + 'node_index': 
2, + 'account': 'a', + 'container': 'c001', + 'root': 'a/c', + 'object_count': 151, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[1].db_file).st_size} + + self.assertEqual( + [stats_0, stats_1, stats_2, stats_3, stats_4, stats_5], + sharder.sharding_candidates) + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + expected_recon = { + 'found': 6, + 'top': [stats_4, stats_3, stats_1, stats_5, stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + def test_misplaced_objects_root_container(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + + objects = [ + # misplaced objects in second and third shard ranges + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 1], + ['where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0], + # deleted + ['x', self.ts_encoded(), 0, '', '', 1, 1], + ] + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + initial_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for shard_range in initial_shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(initial_shard_ranges) + + # unsharded + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # sharding - no misplaced objects + 
self.assertTrue(broker.set_sharding_state()) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # pretend we cleaved up to end of second shard range + context = CleavingContext.load(broker) + context.cursor = 'there' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # sharding - misplaced objects + for obj in objects: + broker.put_object(*obj) + # pretend we have not cleaved any ranges + context.cursor = '' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved up to end of second shard range + context.cursor = 'there' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) 
+ # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects(objects[2:], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved up to end of fourth shard range + context.cursor = 'yonder' + context.store(broker) + # and some new misplaced updates arrived in the first shard range + new_objects = [ + ['b', self.ts_encoded(), 10, 'text/plain', 'etag_b', 0, 0], + ['c', self.ts_encoded(), 20, 'text/plain', 'etag_c', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + + # check that *all* misplaced objects are moved despite exceeding + # the listing limit + with self._mock_sharder(conf={'cleave_row_batch_size': 2}) as sharder: + sharder._move_misplaced_objects(broker) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[2:4]], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check misplaced objects were moved + self._check_objects(new_objects, expected_shard_dbs[0]) + self._check_objects(objects[:2], expected_shard_dbs[1]) + self._check_objects(objects[2:3], expected_shard_dbs[2]) + self._check_objects(objects[3:], expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects([], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved all ranges - sharded state + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # and then more misplaced updates arrive + newer_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + ['z', self.ts_encoded(), 52, 'text/plain', 'etag_z', 0, 0], + ] + for obj in newer_objects: + broker.put_object(*obj) + broker.get_info() # force updates to be committed + # sanity check the puts landed in sharded broker + self._check_objects(newer_objects, broker.db_file) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) + for db in (expected_shard_dbs[0], expected_shard_dbs[-1])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check new misplaced objects were moved + self._check_objects(newer_objects[:1] + new_objects, + expected_shard_dbs[0]) + self._check_objects(newer_objects[1:], expected_shard_dbs[4]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + # ... 
and other shard dbs were unchanged + self._check_objects(objects[:2], expected_shard_dbs[1]) + self._check_objects(objects[2:3], expected_shard_dbs[2]) + self._check_objects(objects[3:], expected_shard_dbs[3]) + + def _setup_misplaced_objects(self): + # make a broker with shard ranges, move it to sharded state and then + # put some misplaced objects in it + broker = self._make_broker() + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + initial_shard_ranges = [ + ShardRange('.shards_a/%s-%s' % (lower, upper), + Timestamp.now(), lower, upper, state=ShardRange.ACTIVE) + for lower, upper in shard_bounds + ] + expected_dbs = [] + for shard_range in initial_shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(initial_shard_ranges) + objects = [ + # misplaced objects in second, third and fourth shard ranges + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0], + # deleted + ['x', self.ts_encoded(), 0, '', '', 1, 0], + ] + broker.enable_sharding(Timestamp.now()) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + for obj in objects: + broker.put_object(*obj) + self.assertEqual(SHARDED, broker.get_db_state()) + return broker, objects, expected_dbs + + def test_misplaced_objects_newer_objects(self): + # verify that objects merged to the db after misplaced objects have + # been identified are not removed from the db + broker, objects, expected_dbs = self._setup_misplaced_objects() + newer_objects = [ + ['j', self.ts_encoded(), 51, 'text/plain', 'etag_j', 0, 0], + ['k', self.ts_encoded(), 52, 'text/plain', 'etag_k', 1, 0], + ] + + calls = [] + 
pre_removal_objects = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + if db == expected_dbs[1]: + # put some new objects in the shard range that is being + # replicated before misplaced objects are removed from that + # range in the source db + for obj in newer_objects: + broker.put_object(*obj) + # grab a snapshot of the db contents - a side effect is + # that the newer objects are now committed to the db + pre_removal_objects.extend( + broker.get_objects()) + return True, [True, True, True] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + # sanity check - the newer objects were in the db before the misplaced + # objects were removed + for obj in newer_objects: + self.assertIn(obj[0], [o['name'] for o in pre_removal_objects]) + for obj in objects[:2]: + self.assertIn(obj[0], [o['name'] for o in pre_removal_objects]) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... 
but newer objects were not removed from the source db + self._check_objects(newer_objects, broker.db_file) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + # they will be moved on next cycle + unlink_files(expected_dbs) + with self._mock_sharder(replicas=3) as sharder: + sharder._move_misplaced_objects(broker) + + self._check_objects(newer_objects, expected_dbs[1]) + self._check_objects([], broker.db_file) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + def test_misplaced_objects_db_id_changed(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + pre_info = broker.get_info() + calls = [] + expected_retained_objects = [] + expected_retained_objects_dbs = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + if len(calls) == 2: + broker.newid('fake_remote_id') + # grab snapshot of the objects in the broker when it changed id + expected_retained_objects.extend( + self._get_raw_object_records(broker)) + if len(calls) >= 2: + expected_retained_objects_dbs.append(db) + return True, [True, True, True] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + # sanity checks + self.assertNotEqual(pre_info['id'], broker.get_info()['id']) + self.assertTrue(expected_retained_objects) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... 
but objects were not removed after the source db id changed + self._check_objects(expected_retained_objects, broker.db_file) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Refused to remove misplaced objects', lines[0]) + self.assertIn('Refused to remove misplaced objects', lines[1]) + self.assertFalse(lines[2:]) + + # they will be moved again on next cycle + unlink_files(expected_dbs) + sharder.logger.clear() + with self._mock_sharder(replicas=3) as sharder: + sharder._move_misplaced_objects(broker) + + self.assertEqual(2, len(set(expected_retained_objects_dbs))) + for db in expected_retained_objects_dbs: + if db == expected_dbs[1]: + self._check_objects(objects[:2], expected_dbs[1]) + if db == expected_dbs[2]: + self._check_objects(objects[2:3], expected_dbs[2]) + if db == expected_dbs[3]: + self._check_objects(objects[3:], expected_dbs[3]) + self._check_objects([], broker.db_file) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': len(expected_retained_objects), + 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + def test_misplaced_objects_sufficient_replication(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object.return_value = (True, [True, True, True]) + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_dbs[2:4])], + any_order=True) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, 
sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_3_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (True, [True, True, True]), # ok + expected_dbs[2]: (False, [True, False, False]), # < quorum + expected_dbs[3]: (False, [False, True, True])} # ok + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but only removed from the source db if sufficiently replicated + self._check_objects(objects[2:3], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_2_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (True, [True, True]), # ok + expected_dbs[2]: (False, [True, False]), # ok + expected_dbs[3]: (False, [False, False])} # < quorum + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=2) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but only removed from the source db if sufficiently replicated + self._check_objects(objects[3:], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_4_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (False, [True, False, False, False]), + expected_dbs[2]: (True, [True, False, False, True]), + expected_dbs[3]: (False, [False, False, False, False])} + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=4) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but only removed from the source db if sufficiently replicated + self._check_objects(objects[:2] + objects[3:], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def _check_misplaced_objects_shard_container_unsharded(self, conf=None): + broker = self._make_broker(account='.shards_a', container='.shard_c') + ts_shard = next(self.ts_iter) + own_sr = ShardRange(broker.path, ts_shard, 'here', 'where') + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertEqual(own_sr, broker.get_own_shard_range()) # sanity check + self.assertEqual(UNSHARDED, broker.get_db_state()) + + objects = [ + # some of these are misplaced objects + ['b', self.ts_encoded(), 2, 'text/plain', 'etag_b', 0, 0], + ['here', self.ts_encoded(), 2, 'text/plain', 'etag_here', 0, 0], + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0], # deleted + ['y', self.ts_encoded(), 10, 'text/plain', 'etag_y', 0, 0], + ] + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # no objects + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_not_called() + + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + 
self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # now put objects + for obj in objects: + broker.put_object(*obj) + self._check_objects(objects, broker.db_file) # sanity check + + # NB final shard range not available + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges[:-1]) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_called_with( + 0, expected_shard_dbs[0], 0), + + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 2, 'unplaced': 2} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # some misplaced objects could not be moved... + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Failed to find destination for at least 2 misplaced objects', + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + # ... and removed from the source db + self._check_objects(objects[2:], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + + # repeat with final shard range available + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + + sharder._replicate_object.assert_called_with( + 0, expected_shard_dbs[-1], 0), + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + self._check_objects(objects[4:], expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[2:4], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # repeat - no work remaining + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_not_called() + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # and then more misplaced updates arrive + new_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + ['z', self.ts_encoded(), 52, 'text/plain', 'etag_z', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + # sanity check the puts landed in unsharded broker + self._check_objects(new_objects[:1] + objects[2:4] + new_objects[1:], + broker.db_file) + + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) + for db in (expected_shard_dbs[0], expected_shard_dbs[3])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + 
self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check new misplaced objects were moved + self._check_objects(new_objects[:1] + objects[:2], + expected_shard_dbs[0]) + self._check_objects(objects[4:] + new_objects[1:], + expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[2:4], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + def test_misplaced_objects_shard_container_unsharded(self): + self._check_misplaced_objects_shard_container_unsharded() + + def test_misplaced_objects_shard_container_unsharded_limit_two(self): + self._check_misplaced_objects_shard_container_unsharded( + conf={'cleave_row_batch_size': 2}) + + def test_misplaced_objects_shard_container_unsharded_limit_one(self): + self._check_misplaced_objects_shard_container_unsharded( + conf={'cleave_row_batch_size': 1}) + + def test_misplaced_objects_shard_container_sharding(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + ts_shard = next(self.ts_iter) + # note that own_sr spans two root shard ranges + own_sr = ShardRange(broker.path, ts_shard, 'here', 'where') + own_sr.update_state(ShardRange.SHARDING) + own_sr.epoch = next(self.ts_iter) + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertEqual(own_sr, broker.get_own_shard_range()) # sanity check + self.assertEqual(UNSHARDED, broker.get_db_state()) + + objects = [ + # some of these are misplaced objects + ['b', self.ts_encoded(), 2, 'text/plain', 'etag_b', 0, 0], + ['here', self.ts_encoded(), 2, 'text/plain', 'etag_here', 0, 0], + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['v', self.ts_encoded(), 10, 'text/plain', 'etag_v', 0, 0], + ['y', self.ts_encoded(), 10, 'text/plain', 'etag_y', 0, 0], + ] + + shard_bounds = (('', 
'here'), ('here', 'there'), + ('there', 'where'), ('where', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # pretend broker is sharding but not yet cleaved a shard + self.assertTrue(broker.set_sharding_state()) + broker.merge_shard_ranges([dict(sr) for sr in root_shard_ranges[1:3]]) + # then some updates arrive + for obj in objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(objects, broker.db_file) # sanity check + + # first destination is not available + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges[1:]) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[-1], 0)], + ) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 1, 'unplaced': 2} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Failed to find destination for at least 2 misplaced objects', + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + + # check some misplaced objects were moved + self._check_objects(objects[5:], expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects(objects[:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # normality resumes and all destinations are available + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'})] + ) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0)], + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + self._check_objects(objects[5:], expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[2:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # pretend first shard has been cleaved + context = CleavingContext.load(broker) + context.cursor = 'there' + context.store(broker) + # and then more misplaced updates arrive + new_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + # this one is in the now cleaved shard range... 
+ ['k', self.ts_encoded(), 52, 'text/plain', 'etag_k', 0, 0], + ['z', self.ts_encoded(), 53, 'text/plain', 'etag_z', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + broker.get_info() # force updates to be committed + # sanity check the puts landed in sharded broker + self._check_objects(sorted(new_objects + objects[2:5]), broker.db_file) + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, + params={'states': 'updating', 'marker': '', + 'end_marker': 'there\x00'}), + mock.call(broker, newest=True, + params={'states': 'updating', 'marker': 'where', + 'end_marker': ''})]) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1], + expected_shard_dbs[-1])], + any_order=True + ) + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 5, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check *all* the misplaced objects were moved + self._check_objects(new_objects[:1] + objects[:2], + expected_shard_dbs[0]) + self._check_objects(new_objects[1:2] + objects[2:4], + expected_shard_dbs[1]) + self._check_objects(objects[5:] + new_objects[2:], + expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects(objects[4:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + def test_misplaced_objects_deleted_and_updated(self): + # setup + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + + shard_bounds = (('', 'here'), ('here', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(root_shard_ranges) + self.assertTrue(broker.set_sharding_state()) + + ts_older_internal = self.ts_encoded() # used later + # put deleted objects into source + objects = [ + ['b', self.ts_encoded(), 0, '', '', 1, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0] + ] + for obj in objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(objects, broker.db_file) # sanity check + # pretend we cleaved all ranges - sharded state + self.assertTrue(broker.set_sharded_state()) + + with self._mock_sharder() as sharder: + sharder.logger = debug_logger() + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check new misplaced objects were moved + self._check_objects(objects[:1], expected_shard_dbs[0]) + self._check_objects(objects[1:], expected_shard_dbs[1]) + # ... 
and removed from the source db + self._check_objects([], broker.db_file) + + # update source db with older undeleted versions of same objects + old_objects = [ + ['b', ts_older_internal, 2, 'text/plain', 'etag_b', 0, 0], + ['x', ts_older_internal, 4, 'text/plain', 'etag_x', 0, 0] + ] + for obj in old_objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(old_objects, broker.db_file) # sanity check + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check older misplaced objects were not merged to shard brokers + self._check_objects(objects[:1], expected_shard_dbs[0]) + self._check_objects(objects[1:], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + + # the destination shard dbs for misplaced objects may already exist so + # check they are updated correctly when overwriting objects + # update source db with newer deleted versions of same objects + new_objects = [ + ['b', self.ts_encoded(), 0, '', '', 1, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0] + ] + for obj in new_objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(new_objects, broker.db_file) # sanity check + shard_broker = ContainerBroker( + expected_shard_dbs[0], account=root_shard_ranges[0].account, + container=root_shard_ranges[0].container) + # update one shard container with even newer version of object + timestamps = [next(self.ts_iter) for i in range(7)] + ts_newer = encode_timestamps( + timestamps[1], timestamps[3], timestamps[5]) + newer_object = ('b', ts_newer, 10, 'text/plain', 'etag_b', 0, 0) + shard_broker.put_object(*newer_object) + + with self._mock_sharder() as 
sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check only the newer misplaced object was moved + self._check_objects([newer_object], expected_shard_dbs[0]) + self._check_objects(new_objects[1:], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has newer data + # but older content-type and metadata relative to shard object + ts_update = encode_timestamps( + timestamps[2], timestamps[3], timestamps[4]) + update_object = ('b', ts_update, 20, 'text/ignored', 'etag_newer', 0, + 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[3], timestamps[5]) + expected = ('b', ts_expected, 20, 'text/plain', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has older data + # and content-type but newer metadata relative to shard object + ts_update = encode_timestamps( + timestamps[1], timestamps[3], timestamps[6]) + update_object = ('b', ts_update, 999, 'text/ignored', 'etag_b', 0, 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[3], timestamps[6]) + expected = ('b', ts_expected, 20, 'text/plain', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has older data + # but newer 
content-type and metadata + ts_update = encode_timestamps( + timestamps[2], timestamps[6], timestamps[6]) + update_object = ('b', ts_update, 999, 'text/newer', 'etag_b', 0, 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[6], timestamps[6]) + expected = ('b', ts_expected, 20, 'text/newer', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + def _setup_find_ranges(self, account, cont, lower, upper): + broker = self._make_broker(account=account, container=cont) + own_sr = ShardRange('%s/%s' % (account, cont), Timestamp.now(), + lower, upper) + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + objects = [ + # some of these are misplaced objects + ['obj%3d' % i, self.ts_encoded(), i, 'text/plain', 'etag%s' % i, 0] + for i in range(100)] + for obj in objects: + broker.put_object(*obj) + return broker, objects + + def _check_find_shard_ranges_none_found(self, broker, objects): + with self._mock_sharder() as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertGreater(sharder.split_size, len(objects)) + self.assertEqual(0, num_found) + self.assertFalse(broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + with self._mock_sharder( + conf={'shard_container_threshold': 200}) as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(sharder.split_size, len(objects)) + self.assertEqual(0, num_found) + self.assertFalse(broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats 
= self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_none_found_root(self): + broker, objects = self._setup_find_ranges('a', 'c', '', '') + self._check_find_shard_ranges_none_found(broker, objects) + + def test_find_shard_ranges_none_found_shard(self): + broker, objects = self._setup_find_ranges( + '.shards_a', 'c', 'lower', 'upper') + self._check_find_shard_ranges_none_found(broker, objects) + + def _check_find_shard_ranges_finds_two(self, account, cont, lower, upper): + def check_ranges(): + self.assertEqual(2, len(broker.get_shard_ranges())) + expected_ranges = [ + ShardRange( + ShardRange.make_path('.int_shards_a', 'c', cont, now, 0), + now, lower, objects[98][0], 99), + ShardRange( + ShardRange.make_path('.int_shards_a', 'c', cont, now, 1), + now, objects[98][0], upper, 1), + ] + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + + # first invocation finds both ranges + broker, objects = self._setup_find_ranges( + account, cont, lower, upper) + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'auto_create_account_prefix': '.int_'} + ) as sharder: + with mock_timestamp_now() as now: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(99, sharder.split_size) + self.assertEqual(2, num_found) + check_ranges() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # second invocation finds none + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'auto_create_account_prefix': '.int_'} + ) as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(0, num_found) + self.assertEqual(2, len(broker.get_shard_ranges())) + check_ranges() + expected_stats = {'attempted': 0, 
'success': 0, 'failure': 0, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_finds_two_root(self): + self._check_find_shard_ranges_finds_two('a', 'c', '', '') + + def test_find_shard_ranges_finds_two_shard(self): + self._check_find_shard_ranges_finds_two('.shards_a', 'c_', 'l', 'u') + + def _check_find_shard_ranges_finds_three(self, account, cont, lower, + upper): + broker, objects = self._setup_find_ranges( + account, cont, lower, upper) + now = Timestamp.now() + expected_ranges = [ + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 0), + now, lower, objects[44][0], 45), + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 1), + now, objects[44][0], objects[89][0], 45), + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 2), + now, objects[89][0], upper, 10), + ] + # first invocation finds 2 ranges + with self._mock_sharder( + conf={'shard_container_threshold': 90, + 'shard_scanner_batch_size': 2}) as sharder: + with mock_timestamp_now(now): + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(45, sharder.split_size) + self.assertEqual(2, num_found) + self.assertEqual(2, len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges[:2], + broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # second invocation finds third shard range + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'shard_scanner_batch_size': 2} + ) as sharder: + with mock_timestamp_now(now): + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(1, num_found) + self.assertEqual(3, 
len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # third invocation finds none + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'shard_scanner_batch_size': 2} + ) as sharder: + sharder._send_shard_ranges = mock.MagicMock(return_value=True) + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(0, num_found) + self.assertEqual(3, len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + expected_stats = {'attempted': 0, 'success': 0, 'failure': 0, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_finds_three_root(self): + self._check_find_shard_ranges_finds_three('a', 'c', '', '') + + def test_find_shard_ranges_finds_three_shard(self): + self._check_find_shard_ranges_finds_three('.shards_a', 'c_', 'l', 'u') + + def test_sharding_enabled(self): + broker = self._make_broker() + self.assertFalse(sharding_enabled(broker)) + broker.update_metadata( + {'X-Container-Sysmeta-Sharding': + ('yes', Timestamp.now().internal)}) + self.assertTrue(sharding_enabled(broker)) + # deleting broker clears sharding sysmeta + broker.delete_db(Timestamp.now().internal) + self.assertFalse(sharding_enabled(broker)) + # but if broker has a shard range then sharding is enabled + broker.merge_shard_ranges( + ShardRange('acc/a_shard', Timestamp.now(), 'l', 'u')) + self.assertTrue(sharding_enabled(broker)) + + def test_send_shard_ranges(self): + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + + def 
do_test(replicas, *resp_codes): + sent_data = defaultdict(str) + + def on_send(fake_conn, data): + sent_data[fake_conn] += data + + with self._mock_sharder(replicas=replicas) as sharder: + with mocked_http_conn(*resp_codes, give_send=on_send) as conn: + with mock_timestamp_now() as now: + res = sharder._send_shard_ranges( + 'a', 'c', shard_ranges) + + self.assertEqual(sharder.ring.replica_count, len(conn.requests)) + expected_body = json.dumps([dict(sr) for sr in shard_ranges]) + expected_headers = {'Content-Type': 'application/json', + 'Content-Length': str(len(expected_body)), + 'X-Timestamp': now.internal, + 'X-Backend-Record-Type': 'shard', + 'User-Agent': mock.ANY} + for data in sent_data.values(): + self.assertEqual(expected_body, data) + hosts = set() + for req in conn.requests: + path_parts = req['path'].split('/')[1:] + hosts.add('%s:%s/%s' % (req['ip'], req['port'], path_parts[0])) + # FakeRing only has one partition + self.assertEqual('0', path_parts[1]) + self.assertEqual('PUT', req['method']) + self.assertEqual(['a', 'c'], path_parts[-2:]) + req_headers = req['headers'] + for k, v in expected_headers.items(): + self.assertEqual(v, req_headers[k]) + self.assertTrue( + req_headers['User-Agent'].startswith('container-sharder')) + self.assertEqual(sharder.ring.replica_count, len(hosts)) + return res, sharder + + replicas = 3 + res, sharder = do_test(replicas, 202, 202, 202) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, 404) + self.assertTrue(res) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True], [ 
+ 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 202, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 500, 500, 500) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception, 202) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, Exception, eventlet.Timeout(), 202) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + replicas = 2 + res, sharder = do_test(replicas, 202, 202) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 404) + self.assertTrue(res) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' 
in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception) + self.assertFalse(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, eventlet.Timeout(), Exception) + self.assertFalse(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + replicas = 4 + res, sharder = do_test(replicas, 202, 202, 202, 202) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertTrue(res) + res, sharder = do_test(replicas, 202, 202, 404, 404) + self.assertTrue(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, Exception, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 202, 404, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 500, 500, 500, 202) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + 
sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception, 202, 404) + self.assertFalse(res) + self.assertEqual([True], [ + all(msg in line for msg in ('Failed to put shard ranges', '404')) + for line in sharder.logger.get_lines_for_level('warning')]) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test( + replicas, eventlet.Timeout(), eventlet.Timeout(), 202, 404) + self.assertFalse(res) + self.assertEqual([True], [ + all(msg in line for msg in ('Failed to put shard ranges', '404')) + for line in sharder.logger.get_lines_for_level('warning')]) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + def test_process_broker_not_sharding_no_others(self): + # verify that sharding process will not start when own shard range is + # missing or in wrong state or there are no other shard ranges + broker = self._make_broker() + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # no own shard range + with self._mock_sharder() as sharder: + sharder._process_broker(broker, node, 99) + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + # now add own shard range + for state in sorted(ShardRange.STATES): + own_sr = broker.get_own_shard_range() # returns the default + own_sr.update_state(state) + broker.merge_shard_ranges([own_sr]) + with mock.patch.object( + broker, 'set_sharding_state') as 
mock_set_sharding_state: + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + with mock.patch.object(sharder, '_audit_container'): + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range( + no_default=True) + mock_set_sharding_state.assert_not_called() + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + def _check_process_broker_sharding_no_others(self, state): + # verify that when existing own_shard_range has given state and there + # are other shard ranges then the sharding process will begin + broker = self._make_broker(hash_='hash%s' % state) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + own_sr = broker.get_own_shard_range() + self.assertTrue(own_sr.update_state(state)) + epoch = Timestamp.now() + own_sr.epoch = epoch + shard_ranges = self._make_shard_ranges((('', 'm'), ('m', ''))) + broker.merge_shard_ranges([own_sr] + shard_ranges) + + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_create_shard_containers', return_value=0): + with mock_timestamp_now() as now: + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + final_own_sr = broker.get_own_shard_range(no_default=True) + + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(final_own_sr)) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(epoch.normal, parse_db_filename(broker.db_file)[1]) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + + def test_process_broker_sharding_with_own_shard_range_no_others(self): + 
self._check_process_broker_sharding_no_others(ShardRange.SHARDING) + self._check_process_broker_sharding_no_others(ShardRange.SHRINKING) + + def test_process_broker_not_sharding_others(self): + # verify that sharding process will not start when own shard range is + # missing or in wrong state even when other shard ranges are in the db + broker = self._make_broker() + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # add shard ranges - but not own + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + broker.merge_shard_ranges(shard_ranges) + + with self._mock_sharder() as sharder: + sharder._process_broker(broker, node, 99) + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + # now add own shard range + for state in sorted(ShardRange.STATES): + if state in (ShardRange.SHARDING, + ShardRange.SHRINKING, + ShardRange.SHARDED): + epoch = None + else: + epoch = Timestamp.now() + + own_sr = broker.get_own_shard_range() # returns the default + own_sr.update_state(state) + own_sr.epoch = epoch + broker.merge_shard_ranges([own_sr]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range( + no_default=True) + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + if epoch: + self.assertFalse(broker.logger.get_lines_for_level('warning')) + else: + self.assertIn('missing epoch', + broker.logger.get_lines_for_level('warning')[0]) + self.assertFalse(broker.logger.get_lines_for_level('error')) + 
broker.logger.clear() + + def _check_process_broker_sharding_others(self, state): + # verify states in which own_shard_range will cause sharding + # process to start when other shard ranges are in the db + broker = self._make_broker(hash_='hash%s' % state) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + # add shard ranges - but not own + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + broker.merge_shard_ranges(shard_ranges) + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # now set own shard range to given state and persist it + own_sr = broker.get_own_shard_range() # returns the default + self.assertTrue(own_sr.update_state(state)) + epoch = Timestamp.now() + own_sr.epoch = epoch + broker.merge_shard_ranges([own_sr]) + with self._mock_sharder() as sharder: + + sharder.logger = debug_logger() + with mock_timestamp_now() as now: + # we're not testing rest of the process here so prevent any + # attempt to progress shard range states + sharder._create_shard_containers = lambda *args: 0 + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range(no_default=True) + + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(epoch.normal, parse_db_filename(broker.db_file)[1]) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + + def test_process_broker_sharding_with_own_shard_range_and_others(self): + self._check_process_broker_sharding_others(ShardRange.SHARDING) + self._check_process_broker_sharding_others(ShardRange.SHRINKING) + self._check_process_broker_sharding_others(ShardRange.SHARDED) + + def check_shard_ranges_sent(self, broker, expected_sent): + bodies = [] + + def capture_send(conn, data): + bodies.append(data) + + with 
self._mock_sharder() as sharder: + with mocked_http_conn(204, 204, 204, + give_send=capture_send) as mock_conn: + sharder._update_root_container(broker) + + for req in mock_conn.requests: + self.assertEqual('PUT', req['method']) + self.assertEqual([expected_sent] * 3, + [json.loads(b) for b in bodies]) + + def test_update_root_container_own_range(self): + broker = self._make_broker() + + # nothing to send + with self._mock_sharder() as sharder: + with mocked_http_conn() as mock_conn: + sharder._update_root_container(broker) + self.assertFalse(mock_conn.requests) + + def check_only_own_shard_range_sent(state): + own_shard_range = broker.get_own_shard_range() + self.assertTrue(own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter))) + broker.merge_shard_ranges([own_shard_range]) + # add an object, expect to see it reflected in the own shard range + # that is sent + broker.put_object(str(own_shard_range.object_count + 1), + next(self.ts_iter).internal, 1, '', '') + with mock_timestamp_now() as now: + # force own shard range meta updates to be at fixed timestamp + expected_sent = [ + dict(own_shard_range, + meta_timestamp=now.internal, + object_count=own_shard_range.object_count + 1, + bytes_used=own_shard_range.bytes_used + 1)] + self.check_shard_ranges_sent(broker, expected_sent) + + for state in ShardRange.STATES: + with annotate_failure(state): + check_only_own_shard_range_sent(state) + + def test_update_root_container_all_ranges(self): + broker = self._make_broker() + other_shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + self.assertTrue(other_shard_ranges[0].set_deleted()) + broker.merge_shard_ranges(other_shard_ranges) + + # own range missing - send nothing + with self._mock_sharder() as sharder: + with mocked_http_conn() as mock_conn: + sharder._update_root_container(broker) + self.assertFalse(mock_conn.requests) + + def check_all_shard_ranges_sent(state): + own_shard_range = broker.get_own_shard_range() + 
self.assertTrue(own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter))) + broker.merge_shard_ranges([own_shard_range]) + # add an object, expect to see it reflected in the own shard range + # that is sent + broker.put_object(str(own_shard_range.object_count + 1), + next(self.ts_iter).internal, 1, '', '') + with mock_timestamp_now() as now: + shard_ranges = broker.get_shard_ranges(include_deleted=True) + expected_sent = sorted([ + own_shard_range.copy( + meta_timestamp=now.internal, + object_count=own_shard_range.object_count + 1, + bytes_used=own_shard_range.bytes_used + 1)] + + shard_ranges, + key=lambda sr: (sr.upper, sr.state, sr.lower)) + self.check_shard_ranges_sent( + broker, [dict(sr) for sr in expected_sent]) + + for state in ShardRange.STATES.keys(): + with annotate_failure(state): + check_all_shard_ranges_sent(state) + + def test_audit_root_container(self): + broker = self._make_broker() + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + self._assert_stats(expected_stats, sharder, 'audit_root') + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + mocked.assert_not_called() + + def assert_overlap_warning(line, state_text): + self.assertIn( + 'Audit failed for root %s' % broker.db_file, line) + self.assertIn( + 'overlapping ranges in state %s: k-t s-z' % state_text, + line) + + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1} + shard_bounds = (('a', 'j'), ('k', 't'), ('s', 'z')) + for state, state_text in ShardRange.STATES.items(): + shard_ranges = self._make_shard_ranges(shard_bounds, state) + broker.merge_shard_ranges(shard_ranges) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + 
lines = sharder.logger.get_lines_for_level('warning') + assert_overlap_warning(lines[0], state_text) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_root') + mocked.assert_not_called() + + def assert_missing_warning(line): + self.assertIn( + 'Audit failed for root %s' % broker.db_file, line) + self.assertIn('missing range(s): -a j-k z-', line) + + own_shard_range = broker.get_own_shard_range() + states = (ShardRange.SHARDING, ShardRange.SHARDED) + for state in states: + own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter)) + broker.merge_shard_ranges([own_shard_range]) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + lines = sharder.logger.get_lines_for_level('warning') + assert_missing_warning(lines[0]) + assert_overlap_warning(lines[0], state_text) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_root') + mocked.assert_not_called() + + def test_audit_shard_container(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + broker.set_sharding_sysmeta('Root', 'a/c') + # include overlaps to verify correct match for updating own shard range + shard_bounds = ( + ('a', 'j'), ('k', 't'), ('k', 's'), ('l', 's'), ('s', 'z')) + shard_ranges = self._make_shard_ranges(shard_bounds, ShardRange.ACTIVE) + shard_ranges[1].name = broker.path + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1} + + def call_audit_container(exc=None): + with self._mock_sharder() as sharder: + sharder.logger = debug_logger() + with mock.patch.object(sharder, '_audit_root_container') \ + as mocked, mock.patch.object( + sharder, 'int_client') as mock_swift: + mock_response = mock.MagicMock() + mock_response.headers = {'x-backend-record-type': + 'shard'} 
+ mock_response.body = json.dumps( + [dict(sr) for sr in shard_ranges]) + mock_swift.make_request.return_value = mock_response + mock_swift.make_request.side_effect = exc + mock_swift.make_path = (lambda a, c: + '/v1/%s/%s' % (a, c)) + sharder.reclaim_age = 0 + sharder._audit_container(broker) + mocked.assert_not_called() + return sharder, mock_swift + + # bad account name + broker.account = 'bad_account' + sharder, mock_swift = call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertIn('Audit warnings for shard %s' % broker.db_file, lines[0]) + self.assertIn('account not in shards namespace', lines[0]) + self.assertNotIn('root has no matching shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self.assertIn('Audit failed for shard %s' % broker.db_file, lines[1]) + self.assertIn('missing own shard range', lines[1]) + self.assertFalse(lines[2:]) + self.assertFalse(broker.is_deleted()) + + # missing own shard range + broker.get_info() + sharder, mock_swift = call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertIn('Audit failed for shard %s' % broker.db_file, lines[0]) + self.assertIn('missing own shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + + # create own shard range, no match in root + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + own_shard_range = broker.get_own_shard_range() # get the default + own_shard_range.lower = 'j' + own_shard_range.upper = 'k' + broker.merge_shard_ranges([own_shard_range]) + sharder, mock_swift = call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Audit warnings 
for shard %s' % broker.db_file, lines[0]) + self.assertNotIn('account not in shards namespace', lines[0]) + self.assertNotIn('missing own shard range', lines[0]) + self.assertIn('root has no matching shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + expected_headers = {'X-Backend-Record-Type': 'shard', + 'X-Newest': 'true', + 'X-Backend-Include-Deleted': 'True', + 'X-Backend-Override-Deleted': 'true'} + params = {'format': 'json', 'marker': 'j', 'end_marker': 'k'} + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + # create own shard range, failed response from root + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + own_shard_range = broker.get_own_shard_range() # get the default + own_shard_range.lower = 'j' + own_shard_range.upper = 'k' + broker.merge_shard_ranges([own_shard_range]) + sharder, mock_swift = call_audit_container( + exc=internal_client.UnexpectedResponse('bad', 'resp')) + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Failed to get shard ranges', lines[0]) + self.assertIn('Audit warnings for shard %s' % broker.db_file, lines[1]) + self.assertNotIn('account not in shards namespace', lines[1]) + self.assertNotIn('missing own shard range', lines[1]) + self.assertNotIn('root has no matching shard range', lines[1]) + self.assertIn('unable to get shard ranges from root', lines[1]) + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertFalse(lines[2:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + 
params=params) + + def assert_ok(): + sharder, mock_swift = call_audit_container() + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_shard') + params = {'format': 'json', 'marker': 'k', 'end_marker': 't'} + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + # make own shard range match one in root, but different state + shard_ranges[1].timestamp = Timestamp.now() + broker.merge_shard_ranges([shard_ranges[1]]) + now = Timestamp.now() + shard_ranges[1].update_state(ShardRange.SHARDING, state_timestamp=now) + assert_ok() + self.assertFalse(broker.is_deleted()) + # own shard range state is updated from root version + own_shard_range = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_shard_range.state) + self.assertEqual(now, own_shard_range.state_timestamp) + + own_shard_range.update_state(ShardRange.SHARDED, + state_timestamp=Timestamp.now()) + broker.merge_shard_ranges([own_shard_range]) + assert_ok() + + own_shard_range.deleted = 1 + own_shard_range.timestamp = Timestamp.now() + broker.merge_shard_ranges([own_shard_range]) + assert_ok() + self.assertTrue(broker.is_deleted()) + + def test_find_and_enable_sharding_candidates(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + shard_bounds = (('', 'here'), ('here', 'there'), ('there', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CLEAVED) + shard_ranges[0].state = ShardRange.ACTIVE + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._find_and_enable_sharding_candidates(broker) + + # one range just below threshold + shard_ranges[0].update_meta(sharder.shard_container_threshold - 1, 0) + 
broker.merge_shard_ranges(shard_ranges[0]) + with self._mock_sharder() as sharder: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + + # two ranges above threshold, only one ACTIVE + shard_ranges[0].update_meta(sharder.shard_container_threshold, 0) + shard_ranges[2].update_meta(sharder.shard_container_threshold + 1, 0) + broker.merge_shard_ranges([shard_ranges[0], shard_ranges[2]]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + expected = shard_ranges[0].copy(state=ShardRange.SHARDING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal([expected] + shard_ranges[1:], + broker.get_shard_ranges()) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal([expected] + shard_ranges[1:], + broker.get_shard_ranges()) + + # two ranges above threshold, both ACTIVE + shard_ranges[2].update_state(ShardRange.ACTIVE) + broker.merge_shard_ranges(shard_ranges[2]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + expected_2 = shard_ranges[2].copy(state=ShardRange.SHARDING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [expected, shard_ranges[1], expected_2], broker.get_shard_ranges()) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal( + [expected, shard_ranges[1], expected_2], broker.get_shard_ranges()) + + def test_find_and_enable_sharding_candidates_bootstrap(self): + broker = self._make_broker() + with self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + 
sharder._find_and_enable_sharding_candidates(broker) + self.assertEqual(ShardRange.ACTIVE, broker.get_own_shard_range().state) + broker.put_object('obj', next(self.ts_iter).internal, 1, '', '') + self.assertEqual(1, broker.get_info()['object_count']) + with self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates( + broker, [broker.get_own_shard_range()]) + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(now, own_sr.state_timestamp) + self.assertEqual(now, own_sr.epoch) + + # check idempotency + with self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + with mock_timestamp_now(): + sharder._find_and_enable_sharding_candidates( + broker, [broker.get_own_shard_range()]) + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(now, own_sr.state_timestamp) + self.assertEqual(now, own_sr.epoch) + + def test_find_and_enable_shrinking_candidates(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + shard_bounds = (('', 'here'), ('here', 'there'), ('there', '')) + size = (DEFAULT_SHARD_SHRINK_POINT * + DEFAULT_SHARD_CONTAINER_THRESHOLD / 100) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE, object_count=size) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + + # one range just below threshold + shard_ranges[0].update_meta(size - 1, 0) + broker.merge_shard_ranges(shard_ranges[0]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._send_shard_ranges = mock.MagicMock() + 
sharder._find_and_enable_shrinking_candidates(broker) + acceptor = shard_ranges[1].copy(lower=shard_ranges[0].lower) + acceptor.timestamp = now + donor = shard_ranges[0].copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # acceptor falls below threshold - not a candidate + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + acceptor.update_meta(0, 0, meta_timestamp=now) + broker.merge_shard_ranges(acceptor) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # ...until donor has shrunk + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + donor.update_state(ShardRange.SHARDED, state_timestamp=now) + donor.set_deleted(timestamp=now) + broker.merge_shard_ranges(donor) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + new_acceptor = 
shard_ranges[2].copy(lower=acceptor.lower) + new_acceptor.timestamp = now + new_donor = acceptor.copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [donor, new_donor, new_acceptor], + broker.get_shard_ranges(include_deleted=True)) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(new_acceptor.account, new_acceptor.container, + [new_acceptor]), + mock.call(new_donor.account, new_donor.container, + [new_donor, new_acceptor])] + ) + + # ..finally last shard shrinks to root + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + new_donor.update_state(ShardRange.SHARDED, state_timestamp=now) + new_donor.set_deleted(timestamp=now) + new_acceptor.update_meta(0, 0, meta_timestamp=now) + broker.merge_shard_ranges([new_donor, new_acceptor]) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + final_donor = new_acceptor.copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [donor, new_donor, final_donor], + broker.get_shard_ranges(include_deleted=True)) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(final_donor.account, final_donor.container, + [final_donor, broker.get_own_shard_range()])] + ) + + def test_partition_and_device_filters(self): + # verify partitions and devices kwargs result in filtering of processed + # containers but not of the local device ids. 
+ ring = FakeRing() + dev_ids = set() + container_data = [] + for dev in ring.devs: + dev_ids.add(dev['id']) + part = str(dev['id']) + broker = self._make_broker( + container='c%s' % dev['id'], hash_='c%shash' % dev['id'], + device=dev['device'], part=part) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + container_data.append((broker.path, dev['id'], part)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once() + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set(container_data), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(partitions='0') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set([container_data[0]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(partitions='2,0') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set([container_data[0], container_data[2]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(partitions='2,0', devices='sdc') + self.assertEqual(dev_ids, 
set(sharder._local_device_ids)) + self.assertEqual(set([container_data[2]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(devices='sdb,sdc') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set(container_data[1:]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + +class TestCleavingContext(BaseTestSharder): + def test_init(self): + ctx = CleavingContext(ref='test') + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertIsNone(ctx.max_row) + self.assertIsNone(ctx.cleave_to_row) + self.assertIsNone(ctx.last_cleave_to_row) + self.assertFalse(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + + def test_iter(self): + ctx = CleavingContext('test', 'curs', 12, 11, 10, False, True, 0, 4) + expected = {'ref': 'test', + 'cursor': 'curs', + 'max_row': 12, + 'cleave_to_row': 11, + 'last_cleave_to_row': 10, + 'cleaving_done': False, + 'misplaced_done': True, + 'ranges_done': 0, + 'ranges_todo': 4} + self.assertEqual(expected, dict(ctx)) + + def test_cursor(self): + broker = self._make_broker() + ref = CleavingContext._make_ref(broker) + + for curs in ('curs', u'curs\u00e4\u00fb'): + with annotate_failure('%r' % curs): + ctx = CleavingContext(ref, curs, 12, 11, 10, False, True) + self.assertEqual(curs.encode('utf8'), ctx.cursor) + ctx.store(broker) + ctx = CleavingContext.load(broker) + self.assertEqual(curs.encode('utf8'), ctx.cursor) + + def test_load(self): + broker = self._make_broker() + for i in range(6): + broker.put_object('o%s' % i, next(self.ts_iter).internal, 10, + 'text/plain', 'etag_a', 0) + + db_id = broker.get_info()['id'] + params = {'ref': db_id, + 'cursor': 'curs', + 
'max_row': 2, + 'cleave_to_row': 2, + 'last_cleave_to_row': 1, + 'cleaving_done': False, + 'misplaced_done': True, + 'ranges_done': 2, + 'ranges_todo': 4} + key = 'X-Container-Sysmeta-Shard-Context-%s' % db_id + broker.update_metadata( + {key: (json.dumps(params), Timestamp.now().internal)}) + ctx = CleavingContext.load(broker) + self.assertEqual(db_id, ctx.ref) + self.assertEqual('curs', ctx.cursor) + # note max_row is dynamically updated during load + self.assertEqual(6, ctx.max_row) + self.assertEqual(2, ctx.cleave_to_row) + self.assertEqual(1, ctx.last_cleave_to_row) + self.assertTrue(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + self.assertEqual(2, ctx.ranges_done) + self.assertEqual(4, ctx.ranges_todo) + + def test_store(self): + broker = self._make_sharding_broker() + old_db_id = broker.get_brokers()[0].get_info()['id'] + ctx = CleavingContext(old_db_id, 'curs', 12, 11, 2, True, True, 2, 4) + ctx.store(broker) + key = 'X-Container-Sysmeta-Shard-Context-%s' % old_db_id + data = json.loads(broker.metadata[key][0]) + expected = {'ref': old_db_id, + 'cursor': 'curs', + 'max_row': 12, + 'cleave_to_row': 11, + 'last_cleave_to_row': 2, + 'cleaving_done': True, + 'misplaced_done': True, + 'ranges_done': 2, + 'ranges_todo': 4} + self.assertEqual(expected, data) + + def test_store_add_row_load(self): + # adding row to older db changes only max_row in the context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + old_db_id = old_broker.get_info()['id'] + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + old_max_row = old_broker.get_max_row() + self.assertEqual(1, old_max_row) # sanity check + ctx = CleavingContext(old_db_id, 'curs', 1, 1, 0, True, True) + ctx.store(broker) + + # adding a row changes max row + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + + new_ctx = 
CleavingContext.load(broker) + self.assertEqual(old_db_id, new_ctx.ref) + self.assertEqual('curs', new_ctx.cursor) + self.assertEqual(2, new_ctx.max_row) + self.assertEqual(1, new_ctx.cleave_to_row) + self.assertEqual(0, new_ctx.last_cleave_to_row) + self.assertTrue(new_ctx.misplaced_done) + self.assertTrue(new_ctx.cleaving_done) + + def test_store_reclaim_load(self): + # reclaiming rows from older db does not change context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + old_db_id = old_broker.get_info()['id'] + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + old_max_row = old_broker.get_max_row() + self.assertEqual(1, old_max_row) # sanity check + ctx = CleavingContext(old_db_id, 'curs', 1, 1, 0, True, True) + ctx.store(broker) + + self.assertEqual( + 1, len(old_broker.get_objects())) + now = next(self.ts_iter).internal + broker.get_brokers()[0].reclaim(now, now) + self.assertFalse(old_broker.get_objects()) + + new_ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, new_ctx.ref) + self.assertEqual('curs', new_ctx.cursor) + self.assertEqual(1, new_ctx.max_row) + self.assertEqual(1, new_ctx.cleave_to_row) + self.assertEqual(0, new_ctx.last_cleave_to_row) + self.assertTrue(new_ctx.misplaced_done) + self.assertTrue(new_ctx.cleaving_done) + + def test_store_modify_db_id_load(self): + # changing id changes ref, so results in a fresh context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + old_db_id = old_broker.get_info()['id'] + ctx = CleavingContext(old_db_id, 'curs', 12, 11, 2, True, True) + ctx.store(broker) + + old_broker.newid('fake_remote_id') + new_db_id = old_broker.get_info()['id'] + self.assertNotEqual(old_db_id, new_db_id) + + new_ctx = CleavingContext.load(broker) + self.assertEqual(new_db_id, new_ctx.ref) + self.assertEqual('', new_ctx.cursor) + # note max_row is dynamically updated during load + 
self.assertEqual(-1, new_ctx.max_row) + self.assertEqual(None, new_ctx.cleave_to_row) + self.assertEqual(None, new_ctx.last_cleave_to_row) + self.assertFalse(new_ctx.misplaced_done) + self.assertFalse(new_ctx.cleaving_done) + + def test_load_modify_store_load(self): + broker = self._make_sharding_broker() + old_db_id = broker.get_brokers()[0].get_info()['id'] + ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, ctx.ref) + self.assertEqual('', ctx.cursor) # sanity check + ctx.cursor = 'curs' + ctx.misplaced_done = True + ctx.store(broker) + ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, ctx.ref) + self.assertEqual('curs', ctx.cursor) + self.assertTrue(ctx.misplaced_done) + + def test_reset(self): + ctx = CleavingContext('test', 'curs', 12, 11, 2, True, True) + + def check_context(): + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertEqual(12, ctx.max_row) + self.assertEqual(11, ctx.cleave_to_row) + self.assertEqual(11, ctx.last_cleave_to_row) + self.assertFalse(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + self.assertEqual(0, ctx.ranges_done) + self.assertEqual(0, ctx.ranges_todo) + for _ in range(2): # second pass checks idempotency + ctx.reset() + check_context() + + def test_start(self): + ctx = CleavingContext('test', 'curs', 12, 11, 2, True, True) + + def check_context(): + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertEqual(12, ctx.max_row) + self.assertEqual(12, ctx.cleave_to_row) + self.assertEqual(2, ctx.last_cleave_to_row) + self.assertTrue(ctx.misplaced_done) # *not* reset here + self.assertFalse(ctx.cleaving_done) + self.assertEqual(0, ctx.ranges_done) + self.assertEqual(0, ctx.ranges_todo) + for _ in range(2): # second pass checks idempotency + ctx.start() + check_context() diff --git a/test/unit/proxy/controllers/test_container.py b/test/unit/proxy/controllers/test_container.py index e85e50362a..ae44f8b001 100644 --- a/test/unit/proxy/controllers/test_container.py +++ 
b/test/unit/proxy/controllers/test_container.py @@ -159,6 +159,91 @@ class TestContainerController(TestRingBase): for key in owner_headers: self.assertIn(key, resp.headers) + def test_reseller_admin(self): + reseller_internal_headers = { + get_sys_meta_prefix('container') + 'sharding': 'True'} + reseller_external_headers = {'x-container-sharding': 'on'} + controller = proxy_server.ContainerController(self.app, 'a', 'c') + + # Normal users, even swift owners, can't set it + req = Request.blank('/v1/a/c', method='PUT', + headers=reseller_external_headers, + environ={'swift_owner': True}) + with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertNotIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', method='POST', + headers=reseller_external_headers, + environ={'swift_owner': True}) + with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertNotIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', environ={'swift_owner': True}) + # Heck, they don't even get to know + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.HEAD(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertNotIn(key, resp.headers) + + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.GET(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertNotIn(key, resp.headers) + + # But reseller admins can set it + 
req = Request.blank('/v1/a/c', method='PUT', + headers=reseller_external_headers, + environ={'reseller_request': True}) + with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', method='POST', + headers=reseller_external_headers, + environ={'reseller_request': True}) + with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertIn(key.title(), captured['headers']) + + # And see that they have + req = Request.blank('/v1/a/c', environ={'reseller_request': True}) + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.HEAD(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertIn(key, resp.headers) + self.assertEqual(resp.headers[key], 'True') + + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.GET(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertEqual(resp.headers[key], 'True') + def test_sys_meta_headers_PUT(self): # check that headers in sys meta namespace make it through # the container controller From 5c5b08d0472f695a0d81655ab80725f1be08d308 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Wed, 2 May 2018 09:33:17 +0100 Subject: [PATCH 9/9] Add container sharding documentation Co-Authored-By: Matthew Oliver Co-Authored-By: Tim Burke Co-Authored-By: Clay Gerrard Co-Authored-By: John Dickinson Change-Id: 
I0693e54c1d7f3b77f53c3df5c616a16f74723b97 --- doc/source/container.rst | 18 +- doc/source/images/sharded_GET.svg | 2019 ++++++++++++++++ doc/source/images/sharding_GET.svg | 2112 +++++++++++++++++ doc/source/images/sharding_cleave1_load.svg | 1694 +++++++++++++ doc/source/images/sharding_cleave2_load.svg | 1754 ++++++++++++++ doc/source/images/sharding_cleave_basic.svg | 649 +++++ doc/source/images/sharding_db_states.svg | 1502 ++++++++++++ doc/source/images/sharding_scan_basic.svg | 259 ++ doc/source/images/sharding_scan_load.svg | 1665 +++++++++++++ doc/source/images/sharding_sharded_load.svg | 1650 +++++++++++++ doc/source/images/sharding_unsharded.svg | 199 ++ doc/source/images/sharding_unsharded_load.svg | 219 ++ doc/source/index.rst | 1 + doc/source/logs.rst | 1 + doc/source/overview_architecture.rst | 2 + doc/source/overview_container_sharding.rst | 784 ++++++ 16 files changed, 14524 insertions(+), 4 deletions(-) create mode 100644 doc/source/images/sharded_GET.svg create mode 100644 doc/source/images/sharding_GET.svg create mode 100644 doc/source/images/sharding_cleave1_load.svg create mode 100644 doc/source/images/sharding_cleave2_load.svg create mode 100644 doc/source/images/sharding_cleave_basic.svg create mode 100644 doc/source/images/sharding_db_states.svg create mode 100644 doc/source/images/sharding_scan_basic.svg create mode 100644 doc/source/images/sharding_scan_load.svg create mode 100644 doc/source/images/sharding_sharded_load.svg create mode 100644 doc/source/images/sharding_unsharded.svg create mode 100644 doc/source/images/sharding_unsharded_load.svg create mode 100644 doc/source/overview_container_sharding.rst diff --git a/doc/source/container.rst b/doc/source/container.rst index dcff33e3aa..bc95753852 100644 --- a/doc/source/container.rst +++ b/doc/source/container.rst @@ -24,6 +24,16 @@ Container Backend :undoc-members: :show-inheritance: +.. _container-replicator: + +Container Replicator +==================== + +.. 
automodule:: swift.container.replicator + :members: + :undoc-members: + :show-inheritance: + .. _container-server: Container Server @@ -44,12 +54,12 @@ Container Reconciler :undoc-members: :show-inheritance: -.. _container-replicator: +.. _container-sharder: -Container Replicator -==================== +Container Sharder +================= -.. automodule:: swift.container.replicator +.. automodule:: swift.container.sharder :members: :undoc-members: :show-inheritance: diff --git a/doc/source/images/sharded_GET.svg b/doc/source/images/sharded_GET.svg new file mode 100644 index 0000000000..03c271b5cc --- /dev/null +++ b/doc/source/images/sharded_GET.svg @@ -0,0 +1,2019 @@ + + + + + + image/svg+xmlcont (fresh db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + + cont-aef34f-<ts>-3 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + "igloo" - "linux" + + cont-4837ad-<ts>-4 + "linux" - "" + + proxy + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + diff --git a/doc/source/images/sharding_GET.svg b/doc/source/images/sharding_GET.svg new file mode 100644 index 0000000000..5e9240feeb --- /dev/null +++ b/doc/source/images/sharding_GET.svg @@ -0,0 +1,2112 @@ + + + + + + image/svg+xmlcont (fresh db) + cont (retiring db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + + cat + + giraffe + + igloo + + linux + + cont-aef34f-<ts>-3 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + "igloo" - "linux" + "linux" - "" + + + proxy + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 3 + + + + 4 + + + + + + + diff --git a/doc/source/images/sharding_cleave1_load.svg b/doc/source/images/sharding_cleave1_load.svg new file mode 100644 index 0000000000..4485e3ea09 --- /dev/null +++ b/doc/source/images/sharding_cleave1_load.svg @@ -0,0 +1,1694 @@ + + + + + + image/svg+xmlcont (fresh db) + cont (retiring db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 
+ cont-4ec28d-<ts>-2 + + + + + + + + cat + + giraffe + + igloo + "igloo" - "" + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + diff --git a/doc/source/images/sharding_cleave2_load.svg b/doc/source/images/sharding_cleave2_load.svg new file mode 100644 index 0000000000..548aab56ab --- /dev/null +++ b/doc/source/images/sharding_cleave2_load.svg @@ -0,0 +1,1754 @@ + + + + + + image/svg+xmlcont (fresh db) + cont (retiring db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + + cat + + + + + + giraffe + + + + + + igloo + + linux + + + + cont-aef34f-<ts>-3 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + "igloo" - "linux" + "linux" - "" + diff --git a/doc/source/images/sharding_cleave_basic.svg b/doc/source/images/sharding_cleave_basic.svg new file mode 100644 index 0000000000..fd5069754f --- /dev/null +++ b/doc/source/images/sharding_cleave_basic.svg @@ -0,0 +1,649 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + /.shards_acct + /acct + cont-568d8e-<ts>-0 + cont-750ed3-<ts>-1 + cont + + diff --git a/doc/source/images/sharding_db_states.svg b/doc/source/images/sharding_db_states.svg new file mode 100644 index 0000000000..6693ef9b3a --- /dev/null +++ b/doc/source/images/sharding_db_states.svg @@ -0,0 +1,1502 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Container DB + + + Container DB + + + + + + + Retiring DB + + + Retiring DB + + + + + + + Fresh DB + + + Fresh DB + + + + + + + Fresh DB + + + Fresh DB + + + + + + + + + SHARDED + + + SHARDED + + + + + + + + + + + UNSHARDED + + + UNSHARDED + + + 
+ + + + SHARDING + + + SHARDING + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/images/sharding_scan_basic.svg b/doc/source/images/sharding_scan_basic.svg new file mode 100644 index 0000000000..54c30f0d8d --- /dev/null +++ b/doc/source/images/sharding_scan_basic.svg @@ -0,0 +1,259 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + /acct + cont + + + cat + giraffe + diff --git a/doc/source/images/sharding_scan_load.svg b/doc/source/images/sharding_scan_load.svg new file mode 100644 index 0000000000..327ac1a06c --- /dev/null +++ b/doc/source/images/sharding_scan_load.svg @@ -0,0 +1,1665 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cont (fresh db) + cont (retiring db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + + + + + cat + + giraffe + + igloo + "igloo" - "" + diff --git a/doc/source/images/sharding_sharded_load.svg b/doc/source/images/sharding_sharded_load.svg new file mode 100644 index 0000000000..ae9aacb86c --- /dev/null +++ b/doc/source/images/sharding_sharded_load.svg @@ -0,0 +1,1650 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cont (fresh db) + + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + + + + cont-aef34f-<ts>-3 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + "igloo" - "linux" + + + + cont-4837ad-<ts>-4 + "linux" - "" + diff --git a/doc/source/images/sharding_unsharded.svg b/doc/source/images/sharding_unsharded.svg new file mode 100644 index 0000000000..4241b0de13 --- /dev/null +++ b/doc/source/images/sharding_unsharded.svg @@ -0,0 +1,199 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + /acct + cont + diff --git a/doc/source/images/sharding_unsharded_load.svg b/doc/source/images/sharding_unsharded_load.svg new file mode 100644 index 0000000000..e613e8cbbd --- /dev/null +++ b/doc/source/images/sharding_unsharded_load.svg @@ -0,0 +1,219 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cont + /acct + diff --git a/doc/source/index.rst b/doc/source/index.rst index 63df790815..b72925c6dd 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -62,6 +62,7 @@ Overview and Concepts overview_erasure_code overview_encryption overview_backing_store + overview_container_sharding ring_background ring_partpower associated_projects diff --git a/doc/source/logs.rst b/doc/source/logs.rst index f9a8ba2c62..1a5d2656c2 100644 --- a/doc/source/logs.rst +++ b/doc/source/logs.rst @@ -105,6 +105,7 @@ RL :ref:`ratelimit` VW :ref:`versioned_writes` SSC :ref:`copy` SYM :ref:`symlink` +SH :ref:`sharding_doc` ======================= ============================= diff 
--git a/doc/source/overview_architecture.rst b/doc/source/overview_architecture.rst index 30b26a471f..b0ae293d9a 100644 --- a/doc/source/overview_architecture.rst +++ b/doc/source/overview_architecture.rst @@ -172,6 +172,8 @@ replicator for Replication type policies. See :doc:`overview_erasure_code` for complete information on both Erasure Code support as well as the reconstructor. +.. _architecture_updaters: + -------- Updaters -------- diff --git a/doc/source/overview_container_sharding.rst b/doc/source/overview_container_sharding.rst new file mode 100644 index 0000000000..110fcc8f87 --- /dev/null +++ b/doc/source/overview_container_sharding.rst @@ -0,0 +1,784 @@ +.. _sharding_doc: + +================== +Container Sharding +================== + +Container sharding is an operator controlled feature that may be used to shard +very large container databases into a number of smaller shard containers. + +.. note:: + + Container sharding is currently an experimental feature. It is strongly + recommended that operators gain experience of sharding containers in a + non-production cluster before using in production. + + The sharding process involves moving all sharding container database + records via the container replication engine; the time taken to complete + sharding is dependent upon the existing cluster load and the performance of + the container database being sharded. + + There is currently no documented process for reversing the sharding + process once sharding has been enabled. + + +---------- +Background +---------- +The metadata for each container in Swift is stored in an SQLite database. This +metadata includes: information about the container such as its name, +modification time and current object count; user metadata that may have been written +to the container by clients; a record of every object in the container.
The +container database object records are used to generate container listings in +response to container GET requests; each object record stores the object's +name, size, hash and content-type as well as associated timestamps. + +As the number of objects in a container increases then the number of object +records in the container database increases. Eventually the container database +performance starts to degrade and the time taken to update an object record +increases. This can result in object updates timing out, with a corresponding +increase in the backlog of pending :ref:`asynchronous updates +<architecture_updaters>` on object servers. Container databases are typically +replicated on several nodes and any database performance degradation can also +result in longer :doc:`container replication <overview_replication>` times. + +The point at which container database performance starts to degrade depends +upon the choice of hardware in the container ring. Anecdotal evidence suggests +that containers with tens of millions of object records have noticeably +degraded performance. + +This performance degradation can be avoided by ensuring that clients use an +object naming scheme that disperses objects across a number of containers +thereby distributing load across a number of container databases. However, that +is not always desirable nor is it under the control of the cluster operator. + +Swift's container sharding feature provides the operator with a mechanism to +distribute the load on a single client-visible container across multiple, +hidden, shard containers, each of which stores a subset of the container's +object records. Clients are unaware of container sharding; clients continue to +use the same API to access a container that, if sharded, maps to a number of +shard containers within the Swift cluster.
+ +------------------------ +Deployment and operation +------------------------ + +Upgrade Considerations +---------------------- + +It is essential that all servers in a Swift cluster have been upgraded to +support the container sharding feature before attempting to shard a container. + +Identifying containers in need of sharding +------------------------------------------ + +Container sharding is currently initiated by the ``swift-manage-shard-ranges`` +CLI tool :ref:`described below <swift-manage-shard-ranges>`. Operators must +first identify containers that are candidates for sharding. To assist with +this, the :ref:`sharder_daemon` inspects the size of containers that it visits +and writes a list of sharding candidates to recon cache. For example:: + + "sharding_candidates": { + "found": 1, + "top": [ + { + "account": "AUTH_test", + "container": "c1", + "file_size": 497763328, + "meta_timestamp": "1525346445.31161", + "node_index": 2, + "object_count": 3349028, + "path": <path_to_db>, + "root": "AUTH_test/c1" + } + ] + } + +A container is considered to be a sharding candidate if its object count is +greater than or equal to the ``shard_container_threshold`` option. +The number of candidates reported is limited to a number configured by the +``recon_candidates_limit`` option such that only the largest candidate +containers are included in the ``sharding_candidates`` data. + + +.. _swift-manage-shard-ranges: + +``swift-manage-shard-ranges`` CLI tool +-------------------------------------- + +The ``swift-manage-shard-ranges`` tool provides commands for initiating +sharding of a container. ``swift-manage-shard-ranges`` operates directly on a +container database file. + +.. note:: + + ``swift-manage-shard-ranges`` must only be used on one replica of a + container database to avoid inconsistent results. The modifications made by + ``swift-manage-shard-ranges`` will be automatically copied to other + replicas of the container database via normal replication processes.
+ +There are three steps in the process of initiating sharding, each of which may +be performed in isolation or, as shown below, using a single command. + +#. The ``find`` sub-command scans the container database to identify how many + shard containers will be required and which objects they will manage. Each + shard container manages a range of the object namespace defined by a + ``lower`` and ``upper`` bound. The maximum number of objects to be allocated + to each shard container is specified on the command line. For example:: + + $ swift-manage-shard-ranges <path_to_db> find 500000 + Loaded db broker for AUTH_test/c1. + [ + { + "index": 0, + "lower": "", + "object_count": 500000, + "upper": "o_01086834" + }, + { + "index": 1, + "lower": "o_01086834", + "object_count": 500000, + "upper": "o_01586834" + }, + { + "index": 2, + "lower": "o_01586834", + "object_count": 500000, + "upper": "o_02087570" + }, + { + "index": 3, + "lower": "o_02087570", + "object_count": 500000, + "upper": "o_02587572" + }, + { + "index": 4, + "lower": "o_02587572", + "object_count": 500000, + "upper": "o_03087572" + }, + { + "index": 5, + "lower": "o_03087572", + "object_count": 500000, + "upper": "o_03587572" + }, + { + "index": 6, + "lower": "o_03587572", + "object_count": 349194, + "upper": "" + } + ] + Found 7 ranges in 4.37222s (total object count 3349194) + + This command returns a list of shard ranges each of which describes the + namespace to be managed by a shard container. No other action is taken by + this command and the container database is unchanged. The output may be + redirected to a file for subsequent retrieval by the ``replace`` command. + For example:: + + $ swift-manage-shard-ranges <path_to_db> find 500000 > my_shard_ranges + Loaded db broker for AUTH_test/c1. + Found 7 ranges in 2.448s (total object count 3349194) + +#. The ``replace`` sub-command deletes any shard ranges that might already be + in the container database and inserts shard ranges from a given file.
The + file contents should be in the format generated by the ``find`` sub-command. + For example:: + + $ swift-manage-shard-ranges <path_to_db> replace my_shard_ranges + Loaded db broker for AUTH_test/c1. + No shard ranges found to delete. + Injected 7 shard ranges. + Run container-replicator to replicate them to other nodes. + Use the enable sub-command to enable sharding. + + The container database is modified to store the shard ranges, but the + container will not start sharding until sharding is enabled. The ``info`` + sub-command may be used to inspect the state of the container database at + any point, and the ``show`` sub-command may be used to display the inserted + shard ranges. + + Shard ranges stored in the container database may be replaced using the + ``replace`` sub-command. This will first delete all existing shard ranges + before storing new shard ranges. Shard ranges may also be deleted from the + container database using the ``delete`` sub-command. + + Shard ranges should not be replaced or deleted using + ``swift-manage-shard-ranges`` once the next step of enabling sharding has + been taken. + +#. The ``enable`` sub-command enables the container for sharding. The sharder + daemon and/or container replicator daemon will replicate shard ranges to + other replicas of the container db and the sharder daemon will proceed to + shard the container. This process may take some time depending on the size + of the container, the number of shard ranges and the underlying hardware. + +.. note:: + + Once the ``enable`` sub-command has been used there is no supported + mechanism to revert sharding. Do not use ``swift-manage-shard-ranges`` to + make any further changes to the shard ranges in the container db. + + For example:: + + $ swift-manage-shard-ranges <path_to_db> enable + Loaded db broker for AUTH_test/c1. + Container moved to state 'sharding' with epoch 1525345093.22908. + Run container-sharder on all nodes to shard the container.
+ + This does not shard the container - sharding is performed by the + :ref:`sharder_daemon` - but sets the necessary state in the database for the + daemon to subsequently start the sharding process. + + The ``epoch`` value displayed in the output is the time at which sharding + was enabled. When the :ref:`sharder_daemon` starts sharding this container + it creates a new container database file using the epoch in the filename to + distinguish it from the retiring DB that is being sharded. + +All three steps may be performed with one sub-command:: + + $ swift-manage-shard-ranges <path_to_db> find_and_replace 500000 --enable --force + Loaded db broker for AUTH_test/c1. + No shard ranges found to delete. + Injected 7 shard ranges. + Run container-replicator to replicate them to other nodes. + Container moved to state 'sharding' with epoch 1525345669.46153. + Run container-sharder on all nodes to shard the container. + +.. _sharder_daemon: + +``container-sharder`` daemon +---------------------------- + +Once sharding has been enabled for a container, the act of sharding is +performed by the :ref:`container-sharder`. The :ref:`container-sharder` daemon +must be running on all container servers. The ``container-sharder`` daemon +periodically visits each container database to perform any container sharding +tasks that are required. + +The ``container-sharder`` daemon requires a ``[container-sharder]`` config +section to exist in the container server configuration file; a sample config +section is shown in the `container-server.conf-sample` file. + +.. note:: + + Several of the ``[container-sharder]`` config options are only significant + when the ``auto_shard`` option is enabled. This option enables the + ``container-sharder`` daemon to automatically identify containers that are + candidates for sharding and initiate the sharding process, instead of using + the ``swift-manage-shard-ranges`` tool.
The ``auto_shard`` option is + currently NOT recommended for production systems and should be set to + ``false`` (the default value). + +The container sharder uses an internal client and therefore requires an +internal client configuration file to exist. By default the internal-client +configuration file is expected to be found at +`/etc/swift/internal-client.conf`. An alternative location for the +configuration file may be specified using the ``internal_client_conf_path`` +option in the ``[container-sharder]`` config section. + +The content of the internal-client configuration file should be the same as the +`internal-client.conf-sample` file. In particular, the internal-client +configuration should have:: + + account_autocreate = True + +in the ``[proxy-server]`` section. + +A container database may require several visits by the ``container-sharder`` +daemon before it is fully sharded. On each visit the ``container-sharder`` +daemon will move a subset of object records to new shard containers by cleaving +new shard container databases from the original. By default, two shards are +processed per visit; this number may be configured by the ``cleave_batch_size`` +option. + +The ``container-sharder`` daemon periodically writes progress data for +containers that are being sharded to recon cache. For example:: + + "sharding_in_progress": { + "all": [ + { + "account": "AUTH_test", + "active": 0, + "cleaved": 2, + "container": "c1", + "created": 5, + "db_state": "sharding", + "error": null, + "file_size": 26624, + "found": 0, + "meta_timestamp": "1525349617.46235", + "node_index": 1, + "object_count": 3349030, + "path": <path_to_db>, + "root": "AUTH_test/c1", + "state": "sharding" + } + ] + } + +This example indicates that from a total of 7 shard ranges, 2 have been cleaved +whereas 5 remain in created state waiting to be cleaved. + +Shard containers are created in an internal account and not visible to clients.
+By default, shard containers for an account ``AUTH_test`` are created in the +internal account ``.shards_AUTH_test``. + +Once a container has started sharding, object updates to that container may be +redirected to the shard container. The ``container-sharder`` daemon is also +responsible for sending updates of a shard's object count and bytes_used to the +original container so that aggregate object count and bytes used values can be +returned in responses to client requests. + +.. note:: + + The ``container-sharder`` daemon must continue to run on all container + servers in order for shard object stats updates to be generated. + + +-------------- +Under the hood +-------------- + +Terminology +----------- + +================== ================================================== +Name Description +================== ================================================== +Root container The original container that lives in the + user's account. It holds references to its + shard containers. +Retiring DB The original database file that is to be sharded. +Fresh DB A database file that will replace the retiring + database. +Shard range A range of the object namespace defined by a lower + bound and an upper bound. +Shard container A container that holds object records for a shard + range. Shard containers exist in a hidden account + mirroring the user's account. +Misplaced objects Items that don't belong in a container's shard + range. These will be moved to their correct + location by the container-sharder. +Cleaving The act of moving object records within a shard + range to a shard container database. +Shrinking The act of merging a small shard container into + another shard container in order to delete the + small shard container. +Donor The shard range that is shrinking away. +Acceptor The shard range into which a donor is merged.
+================== ================================================== + + +Finding shard ranges +-------------------- + +The end goal of sharding a container is to replace the original container +database which has grown very large with a number of shard container databases, +each of which is responsible for storing a range of the entire object +namespace. The first step towards achieving this is to identify an appropriate +set of contiguous object namespaces, known as shard ranges, each of which +contains a similar sized portion of the container's current object content. + +Shard ranges cannot simply be selected by sharding the namespace uniformly, +because object names are not guaranteed to be distributed uniformly. If the +container were naively sharded into two shard ranges, one containing all +object names up to `m` and the other containing all object names beyond `m`, +then if all object names actually start with `o` the outcome would be an +extremely unbalanced pair of shard containers. + +It is also too simplistic to assume that every container that requires sharding +can be sharded into two. This might be the goal in the ideal world, but in +practice there will be containers that have grown very large and should be +sharded into many shards. Furthermore, the time required to find the exact +mid-point of the existing object names in a large SQLite database would +increase with container size. + +For these reasons, shard ranges of size `N` are found by searching for the +`Nth` object in the database table, sorted by object name, and then searching +for the `(2 * N)th` object, and so on until all objects have been searched. For +a container that has exactly `2N` objects, the end result is the same as +sharding the container at the midpoint of its object names. In practice +sharding would typically be enabled for containers with greater than `2N` objects +and more than two shard ranges will be found, the last one probably containing +less than `N` objects.
With containers having large multiples of `N` objects, +shard ranges can be identified in batches which enables a more scalable solution. + +To illustrate this process, consider a very large container in a user account +``acct`` that is a candidate for sharding: + +.. image:: images/sharding_unsharded.svg + +The :ref:`swift-manage-shard-ranges` tool ``find`` sub-command searches the +object table for the `Nth` object whose name will become the upper bound of the +first shard range, and the lower bound of the second shard range. The lower +bound of the first shard range is the empty string. + +For the purposes of this example the first upper bound is `cat`: + +.. image:: images/sharding_scan_basic.svg + +:ref:`swift-manage-shard-ranges` continues to search the container to find +further shard ranges, with the final upper bound also being the empty string. + +Enabling sharding +----------------- + +Once shard ranges have been found the :ref:`swift-manage-shard-ranges` +``replace`` sub-command is used to insert them into the `shard_ranges` table +of the container database. In addition to its lower and upper bounds, each +shard range is given a name. The name takes the form ``a/c`` where ``a`` is an +account name formed by prefixing the user account with the string +``.shards_``, and ``c`` is a container name that is derived from the original +container and includes the index of the shard range. The final container name +for the shard range uses the pattern of ``{original container name}-{hash of +parent container}-{timestamp}-{shard index}``. + +The ``enable`` sub-command then creates some final state required to initiate +sharding the container, including a special shard range record referred to as +the container's `own_shard_range` whose name is equal to the container's path. +This is used to keep a record of the object namespace that the container +covers, which for user containers is always the entire namespace.
+ +The :class:`~swift.common.utils.ShardRange` class +------------------------------------------------- + +The :class:`~swift.common.utils.ShardRange` class provides methods for +interacting with the attributes and state of a shard range. The class +encapsulates the following properties: + +* The name of the shard range which is also the name of the shard container + used to hold object records in its namespace. +* Lower and upper bounds which define the object namespace of the shard range. +* A deleted flag. +* A timestamp at which the bounds and deleted flag were last modified. +* The object stats for the shard range i.e. object count and bytes used. +* A timestamp at which the object stats were last modified. +* The state of the shard range, and an epoch, which is the timestamp used in + the shard container's database file name. +* A timestamp at which the state and epoch were last modified. + +A shard range progresses through the following states: + +* FOUND: the shard range has been identified in the container that is to be + sharded but no resources have been created for it. +* CREATED: A shard container has been created to store the contents of the + shard range. +* CLEAVED: the sharding container's contents for the shard range have been + copied to the shard container from *at least one replica* of the sharding + container. +* ACTIVE: shard ranges move to this state when all shard ranges in a sharding + container have been cleaved. +* SHRINKING: the shard range has been enabled for shrinking; or +* SHARDING: the shard range has been enabled for sharding. +* SHARDED: the shard range has completed sharding or shrinking. + +.. note:: + + Shard range state represents the most advanced state of the shard range on + any replica of the container. For example, a shard range in CLEAVED state + may not have completed cleaving on all replicas but has cleaved on at least + one replica.
+ +Fresh and retiring database files +--------------------------------- + +As alluded to earlier, writing to a large container causes increased latency +for the container servers. Once sharding has been initiated on a container it +is desirable to stop writing to the large database; ultimately it will be +unlinked. This is primarily achieved by redirecting object updates to new shard +containers as they are created (see :ref:`redirecting_updates` below), but some +object updates may still need to be accepted by the root container and other +container metadata must still be modifiable. + +To render the large `retiring` database effectively read-only, when the +:ref:`sharder_daemon` finds a container with a set of shard range records, +including an `own_shard_range`, it first creates a fresh database file which +will ultimately replace the existing `retiring` database. For a retiring db +whose filename is:: + + <hash>.db + +the fresh database file name is of the form:: + + <hash>_<epoch>.db + +where epoch is a timestamp stored in the container's `own_shard_range`. + +The fresh DB has a copy of the shard ranges table from the retiring DB and all +other container metadata apart from the object records. Once a fresh DB file +has been created it is used to store any new object updates and no more object +records are written to the retiring DB file. + +Once the sharding process has completed, the retiring DB file will be unlinked +leaving only the fresh DB file in the container's directory. There are +therefore three states that the container DB directory may be in during the +sharding process: UNSHARDED, SHARDING and SHARDED. + +.. image:: images/sharding_db_states.svg + +If the container ever shrinks to the point that it has no shards then the fresh +DB starts to store object records, behaving the same as an unsharded container. +This is known as the COLLAPSED state.
+ +In summary, the DB states that any container replica may be in are: + +- UNSHARDED - In this state there is just one standard container database. All + containers are originally in this state. +- SHARDING - There are now two databases, the retiring database and a fresh + database. The fresh database stores any metadata, container level stats, + an object holding table, and a table that stores shard ranges. +- SHARDED - There is only one database, the fresh database, which has one or + more shard ranges in addition to its own shard range. The retiring database + has been unlinked. +- COLLAPSED - There is only one database, the fresh database, which has only + its own shard range and stores object records. + +.. note:: + + DB state is unique to each replica of a container and is not necessarily + synchronised with shard range state. + + +Creating shard containers +------------------------- + +The :ref:`sharder_daemon` next creates a shard container for each shard range +using the shard range name as the name of the shard container: + +.. image:: /images/sharding_cleave_basic.svg + +Shard containers now exist with a unique name and placed in a hidden account +that maps to the user account (`.shards_acct`). This avoids namespace +collisions and also keeps all the shard containers out of view from users of +the account. Each shard container has an `own_shard_range` record which has the +lower and upper bounds of the object namespace for which it is responsible, and +a reference to the sharding user container, which is referred to as the +`root_container`. Unlike the `root_container`, the shard container's +`own_shard_range` does not cover the entire namespace. + +Cleaving shard containers +------------------------- + +Having created empty shard containers the sharder daemon will proceed to cleave +objects from the retiring database to each shard range.
Cleaving occurs in +batches of two (by default) shard ranges, so if a container has more than two +shard ranges then the daemon must visit it multiple times to complete cleaving. + +To cleave a shard range the daemon creates a shard database for the shard +container on a local device. This device may be one of the shard container's +primary nodes but often it will not. Object records from the corresponding +shard range namespace are then copied from the retiring DB to this shard DB. + +Swift's container replication mechanism is then used to replicate the shard DB +to its primary nodes. Checks are made to ensure that the new shard container DB +has been replicated to a sufficient number of its primary nodes before it is +considered to have been successfully cleaved. By default the daemon requires +successful replication of a new shard broker to at least a quorum of the +container ring's replica count, but this requirement can be tuned using the +``shard_replication_quorum`` option. + +Once a shard range has been successfully cleaved from a retiring database the +daemon transitions its state to ``CLEAVED``. It should be noted that this state +transition occurs as soon as any one of the retiring DB replicas has cleaved +the shard range, and therefore does not imply that all retiring DB replicas +have cleaved that range. The significance of the state transition is that the +shard container is now considered suitable for contributing to object listings, +since its contents are present on a quorum of its primary nodes and are the +same as at least one of the retiring DBs for that namespace. + +Once a shard range is in the ``CLEAVED`` state, the requirement for +'successful' cleaving of other instances of the retiring DB may optionally be +relaxed since it is not so imperative that their contents are replicated +*immediately* to their primary nodes. 
The ``existing_shard_replication_quorum`` +option can be used to reduce the quorum required for a cleaved shard range to +be considered successfully replicated by the sharder daemon. + +.. note:: + + Once cleaved, shard container DBs will continue to be replicated by the + normal `container-replicator` daemon so that they will eventually be fully + replicated to all primary nodes regardless of any replication quorum options + used by the sharder daemon. + +The cleaving progress of each replica of a retiring DB must be +tracked independently of the shard range state. This is done using a per-DB +CleavingContext object that maintains a cleaving cursor for the retiring DB +that it is associated with. The cleaving cursor is simply the upper bound of +the last shard range to have been cleaved *from that particular retiring DB*. + +Each CleavingContext is stored in the sharding container's sysmeta under a key +that is the ``id`` of the retiring DB. Since all container DB files have unique +``id``\ s, this guarantees that each retiring DB will have a unique +CleavingContext. Furthermore, if the retiring DB file is changed, for example +by an rsync_then_merge replication operation which might change the contents of +the DB's object table, then it will get a new unique CleavingContext. + +A CleavingContext maintains other state that is used to ensure that a retiring +DB is only considered to be fully cleaved, and ready to be deleted, if *all* of +its object rows have been cleaved to a shard range. + +Once all shard ranges have been cleaved from the retiring DB it is deleted. The +container is now represented by the fresh DB which has a table of shard range +records that point to the shard containers that store the container's object +records. + +.. 
_redirecting_updates: + +Redirecting object updates +-------------------------- + +Once a shard container exists, object updates arising from new client requests +and async pending files are directed to the shard container instead of the root +container. This takes load off of the root container. + +For a sharded (or partially sharded) container, when the proxy receives a new +object request it issues a GET request to the container for data describing a +shard container to which the object update should be sent. The proxy then +annotates the object request with the shard container location so that the +object server will forward object updates to the shard container. If those +updates fail then the async pending file that is written on the object server +contains the shard container location. + +When the object updater processes async pending files for previously failed +object updates, it may not find a shard container location. In this case the +updater sends the update to the `root container`, which returns a redirection +response with the shard container location. + +.. note:: + + Object updates are directed to shard containers as soon as they exist, even + if the retiring DB object records have not yet been cleaved to the shard + container. This prevents further writes to the retiring DB and also avoids + the fresh DB being polluted by new object updates. The goal is to + ultimately have all object records in the shard containers and none in the + root container. + +Building container listings +--------------------------- + +Listing requests for a sharded container are handled by querying the shard +containers for components of the listing. The proxy forwards the client listing +request to the root container, as it would for an unsharded container, but the +container server responds with a list of shard ranges rather than objects. 
The +proxy then queries each shard container in namespace order for their listing, +until either the listing length limit is reached or all shard ranges have been +listed. + +While a container is still in the process of sharding, only *cleaved* shard +ranges are used when building a container listing. Shard ranges that have not +yet cleaved will not have any object records from the root container. The root +container continues to provide listings for the uncleaved part of its +namespace. + +.. note:: + + New object updates are redirected to shard containers that have not yet been + cleaved. These updates will therefore not be included in container listings + until their shard range has been cleaved. + +Example request redirection +--------------------------- + +As an example, consider a sharding container in which 3 shard ranges have been +found ending in cat, giraffe and igloo. Their respective shard containers have +been created so update requests for objects up to "igloo" are redirected to the +appropriate shard container. The root DB continues to handle listing requests +and update requests for any object name beyond "igloo". + +.. image:: images/sharding_scan_load.svg + +The sharder daemon cleaves objects from the retiring DB to the shard range DBs; +it also moves any misplaced objects from the root container's fresh DB to the +shard DB. Cleaving progress is represented by the blue line. Once the first +shard range has been cleaved listing requests for that namespace are directed +to the shard container. The root container still provides listings for the +remainder of the namespace. + +.. image:: images/sharding_cleave1_load.svg + +The process continues: the sharder cleaves the next range and a new range is +found with upper bound of "linux". Now the root container only needs to handle +listing requests up to "giraffe" and update requests for objects whose name is +greater than "linux". 
Load will continue to diminish on the root DB and be +dispersed across the shard DBs. + +.. image:: images/sharding_cleave2_load.svg + + +Container replication +--------------------- + +Shard range records are replicated between container DB replicas in much the +same way as object records are for unsharded containers. However, the usual +replication of object records between replicas of a container is halted as soon +as a container is capable of being sharded. Instead, object records are moved +to their new locations in shard containers. This avoids unnecessary replication +traffic between container replicas. + +To facilitate this, shard ranges are both 'pushed' and 'pulled' during +replication, prior to any attempt to replicate objects. This means that the +node initiating replication learns about shard ranges from the destination node +early during the replication process and is able to skip object replication if +it discovers that it has shard ranges and is able to shard. + +.. note:: + + When the destination DB for container replication is missing then the + 'complete_rsync' replication mechanism is still used and in this case + both object records and shard range records are copied to the destination + node. + +Container deletion +------------------ + +Sharded containers may be deleted by a ``DELETE`` request just like an +unsharded container. A sharded container must be empty before it can be deleted +which implies that all of its shard containers must have reported that they are +empty. + +Shard containers are *not* immediately deleted when their root container is +deleted; the shard containers remain undeleted so that they are able to +continue to receive object updates that might arrive after the root container +has been deleted. Shard containers continue to update their deleted root +container with their object stats. 
If a shard container does receive object +updates that cause it to no longer be empty then the root container will no +longer be considered deleted once that shard container sends an object stats +update. + + +Sharding a shard container +-------------------------- + +A shard container may grow to a size that requires it to be sharded. +``swift-manage-shard-ranges`` may be used to identify shard ranges within a +shard container and enable sharding in the same way as for a root container. +When a shard is sharding it notifies the root of its shard ranges so that the +root can start to redirect object updates to the new 'sub-shards'. When the +shard has completed sharding the root is aware of all the new sub-shards and +the sharding shard deletes its shard range record in the root container shard +ranges table. At this point the root is aware of all the new sub-shards which +collectively cover the namespace of the now-deleted shard. + +There is no hierarchy of shards beyond the root and its immediate shards. When +a shard shards, its sub-shards are effectively re-parented with the root +container. + + +Shrinking a shard container +--------------------------- + +A shard's contents may reduce to a point where the shard is no longer required. +If this happens then the shard may be shrunk into another shard range. +Shrinking is achieved in a similar way to sharding: an 'acceptor' shard range +is written to the shrinking shard container's shard ranges table; unlike +sharding, where shard ranges each cover a subset of the sharding container's +namespace, the acceptor shard range is a superset of the shrinking shard range. + +Once given an acceptor shard range the shrinking shard will cleave itself to +its acceptor, and then delete itself from the root container shard ranges +table.