Add databases_per_second to db daemons

Most daemons have a "go as fast as you can, then sleep for 30 seconds"
strategy toward resource utilization; the object-updater and
object-auditor, however, have some "X_per_second" options that allow
operators much better control over how they spend their I/O budget.

This change extends that pattern into the account-replicator,
container-replicator, and container-sharder, which have been known to peg
CPUs when they're not I/O-limited.

Partial-Bug: #1784753
Change-Id: Ib7f2497794fa2f384a1a6ab500b657c624426384
This commit is contained in:
Clay Gerrard 2018-10-29 14:49:48 -05:00 committed by Tim Burke
parent 24bf5eea8c
commit 06cf5d298f
7 changed files with 253 additions and 178 deletions

View File

@ -1173,9 +1173,9 @@ ionice_priority None I/O scheduling priority of ser
[container-replicator] [container-replicator]
********************** **********************
================== =========================== ============================= ==================== =========================== =============================
Option Default Description Option Default Description
------------------ --------------------------- ----------------------------- -------------------- --------------------------- -----------------------------
log_name container-replicator Label used when logging log_name container-replicator Label used when logging
log_facility LOG_LOCAL0 Syslog log facility log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level log_level INFO Logging level
@ -1201,6 +1201,10 @@ concurrency 8 Number of replication workers
to spawn to spawn
interval 30 Time in seconds to wait interval 30 Time in seconds to wait
between replication passes between replication passes
databases_per_second 50 Maximum databases to process
per second. Should be tuned
according to individual
system specs. 0 is unlimited.
node_timeout 10 Request timeout to external node_timeout 10 Request timeout to external
services services
conn_timeout 0.5 Connection timeout to external conn_timeout 0.5 Connection timeout to external
@ -1260,7 +1264,7 @@ ionice_priority None I/O scheduling priority of
Work only with ionice_class. Work only with ionice_class.
Ignored if IOPRIO_CLASS_IDLE Ignored if IOPRIO_CLASS_IDLE
is set. is set.
================== =========================== ============================= ==================== =========================== =============================
******************* *******************
[container-updater] [container-updater]
@ -1524,9 +1528,9 @@ ionice_priority None I/O scheduling priority of server
[account-replicator] [account-replicator]
******************** ********************
================== ========================= =============================== ==================== ========================= ===============================
Option Default Description Option Default Description
------------------ ------------------------- ------------------------------- -------------------- ------------------------- -------------------------------
log_name account-replicator Label used when logging log_name account-replicator Label used when logging
log_facility LOG_LOCAL0 Syslog log facility log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level log_level INFO Logging level
@ -1551,6 +1555,10 @@ concurrency 8 Number of replication workers
to spawn to spawn
interval 30 Time in seconds to wait between interval 30 Time in seconds to wait between
replication passes replication passes
databases_per_second 50 Maximum databases to process
per second. Should be tuned
according to individual
system specs. 0 is unlimited.
node_timeout 10 Request timeout to external node_timeout 10 Request timeout to external
services services
conn_timeout 0.5 Connection timeout to external conn_timeout 0.5 Connection timeout to external
@ -1606,7 +1614,7 @@ ionice_priority None I/O scheduling priority of server
Work only with ionice_class. Work only with ionice_class.
Ignored if IOPRIO_CLASS_IDLE Ignored if IOPRIO_CLASS_IDLE
is set. is set.
================== ========================= =============================== ==================== ========================= ===============================
***************** *****************
[account-auditor] [account-auditor]

View File

@ -143,6 +143,9 @@ use = egg:swift#recon
# run_pause is deprecated, use interval instead # run_pause is deprecated, use interval instead
# run_pause = 30 # run_pause = 30
# #
# Process at most this many databases per second (0 means unlimited)
# databases_per_second = 50
#
# node_timeout = 10 # node_timeout = 10
# conn_timeout = 0.5 # conn_timeout = 0.5
# #

View File

@ -156,6 +156,9 @@ use = egg:swift#recon
# run_pause is deprecated, use interval instead # run_pause is deprecated, use interval instead
# run_pause = 30 # run_pause = 30
# #
# Process at most this many databases per second (0 means unlimited)
# databases_per_second = 50
#
# node_timeout = 10 # node_timeout = 10
# conn_timeout = 0.5 # conn_timeout = 0.5
# #
@ -436,6 +439,9 @@ use = egg:swift#xprofile
# Time in seconds to wait between sharder cycles # Time in seconds to wait between sharder cycles
# interval = 30 # interval = 30
# #
# Process at most this many databases per second (0 means unlimited)
# databases_per_second = 50
#
# The container-sharder accepts the following configuration options as defined # The container-sharder accepts the following configuration options as defined
# in the container-replicator section: # in the container-replicator section:
# #

View File

@ -33,7 +33,7 @@ from swift.common.utils import get_logger, whataremyips, storage_directory, \
renamer, mkdirs, lock_parent_directory, config_true_value, \ renamer, mkdirs, lock_parent_directory, config_true_value, \
unlink_older_than, dump_recon_cache, rsync_module_interpolation, \ unlink_older_than, dump_recon_cache, rsync_module_interpolation, \
json, parse_override_options, round_robin_iter, Everything, get_db_files, \ json, parse_override_options, round_robin_iter, Everything, get_db_files, \
parse_db_filename, quote parse_db_filename, quote, RateLimitedIterator
from swift.common import ring from swift.common import ring
from swift.common.ring.utils import is_local_device from swift.common.ring.utils import is_local_device
from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \ from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \
@ -204,6 +204,8 @@ class Replicator(Daemon):
' to use option %(type)s-replicator/' ' to use option %(type)s-replicator/'
'interval.' 'interval.'
% {'type': self.server_type}) % {'type': self.server_type})
self.databases_per_second = int(
conf.get('databases_per_second', 50))
self.node_timeout = float(conf.get('node_timeout', 10)) self.node_timeout = float(conf.get('node_timeout', 10))
self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.conn_timeout = float(conf.get('conn_timeout', 0.5))
self.rsync_compress = config_true_value( self.rsync_compress = config_true_value(
@ -733,6 +735,11 @@ class Replicator(Daemon):
def report_up_to_date(self, full_info): def report_up_to_date(self, full_info):
return True return True
def roundrobin_datadirs(self, dirs):
    """
    Return a rate-limited iterator over the databases found in ``dirs``.

    Delegates to the module-level ``roundrobin_datadirs`` function and
    wraps its result in a ``RateLimitedIterator`` capped at
    ``self.databases_per_second`` elements per second.

    :param dirs: passed through unchanged to the module-level
                 ``roundrobin_datadirs``
    :returns: a ``RateLimitedIterator`` over whatever the module-level
              ``roundrobin_datadirs`` yields
    """
    # Inside a method body the bare name resolves to the module-level
    # function, not to this method, so this is not a recursive call.
    unthrottled = roundrobin_datadirs(dirs)
    return RateLimitedIterator(
        unthrottled, elements_per_second=self.databases_per_second)
def run_once(self, *args, **kwargs): def run_once(self, *args, **kwargs):
"""Run a replication pass once.""" """Run a replication pass once."""
override_options = parse_override_options(once=True, **kwargs) override_options = parse_override_options(once=True, **kwargs)
@ -789,7 +796,7 @@ class Replicator(Daemon):
"file, not replicating", "file, not replicating",
", ".join(ips), self.port) ", ".join(ips), self.port)
self.logger.info(_('Beginning replication run')) self.logger.info(_('Beginning replication run'))
for part, object_file, node_id in roundrobin_datadirs(dirs): for part, object_file, node_id in self.roundrobin_datadirs(dirs):
self.cpool.spawn_n( self.cpool.spawn_n(
self._replicate_object, part, object_file, node_id) self._replicate_object, part, object_file, node_id)
self.cpool.waitall() self.cpool.waitall()

View File

@ -23,7 +23,7 @@ import os
import six import six
from eventlet import Timeout from eventlet import Timeout
from swift.common import internal_client, db_replicator from swift.common import internal_client
from swift.common.constraints import check_drive from swift.common.constraints import check_drive
from swift.common.direct_client import (direct_put_container, from swift.common.direct_client import (direct_put_container,
DirectClientException) DirectClientException)
@ -1500,7 +1500,7 @@ class ContainerSharder(ContainerReplicator):
dirs.append((datadir, node, part_filt)) dirs.append((datadir, node, part_filt))
if not dirs: if not dirs:
self.logger.warning('Found no data dirs!') self.logger.warning('Found no data dirs!')
for part, path, node in db_replicator.roundrobin_datadirs(dirs): for part, path, node in self.roundrobin_datadirs(dirs):
# NB: get_part_nodes always provides an 'index' key; # NB: get_part_nodes always provides an 'index' key;
# this will be used in leader selection # this will be used in leader selection
for primary in self.ring.get_part_nodes(int(part)): for primary in self.ring.get_part_nodes(int(part)):

View File

@ -321,6 +321,7 @@ class TestDBReplicator(unittest.TestCase):
# later config should be extended to assert more config options # later config should be extended to assert more config options
replicator = TestReplicator({'node_timeout': '3.5'}) replicator = TestReplicator({'node_timeout': '3.5'})
self.assertEqual(replicator.node_timeout, 3.5) self.assertEqual(replicator.node_timeout, 3.5)
self.assertEqual(replicator.databases_per_second, 50)
def test_repl_connection(self): def test_repl_connection(self):
node = {'replication_ip': '127.0.0.1', 'replication_port': 80, node = {'replication_ip': '127.0.0.1', 'replication_port': 80,

View File

@ -128,6 +128,7 @@ class TestSharder(BaseTestSharder):
expected = { expected = {
'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201, 'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201,
'per_diff': 1000, 'max_diffs': 100, 'interval': 30, 'per_diff': 1000, 'max_diffs': 100, 'interval': 30,
'databases_per_second': 50,
'cleave_row_batch_size': 10000, 'cleave_row_batch_size': 10000,
'node_timeout': 10, 'conn_timeout': 5, 'node_timeout': 10, 'conn_timeout': 5,
'rsync_compress': False, 'rsync_compress': False,
@ -154,6 +155,7 @@ class TestSharder(BaseTestSharder):
conf = { conf = {
'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010, 'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010,
'per_diff': 2000, 'max_diffs': 200, 'interval': 60, 'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
'databases_per_second': 5,
'cleave_row_batch_size': 3000, 'cleave_row_batch_size': 3000,
'node_timeout': 20, 'conn_timeout': 1, 'node_timeout': 20, 'conn_timeout': 1,
'rsync_compress': True, 'rsync_compress': True,
@ -176,6 +178,7 @@ class TestSharder(BaseTestSharder):
expected = { expected = {
'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010, 'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010,
'per_diff': 2000, 'max_diffs': 200, 'interval': 60, 'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
'databases_per_second': 5,
'cleave_row_batch_size': 3000, 'cleave_row_batch_size': 3000,
'node_timeout': 20, 'conn_timeout': 1, 'node_timeout': 20, 'conn_timeout': 1,
'rsync_compress': True, 'rsync_compress': True,
@ -485,7 +488,7 @@ class TestSharder(BaseTestSharder):
0, 'text/plain', 'etag', 0) 0, 'text/plain', 'etag', 0)
# check only sharding enabled containers are processed # check only sharding enabled containers are processed
with mock.patch.object( with mock.patch('eventlet.sleep'), mock.patch.object(
sharder, '_process_broker' sharder, '_process_broker'
) as mock_process_broker: ) as mock_process_broker:
sharder._local_device_ids = {'stale_node_id'} sharder._local_device_ids = {'stale_node_id'}
@ -539,7 +542,7 @@ class TestSharder(BaseTestSharder):
"for %s" % broker.path) "for %s" % broker.path)
# check exceptions are handled # check exceptions are handled
with mock.patch.object( with mock.patch('eventlet.sleep'), mock.patch.object(
sharder, '_process_broker', side_effect=mock_processing sharder, '_process_broker', side_effect=mock_processing
) as mock_process_broker: ) as mock_process_broker:
sharder._local_device_ids = {'stale_node_id'} sharder._local_device_ids = {'stale_node_id'}
@ -593,7 +596,7 @@ class TestSharder(BaseTestSharder):
for i in range(10): for i in range(10):
brokers[1].delete_object( brokers[1].delete_object(
'o%s' % i, next(self.ts_iter).internal) 'o%s' % i, next(self.ts_iter).internal)
with mock.patch.object( with mock.patch('eventlet.sleep'), mock.patch.object(
sharder, '_process_broker' sharder, '_process_broker'
) as mock_process_broker: ) as mock_process_broker:
sharder._local_device_ids = {999} sharder._local_device_ids = {999}
@ -612,6 +615,53 @@ class TestSharder(BaseTestSharder):
expected_candidate_stats, sharder, 'sharding_candidates') expected_candidate_stats, sharder, 'sharding_candidates')
self._assert_recon_stats(None, sharder, 'sharding_progress') self._assert_recon_stats(None, sharder, 'sharding_progress')
def test_ratelimited_roundrobin(self):
    """
    Check that sharder.roundrobin_datadirs honours databases_per_second.

    Real time never passes: time.time and eventlet.sleep are replaced by
    fakes that share a ``clock`` dict, and the total "run time" is taken
    to be the sum of all requested sleep durations.
    """
    n_databases = 100

    def stub_iter(dirs):
        # Stand-in for db_replicator.roundrobin_datadirs; ``dirs`` is
        # ignored. n_databases is looked up when the generator runs, so
        # rebinding it below changes how many items the second run yields.
        for i in range(n_databases):
            yield i, '/srv/node/sda/path/to/container.db', {}

    now = time.time()
    clock = {
        'sleeps': [],
        'now': now,
    }

    def fake_sleep(t):
        # Record the requested sleep and advance the fake clock by it.
        # ``clock`` is resolved at call time, so the fresh dict bound
        # before the second run is the one that gets updated there.
        clock['sleeps'].append(t)
        clock['now'] += t

    def fake_time():
        return clock['now']

    with self._mock_sharder({'databases_per_second': 1}) as sharder, \
            mock.patch('swift.common.db_replicator.roundrobin_datadirs',
                       stub_iter), \
            mock.patch('time.time', fake_time), \
            mock.patch('eventlet.sleep', fake_sleep):
        # Drain the iterator; only the accumulated sleeps matter.
        list(sharder.roundrobin_datadirs(None))
    # 100 db at 1/s should take ~100s
    # NOTE(review): the accepted window is slightly under 100s; presumably
    # the rate limiter lets the first few items through without sleeping --
    # confirm against RateLimitedIterator.
    run_time = sum(clock['sleeps'])
    self.assertTrue(97 <= run_time < 100, 'took %s' % run_time)

    # Second scenario: more databases at a higher rate, with a fresh clock.
    n_databases = 1000
    now = time.time()
    clock = {
        'sleeps': [],
        'now': now,
    }

    with self._mock_sharder({'databases_per_second': 50}) as sharder, \
            mock.patch('swift.common.db_replicator.roundrobin_datadirs',
                       stub_iter), \
            mock.patch('time.time', fake_time), \
            mock.patch('eventlet.sleep', fake_sleep):
        list(sharder.roundrobin_datadirs(None))
    # 1000 db at 50/s should take ~20s
    run_time = sum(clock['sleeps'])
    self.assertTrue(18 <= run_time < 20, 'took %s' % run_time)
@contextmanager @contextmanager
def _mock_sharder(self, conf=None, replicas=3): def _mock_sharder(self, conf=None, replicas=3):
conf = conf or {} conf = conf or {}