2a593174a5
A container is typically sharded when it has grown to have an object count of
shard_container_threshold + N, where N << shard_container_threshold. If
sharded using the default rows_per_shard of shard_container_threshold / 2 then
this would previously result in 3 shards: the tail shard would typically be
small, having only N rows. This behaviour caused more shards to be generated
than desirable.

This patch adds a minimum-shard-size option to swift-manage-shard-ranges, and
a corresponding option in the sharder config, which can be used to avoid small
tail shards. If set to greater than one then the final shard range may be
extended to more than rows_per_shard in order to avoid a further shard range
with less than minimum-shard-size rows.

In the example given, if minimum-shard-size is set to M > N then the container
would shard into two shards having rows_per_shard rows and rows_per_shard + N
rows respectively.

The default value for minimum-shard-size is rows_per_shard // 5. If all
options have their default values this results in minimum-shard-size being
100000.

Closes-Bug: #1928370
Co-Authored-By: Matthew Oliver <matt@oliver.net.au>
Change-Id: I3baa278c6eaf488e3f390a936eebbec13f2c3e55
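As a rough illustration using the default values from the [container-sharder]
section below (shard_container_threshold = 1000000, rows_per_shard = 500000,
minimum_shard_size = 100000): a container that reaches 1,000,050 objects
(N = 50) would previously have been split into three shards of 500,000,
500,000 and 50 rows; with a minimum shard size in effect it is split into two
shards of 500,000 and 500,050 rows. A manual invocation might look like the
following sketch; the exact flag spelling is an assumption based on the option
name described above, not something taken from this file:

    swift-manage-shard-ranges <container-db> find_and_replace 500000 --minimum-shard-size 100000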
[DEFAULT]
# bind_ip = 0.0.0.0
bind_port = 6201
# keep_idle = 600
# bind_timeout = 30
# backlog = 4096
# user = swift
# swift_dir = /etc/swift
# devices = /srv/node
# mount_check = true
# disable_fallocate = false
#
# Use an integer to override the number of pre-forked processes that will
# accept connections.
# workers = auto
#
# Maximum concurrent requests per worker
# max_clients = 1024
#
# This is a comma separated list of hosts allowed in the X-Container-Sync-To
# field for containers. This is the old-style of using container sync. It is
# strongly recommended to use the new style of a separate
# container-sync-realms.conf -- see container-sync-realms.conf-sample
# allowed_sync_hosts = 127.0.0.1
#
# You can specify default log routing here if you want:
# log_name = swift
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
# The following caps the length of log lines to the value given; no limit if
# set to 0, the default.
# log_max_line_length = 0
#
# Hashing algorithm for log anonymization. Must be one of the algorithms
# supported by Python's hashlib.
# log_anonymization_method = MD5
#
# Salt added during log anonymization
# log_anonymization_salt =
#
# Template used to format logs. All words surrounded by curly brackets
# will be substituted with the appropriate values
# log_format = {remote_addr} - - [{time.d}/{time.b}/{time.Y}:{time.H}:{time.M}:{time.S} +0000] "{method} {path}" {status} {content_length} "{referer}" "{txn_id}" "{user_agent}" {trans_time:.4f} "{additional_info}" {pid} {policy_index}
#
# comma separated list of functions to call to set up custom log handlers.
# functions get passed: conf, name, log_to_console, log_route, fmt, logger,
# adapted_logger
# log_custom_handlers =
#
# If set, log_udp_host will override log_address
# log_udp_host =
# log_udp_port = 514
#
# You can enable StatsD logging here:
# log_statsd_host =
# log_statsd_port = 8125
# log_statsd_default_sample_rate = 1.0
# log_statsd_sample_rate_factor = 1.0
# log_statsd_metric_prefix =
#
# If you don't mind the extra disk space usage in overhead, you can turn this
# on to preallocate disk space with SQLite databases to decrease fragmentation.
# db_preallocation = off
#
# Enable this option to log all sqlite3 queries (requires python >=3.3)
# db_query_logging = off
#
# eventlet_debug = false
#
# You can set fallocate_reserve to the number of bytes or percentage of disk
# space you'd like fallocate to reserve, whether there is space for the given
# file size or not. Percentage will be used if the value ends with a '%'.
# fallocate_reserve = 1%
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =

[pipeline:main]
pipeline = healthcheck recon container-server

[app:container-server]
use = egg:swift#container
# You can override the default log routing for this app here:
# set log_name = container-server
# set log_facility = LOG_LOCAL0
# set log_level = INFO
# set log_requests = true
# set log_address = /dev/log
#
# node_timeout = 3
# conn_timeout = 0.5
# allow_versions = false
#
# You can disable REPLICATE handling (default is to allow it). When deploying
# a cluster with a separate replication network, you'll want multiple
# container-server processes running: one for client-driven traffic and another
# for replication traffic. The server handling client-driven traffic may set
# this to false. If there is only one container-server process, leave this as
# true.
# replication_server = true
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =
#
# You can set fallocate_reserve to the number of bytes or percentage
# of disk space you'd like kept free at all times. If the disk's free
# space falls below this value, then PUT, POST, and REPLICATE requests
# will be denied until the disk has more space available. Percentage
# will be used if the value ends with a '%'.
# fallocate_reserve = 1%

[filter:healthcheck]
use = egg:swift#healthcheck
# An optional filesystem path, which, if present, will cause the healthcheck
# URL to return "503 Service Unavailable" with a body of "DISABLED BY FILE"
# disable_path =

[filter:recon]
use = egg:swift#recon
#recon_cache_path = /var/cache/swift

[container-replicator]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-replicator
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# Maximum number of database rows that will be sync'd in a single HTTP
# replication request. Databases with less than or equal to this number of
# differing rows will always be sync'd using an HTTP replication request rather
# than using rsync.
# per_diff = 1000
#
# Maximum number of HTTP replication requests attempted on each replication
# pass for any one container. This caps how long the replicator will spend
# trying to sync a given database per pass so the other databases don't get
# starved.
# max_diffs = 100
#
# Number of replication workers to spawn.
# concurrency = 8
#
# Time in seconds to wait between replication passes
# interval = 30.0
# run_pause is deprecated, use interval instead
# run_pause = 30.0
#
# Process at most this many databases per second
# databases_per_second = 50
#
# node_timeout = 10
# conn_timeout = 0.5
#
# The replicator also performs reclamation
# reclaim_age = 604800
#
# Allow rsync to compress data which is transmitted to the destination node
# during sync. However, this is applicable only when the destination node is
# in a different region than the local one.
# rsync_compress = no
#
# Format of the rsync module where the replicator will send data. See
# etc/rsyncd.conf-sample for some usage examples.
# rsync_module = {replication_ip}::container
#
# recon_cache_path = /var/cache/swift
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =
#
# The handoffs_only mode option is for special-case emergency
# situations such as full disks in the cluster. This option SHOULD NOT
# BE ENABLED except in emergencies. When handoffs_only mode is enabled
# the replicator will *only* replicate from handoff nodes to primary
# nodes and will not sync primary nodes with other primary nodes.
#
# This has two main effects: first, the replicator becomes much more
# effective at removing misplaced databases, thereby freeing up disk
# space at a much faster pace than normal. Second, the replicator does
# not sync data between primary nodes, so out-of-sync account and
# container listings will not resolve while handoffs_only is enabled.
#
# This mode is intended to allow operators to temporarily sacrifice
# consistency in order to gain faster rebalancing, such as during a
# capacity addition with nearly-full disks. It is not intended for
# long-term use.
#
# handoffs_only = no

[container-updater]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-updater
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# interval = 300.0
# concurrency = 4
# node_timeout = 3
# conn_timeout = 0.5
#
# Send at most this many container updates per second
# containers_per_second = 50
#
# slowdown will sleep that amount between containers. Deprecated; use
# containers_per_second instead.
# slowdown = 0.01
#
# Seconds to suppress updating an account that has generated an error
# account_suppression_time = 60
#
# recon_cache_path = /var/cache/swift
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =

[container-auditor]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-auditor
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# Will audit each container at most once per interval
# interval = 1800.0
#
# containers_per_second = 200
# recon_cache_path = /var/cache/swift
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =

[container-sync]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-sync
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# If you need to use an HTTP Proxy, set it here; defaults to no proxy.
# You can also set this to a comma separated list of HTTP Proxies and they will
# be randomly used (simple load balancing).
# sync_proxy = http://10.1.1.1:8888,http://10.1.1.2:8888
#
# Will sync each container at most once per interval
# interval = 300.0
#
# Maximum amount of time to spend syncing each container per pass
# container_time = 60
#
# Maximum amount of time in seconds for the connection attempt
# conn_timeout = 5
# Server errors from requests will be retried by default
# request_tries = 3
#
# Internal client config file path
# internal_client_conf_path = /etc/swift/internal-client.conf
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
# nice_priority =
#
# You can set I/O scheduling class and priority of processes. I/O niceness
# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
# 0 to 7. The higher the value, the lower the I/O priority of the process.
# Works only with ionice_class.
# ionice_class =
# ionice_priority =

# Note: Put it at the beginning of the pipeline to profile all middleware. But
# it is safer to put this after healthcheck.
[filter:xprofile]
use = egg:swift#xprofile
# This option enables you to switch profilers; the profiler should inherit
# from Python's standard profiler. Currently the supported values can be
# 'cProfile', 'eventlet.green.profile', etc.
# profile_module = eventlet.green.profile
#
# This prefix will be used to combine process ID and timestamp to name the
# profile data file. Make sure the executing user has permission to write
# into this path (missing path segments will be created, if necessary).
# If you enable profiling in more than one type of daemon, you must override
# it with a unique value like: /var/log/swift/profile/container.profile
# log_filename_prefix = /tmp/log/swift/profile/default.profile
#
# The profile data will be dumped to local disk based on the above naming rule
# at this interval.
# dump_interval = 5.0
#
# Be careful: this option will enable the profiler to dump data into
# timestamped files, which means lots of files will pile up in the directory.
# dump_timestamp = false
#
# This is the path of the URL to access the mini web UI.
# path = /__profile__
#
# Clear the data when the wsgi server shuts down.
# flush_at_shutdown = false
#
# unwind the iterator of applications
# unwind = false

[container-sharder]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-sharder
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# Container sharder specific settings
#
# If the auto_shard option is true then the sharder will automatically select
# containers to shard, scan for shard ranges, and select shards to shrink.
# The default is false.
# Warning: auto-sharding is still under development and should not be used in
# production; do not set this option to true in a production cluster.
# auto_shard = false
#
# When auto-sharding is enabled shard_container_threshold defines the object
# count at which a container with container-sharding enabled will start to
# shard. shard_container_threshold also indirectly determines the defaults for
# rows_per_shard, shrink_threshold and expansion_limit.
# shard_container_threshold = 1000000
#
# rows_per_shard determines the initial nominal size of shard containers. The
# default is shard_container_threshold // 2
# rows_per_shard = 500000
#
# Minimum size of the final shard range. If this is greater than one then the
# final shard range may be extended to more than rows_per_shard in order to
# avoid a further shard range with less than minimum_shard_size rows. The
# default value is rows_per_shard // 5.
# minimum_shard_size = 100000
#
# When auto-sharding is enabled shrink_threshold defines the object count
# below which a 'donor' shard container will be considered for shrinking into
# another 'acceptor' shard container. The default is determined by
# shard_shrink_point. If set, shrink_threshold will take precedence over
# shard_shrink_point.
# shrink_threshold =
#
# When auto-sharding is enabled shard_shrink_point defines the object count
# below which a 'donor' shard container will be considered for shrinking into
# another 'acceptor' shard container. shard_shrink_point is a percentage of
# shard_container_threshold, e.g. the default value of 10 means 10% of the
# shard_container_threshold.
# Deprecated: shrink_threshold is recommended and if set will take precedence
# over shard_shrink_point.
# shard_shrink_point = 10
#
# When auto-sharding is enabled expansion_limit defines the maximum
# allowed size of an acceptor shard container after having a donor merged into
# it. The default is determined by shard_shrink_merge_point.
# If set, expansion_limit will take precedence over shard_shrink_merge_point.
# expansion_limit =
#
# When auto-sharding is enabled shard_shrink_merge_point defines the maximum
# allowed size of an acceptor shard container after having a donor merged into
# it. shard_shrink_merge_point is a percentage of shard_container_threshold,
# e.g. the default value of 75 means that the projected sum of a donor object
# count and acceptor count must be less than 75% of shard_container_threshold
# for the donor to be allowed to merge into the acceptor.
#
# For example, if the shard_container_threshold is 1 million,
# shard_shrink_point is 10, and shard_shrink_merge_point is 75 then a shard will
# be considered for shrinking if it has less than or equal to 100 thousand
# objects but will only merge into an acceptor if the combined object count
# would be less than or equal to 750 thousand objects.
# Deprecated: expansion_limit is recommended and if set will take precedence
# over shard_shrink_merge_point.
# shard_shrink_merge_point = 75
#
# When auto-sharding is enabled shard_scanner_batch_size defines the maximum
# number of shard ranges that will be found each time the sharder daemon visits
# a sharding container. If necessary the sharder daemon will continue to search
# for more shard ranges each time it visits the container.
# shard_scanner_batch_size = 10
#
# cleave_batch_size defines the number of shard ranges that will be cleaved
# each time the sharder daemon visits a sharding container.
# cleave_batch_size = 2
#
# cleave_row_batch_size defines the size of batches of object rows read from a
# sharding container and merged to a shard container during cleaving.
# cleave_row_batch_size = 10000
#
# max_expanding defines the maximum number of shards that could be expanded in a
# single cycle of the sharder. Defaults to unlimited (-1).
# max_expanding = -1
#
# max_shrinking defines the maximum number of shards that should be shrunk into
# each expanding shard. Defaults to 1.
# NOTE: Using values greater than 1 may result in temporary gaps in object listings
# until all selected shards have shrunk.
# max_shrinking = 1
#
# Defines the number of successfully replicated shard dbs required when
# cleaving a previously uncleaved shard range before the sharder will progress
# to the next shard range. The value should be less than or equal to the
# container ring replica count. The default of 'auto' causes the container ring
# quorum value to be used. This option only applies to the container-sharder
# replication and does not affect the number of shard container replicas that
# will eventually be replicated by the container-replicator.
# shard_replication_quorum = auto
#
# Defines the number of successfully replicated shard dbs required when
# cleaving a shard range that has been previously cleaved on another node
# before the sharder will progress to the next shard range. The value should be
# less than or equal to the container ring replica count. The default of 'auto'
# causes the shard_replication_quorum value to be used. This option only
# applies to the container-sharder replication and does not affect the number
# of shard container replicas that will eventually be replicated by the
# container-replicator.
# existing_shard_replication_quorum = auto
#
# The sharder uses an internal client to create and make requests to
# containers. The absolute path to the client config file can be configured.
# internal_client_conf_path = /etc/swift/internal-client.conf
#
# The number of times the internal client will retry requests.
# request_tries = 3
#
# Each time the sharder dumps stats to the recon cache file it includes a list
# of containers that appear to need sharding but are not yet sharding. By
# default this list is limited to the top 5 containers, ordered by object
# count. The limit may be changed by setting recon_candidates_limit to an
# integer value. A negative value implies no limit.
# recon_candidates_limit = 5
#
# As the sharder visits each container that's currently sharding it dumps its
# current progress to recon. To be able to mark that progress as completed,
# this in-progress check also needs to monitor containers that have just
# completed sharding. The recon_sharded_timeout parameter sets how long a
# container that has just finished sharding should continue to be checked by
# the in-progress check. This allows anything monitoring the sharding recon
# dump enough time to collate and see things complete. The time is capped at
# reclaim_age, so this parameter should be less than or equal to reclaim_age.
# The default is 12 hours (12 x 60 x 60).
# recon_sharded_timeout = 43200
#
# Large databases tend to take a while to work with, but we want to make sure
# we write down our progress. Use a larger-than-normal broker timeout to make
# us less likely to bomb out on a LockTimeout.
# broker_timeout = 60
#
# Time in seconds to wait between sharder cycles
# interval = 30.0
#
# Process at most this many databases per second
# databases_per_second = 50
#
# The container-sharder accepts the following configuration options as defined
# in the container-replicator section:
#
# per_diff = 1000
# max_diffs = 100
# concurrency = 8
# node_timeout = 10
# conn_timeout = 0.5
# reclaim_age = 604800
# rsync_compress = no
# rsync_module = {replication_ip}::container
# recon_cache_path = /var/cache/swift
#