Erasure Code Documentation
This patch adds all the relevant EC documentation to the source tree. Notable additions are: - Updated SAIO documentation - Updates to existing swift documentation; and - Erasure Coding overview Co-Authored-By: Alistair Coles <alistair.coles@hp.com> Co-Authored-By: Thiago da Silva <thiago@redhat.com> Co-Authored-By: John Dickinson <me@not.mn> Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com> Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com> Co-Authored-By: Samuel Merritt <sam@swiftstack.com> Co-Authored-By: Christian Schwede <christian.schwede@enovance.com> Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com> Change-Id: I0403016a4bb7dad9535891632753b0e5e9d402eb Implements: blueprint swift-ec Signed-off-by: Thiago da Silva <thiago@redhat.com>
This commit is contained in:
parent
647b66a2ce
commit
8f5d4d2455
@ -16,6 +16,16 @@ swift-ring-builder object-1.builder add r1z2-127.0.0.1:6020/sdb2 1
|
|||||||
swift-ring-builder object-1.builder add r1z3-127.0.0.1:6030/sdb3 1
|
swift-ring-builder object-1.builder add r1z3-127.0.0.1:6030/sdb3 1
|
||||||
swift-ring-builder object-1.builder add r1z4-127.0.0.1:6040/sdb4 1
|
swift-ring-builder object-1.builder add r1z4-127.0.0.1:6040/sdb4 1
|
||||||
swift-ring-builder object-1.builder rebalance
|
swift-ring-builder object-1.builder rebalance
|
||||||
|
swift-ring-builder object-2.builder create 10 6 1
|
||||||
|
swift-ring-builder object-2.builder add r1z1-127.0.0.1:6010/sdb1 1
|
||||||
|
swift-ring-builder object-2.builder add r1z1-127.0.0.1:6010/sdb5 1
|
||||||
|
swift-ring-builder object-2.builder add r1z2-127.0.0.1:6020/sdb2 1
|
||||||
|
swift-ring-builder object-2.builder add r1z2-127.0.0.1:6020/sdb6 1
|
||||||
|
swift-ring-builder object-2.builder add r1z3-127.0.0.1:6030/sdb3 1
|
||||||
|
swift-ring-builder object-2.builder add r1z3-127.0.0.1:6030/sdb7 1
|
||||||
|
swift-ring-builder object-2.builder add r1z4-127.0.0.1:6040/sdb4 1
|
||||||
|
swift-ring-builder object-2.builder add r1z4-127.0.0.1:6040/sdb8 1
|
||||||
|
swift-ring-builder object-2.builder rebalance
|
||||||
swift-ring-builder container.builder create 10 3 1
|
swift-ring-builder container.builder create 10 3 1
|
||||||
swift-ring-builder container.builder add r1z1-127.0.0.1:6011/sdb1 1
|
swift-ring-builder container.builder add r1z1-127.0.0.1:6011/sdb1 1
|
||||||
swift-ring-builder container.builder add r1z2-127.0.0.1:6021/sdb2 1
|
swift-ring-builder container.builder add r1z2-127.0.0.1:6021/sdb2 1
|
||||||
|
@ -9,7 +9,10 @@ sudo mkfs.xfs -f ${SAIO_BLOCK_DEVICE:-/dev/sdb1}
|
|||||||
sudo mount /mnt/sdb1
|
sudo mount /mnt/sdb1
|
||||||
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
|
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
|
||||||
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
||||||
mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4
|
mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
|
||||||
|
/srv/2/node/sdb2 /srv/2/node/sdb6 \
|
||||||
|
/srv/3/node/sdb3 /srv/3/node/sdb7 \
|
||||||
|
/srv/4/node/sdb4 /srv/4/node/sdb8
|
||||||
sudo rm -f /var/log/debug /var/log/messages /var/log/rsyncd.log /var/log/syslog
|
sudo rm -f /var/log/debug /var/log/messages /var/log/rsyncd.log /var/log/syslog
|
||||||
find /var/cache/swift* -type f -name *.recon -exec rm -f {} \;
|
find /var/cache/swift* -type f -name *.recon -exec rm -f {} \;
|
||||||
# On Fedora use "systemctl restart <service>"
|
# On Fedora use "systemctl restart <service>"
|
||||||
|
@ -22,6 +22,8 @@ use = egg:swift#recon
|
|||||||
[object-replicator]
|
[object-replicator]
|
||||||
vm_test_mode = yes
|
vm_test_mode = yes
|
||||||
|
|
||||||
|
[object-reconstructor]
|
||||||
|
|
||||||
[object-updater]
|
[object-updater]
|
||||||
|
|
||||||
[object-auditor]
|
[object-auditor]
|
||||||
|
@ -22,6 +22,8 @@ use = egg:swift#recon
|
|||||||
[object-replicator]
|
[object-replicator]
|
||||||
vm_test_mode = yes
|
vm_test_mode = yes
|
||||||
|
|
||||||
|
[object-reconstructor]
|
||||||
|
|
||||||
[object-updater]
|
[object-updater]
|
||||||
|
|
||||||
[object-auditor]
|
[object-auditor]
|
||||||
|
@ -22,6 +22,8 @@ use = egg:swift#recon
|
|||||||
[object-replicator]
|
[object-replicator]
|
||||||
vm_test_mode = yes
|
vm_test_mode = yes
|
||||||
|
|
||||||
|
[object-reconstructor]
|
||||||
|
|
||||||
[object-updater]
|
[object-updater]
|
||||||
|
|
||||||
[object-auditor]
|
[object-auditor]
|
||||||
|
@ -22,6 +22,8 @@ use = egg:swift#recon
|
|||||||
[object-replicator]
|
[object-replicator]
|
||||||
vm_test_mode = yes
|
vm_test_mode = yes
|
||||||
|
|
||||||
|
[object-reconstructor]
|
||||||
|
|
||||||
[object-updater]
|
[object-updater]
|
||||||
|
|
||||||
[object-auditor]
|
[object-auditor]
|
||||||
|
@ -5,7 +5,16 @@ swift_hash_path_suffix = changeme
|
|||||||
|
|
||||||
[storage-policy:0]
|
[storage-policy:0]
|
||||||
name = gold
|
name = gold
|
||||||
|
policy_type = replication
|
||||||
default = yes
|
default = yes
|
||||||
|
|
||||||
[storage-policy:1]
|
[storage-policy:1]
|
||||||
name = silver
|
name = silver
|
||||||
|
policy_type = replication
|
||||||
|
|
||||||
|
[storage-policy:2]
|
||||||
|
name = ec42
|
||||||
|
policy_type = erasure_coding
|
||||||
|
ec_type = jerasure_rs_vand
|
||||||
|
ec_num_data_fragments = 4
|
||||||
|
ec_num_parity_fragments = 2
|
||||||
|
@ -104,5 +104,7 @@ Other
|
|||||||
* `Swiftsync <https://github.com/stackforge/swiftsync>`_ - A massive syncer between two swift clusters.
|
* `Swiftsync <https://github.com/stackforge/swiftsync>`_ - A massive syncer between two swift clusters.
|
||||||
* `Django Swiftbrowser <https://github.com/cschwede/django-swiftbrowser>`_ - Simple Django web app to access Openstack Swift.
|
* `Django Swiftbrowser <https://github.com/cschwede/django-swiftbrowser>`_ - Simple Django web app to access Openstack Swift.
|
||||||
* `Swift-account-stats <https://github.com/enovance/swift-account-stats>`_ - Swift-account-stats is a tool to report statistics on Swift usage at tenant and global levels.
|
* `Swift-account-stats <https://github.com/enovance/swift-account-stats>`_ - Swift-account-stats is a tool to report statistics on Swift usage at tenant and global levels.
|
||||||
|
* `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_ - High Level Erasure Code library used by Swift
|
||||||
|
* `liberasurecode <http://www.bytebucket.org/tsg-/liberasurecode>`_ - Low Level Erasure Code library used by PyECLib
|
||||||
* `Swift Browser <https://github.com/zerovm/swift-browser>`_ - JavaScript interface for Swift
|
* `Swift Browser <https://github.com/zerovm/swift-browser>`_ - JavaScript interface for Swift
|
||||||
* `swift-ui <https://github.com/fanatic/swift-ui>`_ - OpenStack Swift web browser
|
* `swift-ui <https://github.com/fanatic/swift-ui>`_ - OpenStack Swift web browser
|
||||||
|
@ -87,8 +87,11 @@ another device when creating the VM, and follow these instructions:
|
|||||||
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
||||||
sudo mkdir /srv
|
sudo mkdir /srv
|
||||||
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
|
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
|
||||||
sudo mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 \
|
sudo mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
|
||||||
/srv/4/node/sdb4 /var/run/swift
|
/srv/2/node/sdb2 /srv/2/node/sdb6 \
|
||||||
|
/srv/3/node/sdb3 /srv/3/node/sdb7 \
|
||||||
|
/srv/4/node/sdb4 /srv/4/node/sdb8 \
|
||||||
|
/var/run/swift
|
||||||
sudo chown -R ${USER}:${USER} /var/run/swift
|
sudo chown -R ${USER}:${USER} /var/run/swift
|
||||||
# **Make sure to include the trailing slash after /srv/$x/**
|
# **Make sure to include the trailing slash after /srv/$x/**
|
||||||
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done
|
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done
|
||||||
@ -124,7 +127,11 @@ these instructions:
|
|||||||
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
|
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
|
||||||
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
sudo chown ${USER}:${USER} /mnt/sdb1/*
|
||||||
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
|
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
|
||||||
sudo mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4 /var/run/swift
|
sudo mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
|
||||||
|
/srv/2/node/sdb2 /srv/2/node/sdb6 \
|
||||||
|
/srv/3/node/sdb3 /srv/3/node/sdb7 \
|
||||||
|
/srv/4/node/sdb4 /srv/4/node/sdb8 \
|
||||||
|
/var/run/swift
|
||||||
sudo chown -R ${USER}:${USER} /var/run/swift
|
sudo chown -R ${USER}:${USER} /var/run/swift
|
||||||
# **Make sure to include the trailing slash after /srv/$x/**
|
# **Make sure to include the trailing slash after /srv/$x/**
|
||||||
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done
|
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done
|
||||||
@ -402,7 +409,7 @@ Setting up scripts for running Swift
|
|||||||
|
|
||||||
#. Copy the SAIO scripts for resetting the environment::
|
#. Copy the SAIO scripts for resetting the environment::
|
||||||
|
|
||||||
cd $HOME/swift/doc; cp -r saio/bin $HOME/bin; cd -
|
cd $HOME/swift/doc; cp saio/bin/* $HOME/bin; cd -
|
||||||
chmod +x $HOME/bin/*
|
chmod +x $HOME/bin/*
|
||||||
|
|
||||||
#. Edit the ``$HOME/bin/resetswift`` script
|
#. Edit the ``$HOME/bin/resetswift`` script
|
||||||
@ -455,30 +462,41 @@ Setting up scripts for running Swift
|
|||||||
|
|
||||||
.. literalinclude:: /../saio/bin/remakerings
|
.. literalinclude:: /../saio/bin/remakerings
|
||||||
|
|
||||||
You can expect the output from this command to produce the following (note
|
You can expect the output from this command to produce the following. Note
|
||||||
that 2 object rings are created in order to test storage policies in the
|
that 3 object rings are created in order to test storage policies and EC in
|
||||||
SAIO environment however they map to the same nodes)::
|
the SAIO environment. The EC ring is the only one with all 8 devices.
|
||||||
|
There are also two replication rings, one for 3x replication and another
|
||||||
|
for 2x replication, but those rings only use 4 devices::
|
||||||
|
|
||||||
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
|
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
|
||||||
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
|
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
|
||||||
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
|
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
|
||||||
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
|
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
|
||||||
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
|
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
|
||||||
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
|
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
|
||||||
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
|
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
|
||||||
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
|
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
|
||||||
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
|
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
|
||||||
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
|
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
|
||||||
|
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
|
||||||
|
Device d1r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb5_"" with 1.0 weight got id 1
|
||||||
|
Device d2r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 2
|
||||||
|
Device d3r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb6_"" with 1.0 weight got id 3
|
||||||
|
Device d4r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 4
|
||||||
|
Device d5r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb7_"" with 1.0 weight got id 5
|
||||||
|
Device d6r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 6
|
||||||
|
Device d7r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb8_"" with 1.0 weight got id 7
|
||||||
|
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
|
||||||
Device d0r1z1-127.0.0.1:6011R127.0.0.1:6011/sdb1_"" with 1.0 weight got id 0
|
Device d0r1z1-127.0.0.1:6011R127.0.0.1:6011/sdb1_"" with 1.0 weight got id 0
|
||||||
Device d1r1z2-127.0.0.1:6021R127.0.0.1:6021/sdb2_"" with 1.0 weight got id 1
|
Device d1r1z2-127.0.0.1:6021R127.0.0.1:6021/sdb2_"" with 1.0 weight got id 1
|
||||||
Device d2r1z3-127.0.0.1:6031R127.0.0.1:6031/sdb3_"" with 1.0 weight got id 2
|
Device d2r1z3-127.0.0.1:6031R127.0.0.1:6031/sdb3_"" with 1.0 weight got id 2
|
||||||
Device d3r1z4-127.0.0.1:6041R127.0.0.1:6041/sdb4_"" with 1.0 weight got id 3
|
Device d3r1z4-127.0.0.1:6041R127.0.0.1:6041/sdb4_"" with 1.0 weight got id 3
|
||||||
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
|
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
|
||||||
Device d0r1z1-127.0.0.1:6012R127.0.0.1:6012/sdb1_"" with 1.0 weight got id 0
|
Device d0r1z1-127.0.0.1:6012R127.0.0.1:6012/sdb1_"" with 1.0 weight got id 0
|
||||||
Device d1r1z2-127.0.0.1:6022R127.0.0.1:6022/sdb2_"" with 1.0 weight got id 1
|
Device d1r1z2-127.0.0.1:6022R127.0.0.1:6022/sdb2_"" with 1.0 weight got id 1
|
||||||
Device d2r1z3-127.0.0.1:6032R127.0.0.1:6032/sdb3_"" with 1.0 weight got id 2
|
Device d2r1z3-127.0.0.1:6032R127.0.0.1:6032/sdb3_"" with 1.0 weight got id 2
|
||||||
Device d3r1z4-127.0.0.1:6042R127.0.0.1:6042/sdb4_"" with 1.0 weight got id 3
|
Device d3r1z4-127.0.0.1:6042R127.0.0.1:6042/sdb4_"" with 1.0 weight got id 3
|
||||||
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
|
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
|
||||||
|
|
||||||
#. Read more about Storage Policies and your SAIO :doc:`policies_saio`
|
#. Read more about Storage Policies and your SAIO :doc:`policies_saio`
|
||||||
|
|
||||||
|
BIN
doc/source/images/ec_overview.png
Executable file
BIN
doc/source/images/ec_overview.png
Executable file
Binary file not shown.
After Width: | Height: | Size: 145 KiB |
@ -56,6 +56,7 @@ Overview and Concepts
|
|||||||
overview_expiring_objects
|
overview_expiring_objects
|
||||||
cors
|
cors
|
||||||
crossdomain
|
crossdomain
|
||||||
|
overview_erasure_code
|
||||||
overview_backing_store
|
overview_backing_store
|
||||||
associated_projects
|
associated_projects
|
||||||
|
|
||||||
|
@ -11,7 +11,10 @@ Proxy Server
|
|||||||
The Proxy Server is responsible for tying together the rest of the Swift
|
The Proxy Server is responsible for tying together the rest of the Swift
|
||||||
architecture. For each request, it will look up the location of the account,
|
architecture. For each request, it will look up the location of the account,
|
||||||
container, or object in the ring (see below) and route the request accordingly.
|
container, or object in the ring (see below) and route the request accordingly.
|
||||||
The public API is also exposed through the Proxy Server.
|
For Erasure Code type policies, the Proxy Server is also responsible for
|
||||||
|
encoding and decoding object data. See :doc:`overview_erasure_code` for
|
||||||
|
complete information on Erasure Code suport. The public API is also exposed
|
||||||
|
through the Proxy Server.
|
||||||
|
|
||||||
A large number of failures are also handled in the Proxy Server. For
|
A large number of failures are also handled in the Proxy Server. For
|
||||||
example, if a server is unavailable for an object PUT, it will ask the
|
example, if a server is unavailable for an object PUT, it will ask the
|
||||||
@ -87,7 +90,8 @@ implementing a particular differentiation.
|
|||||||
For example, one might have the default policy with 3x replication, and create
|
For example, one might have the default policy with 3x replication, and create
|
||||||
a second policy which, when applied to new containers only uses 2x replication.
|
a second policy which, when applied to new containers only uses 2x replication.
|
||||||
Another might add SSDs to a set of storage nodes and create a performance tier
|
Another might add SSDs to a set of storage nodes and create a performance tier
|
||||||
storage policy for certain containers to have their objects stored there.
|
storage policy for certain containers to have their objects stored there. Yet
|
||||||
|
another might be the use of Erasure Coding to define a cold-storage tier.
|
||||||
|
|
||||||
This mapping is then exposed on a per-container basis, where each container
|
This mapping is then exposed on a per-container basis, where each container
|
||||||
can be assigned a specific storage policy when it is created, which remains in
|
can be assigned a specific storage policy when it is created, which remains in
|
||||||
@ -156,6 +160,15 @@ item (object, container, or account) is deleted, a tombstone is set as the
|
|||||||
latest version of the item. The replicator will see the tombstone and ensure
|
latest version of the item. The replicator will see the tombstone and ensure
|
||||||
that the item is removed from the entire system.
|
that the item is removed from the entire system.
|
||||||
|
|
||||||
|
--------------
|
||||||
|
Reconstruction
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The reconstructor is used by Erasure Code policies and is analogous to the
|
||||||
|
replicator for Replication type policies. See :doc:`overview_erasure_code`
|
||||||
|
for complete information on both Erasure Code support as well as the
|
||||||
|
reconstructor.
|
||||||
|
|
||||||
--------
|
--------
|
||||||
Updaters
|
Updaters
|
||||||
--------
|
--------
|
||||||
|
672
doc/source/overview_erasure_code.rst
Executable file
672
doc/source/overview_erasure_code.rst
Executable file
@ -0,0 +1,672 @@
|
|||||||
|
====================
|
||||||
|
Erasure Code Support
|
||||||
|
====================
|
||||||
|
|
||||||
|
|
||||||
|
--------------------------
|
||||||
|
Beta: Not production ready
|
||||||
|
--------------------------
|
||||||
|
The erasure code support in Swift is considered "beta" at this point.
|
||||||
|
Most major functionality is included, but it has not been tested or validated
|
||||||
|
at large scale. This feature relies on ssync for durability. Deployers are
|
||||||
|
urged to do extensive testing and not deploy production data using an
|
||||||
|
erasure code storage policy.
|
||||||
|
|
||||||
|
If any bugs are found during testing, please report them to
|
||||||
|
https://bugs.launchpad.net/swift
|
||||||
|
|
||||||
|
|
||||||
|
-------------------------------
|
||||||
|
History and Theory of Operation
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
There's a lot of good material out there on Erasure Code (EC) theory, this short
|
||||||
|
introduction is just meant to provide some basic context to help the reader
|
||||||
|
better understand the implementation in Swift.
|
||||||
|
|
||||||
|
Erasure Coding for storage applications grew out of Coding Theory as far back as
|
||||||
|
the 1960s with the Reed-Solomon codes. These codes have been used for years in
|
||||||
|
applications ranging from CDs to DVDs to general communications and, yes, even
|
||||||
|
in the space program starting with Voyager! The basic idea is that some amount
|
||||||
|
of data is broken up into smaller pieces called fragments and coded in such a
|
||||||
|
way that it can be transmitted with the ability to tolerate the loss of some
|
||||||
|
number of the coded fragments. That's where the word "erasure" comes in, if you
|
||||||
|
transmit 14 fragments and only 13 are received then one of them is said to be
|
||||||
|
"erased". The word "erasure" provides an important distinction with EC; it
|
||||||
|
isn't about detecting errors, it's about dealing with failures. Another
|
||||||
|
important element of EC is that the number of erasures that can be tolerated can
|
||||||
|
be adjusted to meet the needs of the application.
|
||||||
|
|
||||||
|
At a high level EC works by using a specific scheme to break up a single data
|
||||||
|
buffer into several smaller data buffers then, depending on the scheme,
|
||||||
|
performing some encoding operation on that data in order to generate additional
|
||||||
|
information. So you end up with more data than you started with and that extra
|
||||||
|
data is often called "parity". Note that there are many, many different
|
||||||
|
encoding techniques that vary both in how they organize and manipulate the data
|
||||||
|
as well by what means they use to calculate parity. For example, one scheme
|
||||||
|
might rely on `Galois Field Arithmetic <http://www.ssrc.ucsc.edu/Papers/plank-
|
||||||
|
fast13.pdf>`_ while others may work with only XOR. The number of variations and
|
||||||
|
details about their differences are well beyond the scope of this introduction,
|
||||||
|
but we will talk more about a few of them when we get into the implementation of
|
||||||
|
EC in Swift.
|
||||||
|
|
||||||
|
--------------------------------
|
||||||
|
Overview of EC Support in Swift
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
First and foremost, from an application perspective EC support is totally
|
||||||
|
transparent. There are no EC related external API; a container is simply created
|
||||||
|
using a Storage Policy defined to use EC and then interaction with the cluster
|
||||||
|
is the same as any other durability policy.
|
||||||
|
|
||||||
|
EC is implemented in Swift as a Storage Policy, see :doc:`overview_policies` for
|
||||||
|
complete details on Storage Policies. Because support is implemented as a
|
||||||
|
Storage Policy, all of the storage devices associated with your cluster's EC
|
||||||
|
capability can be isolated. It is entirely possible to share devices between
|
||||||
|
storage policies, but for EC it may make more sense to not only use separate
|
||||||
|
devices but possibly even entire nodes dedicated for EC.
|
||||||
|
|
||||||
|
Which direction one chooses depends on why the EC policy is being deployed. If,
|
||||||
|
for example, there is a production replication policy in place already and the
|
||||||
|
goal is to add a cold storage tier such that the existing nodes performing
|
||||||
|
replication are impacted as little as possible, adding a new set of nodes
|
||||||
|
dedicated to EC might make the most sense but also incurs the most cost. On the
|
||||||
|
other hand, if EC is being added as a capability to provide additional
|
||||||
|
durability for a specific set of applications and the existing infrastructure is
|
||||||
|
well suited for EC (sufficient number of nodes, zones for the EC scheme that is
|
||||||
|
chosen) then leveraging the existing infrastructure such that the EC ring shares
|
||||||
|
nodes with the replication ring makes the most sense. These are some of the
|
||||||
|
main considerations:
|
||||||
|
|
||||||
|
* Layout of existing infrastructure.
|
||||||
|
* Cost of adding dedicated EC nodes (or just dedicated EC devices).
|
||||||
|
* Intended usage model(s).
|
||||||
|
|
||||||
|
The Swift code base does not include any of the algorithms necessary to perform
|
||||||
|
the actual encoding and decoding of data; that is left to external libraries.
|
||||||
|
The Storage Policies architecture is leveraged to enable EC on a per container
|
||||||
|
basis -- the object rings are still used to determine the placement of EC data
|
||||||
|
fragments. Although there are several code paths that are unique to an operation
|
||||||
|
associated with an EC policy, an external dependency to an Erasure Code library
|
||||||
|
is what Swift counts on to perform the low level EC functions. The use of an
|
||||||
|
external library allows for maximum flexibility as there are a significant
|
||||||
|
number of options out there, each with its owns pros and cons that can vary
|
||||||
|
greatly from one use case to another.
|
||||||
|
|
||||||
|
---------------------------------------
|
||||||
|
PyECLib: External Erasure Code Library
|
||||||
|
---------------------------------------
|
||||||
|
|
||||||
|
PyECLib is a Python Erasure Coding Library originally designed and written as
|
||||||
|
part of the effort to add EC support to the Swift project, however it is an
|
||||||
|
independent project. The library provides a well-defined and simple Python
|
||||||
|
interface and internally implements a plug-in architecture allowing it to take
|
||||||
|
advantage of many well-known C libraries such as:
|
||||||
|
|
||||||
|
* Jerasure and GFComplete at http://jerasure.org.
|
||||||
|
* Intel(R) ISA-L at http://01.org/intel%C2%AE-storage-acceleration-library-open-source-version.
|
||||||
|
* Or write your own!
|
||||||
|
|
||||||
|
PyECLib uses a C based library called liberasurecode to implement the plug in
|
||||||
|
infrastructure; liberasure code is available at:
|
||||||
|
|
||||||
|
* liberasurecode: https://bitbucket.org/tsg-/liberasurecode
|
||||||
|
|
||||||
|
PyECLib itself therefore allows for not only choice but further extensibility as
|
||||||
|
well. PyECLib also comes with a handy utility to help determine the best
|
||||||
|
algorithm to use based on the equipment that will be used (processors and server
|
||||||
|
configurations may vary in performance per algorithm). More on this will be
|
||||||
|
covered in the configuration section. PyECLib is included as a Swift
|
||||||
|
requirement.
|
||||||
|
|
||||||
|
For complete details see `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_
|
||||||
|
|
||||||
|
------------------------------
|
||||||
|
Storing and Retrieving Objects
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
We will discuss the details of how PUT and GET work in the "Under the Hood"
|
||||||
|
section later on. The key point here is that all of the erasure code work goes
|
||||||
|
on behind the scenes; this summary is a high level information overview only.
|
||||||
|
|
||||||
|
The PUT flow looks like this:
|
||||||
|
|
||||||
|
#. The proxy server streams in an object and buffers up "a segment" of data
|
||||||
|
(size is configurable).
|
||||||
|
#. The proxy server calls on PyECLib to encode the data into smaller fragments.
|
||||||
|
#. The proxy streams the encoded fragments out to the storage nodes based on
|
||||||
|
ring locations.
|
||||||
|
#. Repeat until the client is done sending data.
|
||||||
|
#. The client is notified of completion when a quorum is met.
|
||||||
|
|
||||||
|
The GET flow looks like this:
|
||||||
|
|
||||||
|
#. The proxy server makes simultaneous requests to participating nodes.
|
||||||
|
#. As soon as the proxy has the fragments it needs, it calls on PyECLib to
|
||||||
|
decode the data.
|
||||||
|
#. The proxy streams the decoded data it has back to the client.
|
||||||
|
#. Repeat until the proxy is done sending data back to the client.
|
||||||
|
|
||||||
|
It may sound like, from this high level overview, that using EC is going to
|
||||||
|
cause an explosion in the number of actual files stored in each node's local
|
||||||
|
file system. Although it is true that more files will be stored (because an
|
||||||
|
object is broken into pieces), the implementation works to minimize this where
|
||||||
|
possible, more details are available in the Under the Hood section.
|
||||||
|
|
||||||
|
-------------
|
||||||
|
Handoff Nodes
|
||||||
|
-------------
|
||||||
|
|
||||||
|
In EC policies, similarly to replication, handoff nodes are a set of storage
|
||||||
|
nodes used to augment the list of primary nodes responsible for storing an
|
||||||
|
erasure coded object. These handoff nodes are used in the event that one or more
|
||||||
|
of the primaries are unavailable. Handoff nodes are still selected with an
|
||||||
|
attempt to achieve maximum separation of the data being placed.
|
||||||
|
|
||||||
|
--------------
|
||||||
|
Reconstruction
|
||||||
|
--------------
|
||||||
|
|
||||||
|
For an EC policy, reconstruction is analogous to the process of replication for
|
||||||
|
a replication type policy -- essentially "the reconstructor" replaces "the
|
||||||
|
replicator" for EC policy types. The basic framework of reconstruction is very
|
||||||
|
similar to that of replication with a few notable exceptions:
|
||||||
|
|
||||||
|
* Because EC does not actually replicate partitions, it needs to operate at a
|
||||||
|
finer granularity than what is provided with rsync, therefore EC leverages
|
||||||
|
much of ssync behind the scenes (you do not need to manually configure ssync).
|
||||||
|
* Once a pair of nodes has determined the need to replace a missing object
|
||||||
|
fragment, instead of pushing over a copy like replication would do, the
|
||||||
|
reconstructor has to read in enough surviving fragments from other nodes and
|
||||||
|
perform a local reconstruction before it has the correct data to push to the
|
||||||
|
other node.
|
||||||
|
* A reconstructor does not talk to all other reconstructors in the set of nodes
|
||||||
|
responsible for an EC partition, this would be far too chatty, instead each
|
||||||
|
reconstructor is responsible for sync'ing with the partition's closest two
|
||||||
|
neighbors (closest meaning left and right on the ring).
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
EC work (encode and decode) takes place both on the proxy nodes, for PUT/GET
|
||||||
|
operations, as well as on the storage nodes for reconstruction. As with
|
||||||
|
replication, reconstruction can be the result of rebalancing, bit-rot, drive
|
||||||
|
failure or reverting data from a hand-off node back to its primary.
|
||||||
|
|
||||||
|
--------------------------
|
||||||
|
Performance Considerations
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
Efforts are underway to characterize performance of various Erasure Code
|
||||||
|
schemes. One of the main goals of the beta release is to perform this
|
||||||
|
characterization and encourage others to do so and provide meaningful feedback
|
||||||
|
to the development community. There are many factors that will affect
|
||||||
|
performance of EC so it is vital that we have multiple characterization
|
||||||
|
activities happening.
|
||||||
|
|
||||||
|
In general, EC has different performance characteristics than replicated data.
|
||||||
|
EC requires substantially more CPU to read and write data, and is more suited
|
||||||
|
for larger objects that are not frequently accessed (eg backups).
|
||||||
|
|
||||||
|
----------------------------
|
||||||
|
Using an Erasure Code Policy
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To use an EC policy, the administrator simply needs to define an EC policy in
|
||||||
|
`swift.conf` and create/configure the associated object ring. An example of how
|
||||||
|
an EC policy can be setup is shown below::
|
||||||
|
|
||||||
|
[storage-policy:2]
|
||||||
|
name = ec104
|
||||||
|
policy_type = erasure_coding
|
||||||
|
ec_type = jerasure_rs_vand
|
||||||
|
ec_num_data_fragments = 10
|
||||||
|
ec_num_parity_fragments = 4
|
||||||
|
ec_object_segment_size = 1048576
|
||||||
|
|
||||||
|
Let's take a closer look at each configuration parameter:
|
||||||
|
|
||||||
|
* ``name``: This is a standard storage policy parameter.
|
||||||
|
See :doc:`overview_policies` for details.
|
||||||
|
* ``policy_type``: Set this to ``erasure_coding`` to indicate that this is an EC
|
||||||
|
policy.
|
||||||
|
* ``ec_type``: Set this value according to the available options in the selected
|
||||||
|
PyECLib back-end. This specifies the EC scheme that is to be used. For
|
||||||
|
example the option shown here selects Vandermonde Reed-Solomon encoding while
|
||||||
|
an option of ``flat_xor_hd_3`` would select Flat-XOR based HD combination
|
||||||
|
codes. See the `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_ page for
|
||||||
|
full details.
|
||||||
|
* ``ec_num_data_fragments``: The total number of fragments that will be
|
||||||
|
comprised of data.
|
||||||
|
* ``ec_num_parity_fragments``: The total number of fragments that will be
|
||||||
|
comprised of parity.
|
||||||
|
* ``ec_object_segment_size``: The amount of data that will be buffered up before
|
||||||
|
feeding a segment into the encoder/decoder. The default value is 1048576.
|
||||||
|
|
||||||
|
When PyECLib encodes an object, it will break it into N fragments. However, what
|
||||||
|
is important during configuration, is how many of those are data and how many
|
||||||
|
are parity. So in the example above, PyECLib will actually break an object in
|
||||||
|
14 different fragments, 10 of them will be made up of actual object data and 4
|
||||||
|
of them will be made of parity data (calculations depending on ec_type).
|
||||||
|
|
||||||
|
When deciding which devices to use in the EC policy's object ring, be sure to
|
||||||
|
carefully consider the performance impacts. Running some performance
|
||||||
|
benchmarking in a test environment for your configuration is highly recommended
|
||||||
|
before deployment. Once you have configured your EC policy in `swift.conf` and
|
||||||
|
created your object ring, your application is ready to start using EC simply by
|
||||||
|
creating a container with the specified policy name and interacting as usual.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It's important to note that once you have deployed a policy and have created
|
||||||
|
objects with that policy, these configurations options cannot be changed. In
|
||||||
|
case a change in the configuration is desired, you must create a new policy
|
||||||
|
and migrate the data to a new container.
|
||||||
|
|
||||||
|
Migrating Between Policies
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
A common usage of EC is to migrate less commonly accessed data from a more
|
||||||
|
expensive but lower latency policy such as replication. When an application
|
||||||
|
determines that it wants to move data from a replication policy to an EC policy,
|
||||||
|
it simply needs to move the data from the replicated container to an EC
|
||||||
|
container that was created with the target durability policy.
|
||||||
|
|
||||||
|
Region Support
|
||||||
|
--------------
|
||||||
|
|
||||||
|
For at least the initial version of EC, it is not recommended that an EC scheme
|
||||||
|
span beyond a single region, neither performance nor functional validation has
|
||||||
|
be been done in such a configuration.
|
||||||
|
|
||||||
|
--------------
|
||||||
|
Under the Hood
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Now that we've explained a little about EC support in Swift and how to
|
||||||
|
configure/use it, let's explore how EC fits in at the nuts-n-bolts level.
|
||||||
|
|
||||||
|
Terminology
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The term 'fragment' has been used already to describe the output of the EC
|
||||||
|
process (a series of fragments) however we need to define some other key terms
|
||||||
|
here before going any deeper. Without paying special attention to using the
|
||||||
|
correct terms consistently, it is very easy to get confused in a hurry!
|
||||||
|
|
||||||
|
* **chunk**: HTTP chunks received over wire (term not used to describe any EC
|
||||||
|
specific operation).
|
||||||
|
* **segment**: Not to be confused with SLO/DLO use of the word, in EC we call a
|
||||||
|
segment a series of consecutive HTTP chunks buffered up before performing an
|
||||||
|
EC operation.
|
||||||
|
* **fragment**: Data and parity 'fragments' are generated when erasure coding
|
||||||
|
transformation is applied to a segment.
|
||||||
|
* **EC archive**: A concatenation of EC fragments; to a storage node this looks
|
||||||
|
like an object.
|
||||||
|
* **ec_ndata**: Number of EC data fragments.
|
||||||
|
* **ec_nparity**: Number of EC parity fragments.
|
||||||
|
|
||||||
|
Middleware
|
||||||
|
----------
|
||||||
|
|
||||||
|
Middleware remains unchanged. For most middleware (e.g., SLO/DLO) the fact that
|
||||||
|
the proxy is fragmenting incoming objects is transparent. For list endpoints,
|
||||||
|
however, it is a bit different. A caller of list endpoints will get back the
|
||||||
|
locations of all of the fragments. The caller will be unable to re-assemble the
|
||||||
|
original object with this information, however the node locations may still
|
||||||
|
prove to be useful information for some applications.
|
||||||
|
|
||||||
|
On Disk Storage
|
||||||
|
---------------
|
||||||
|
|
||||||
|
EC archives are stored on disk in their respective objects-N directory based on
|
||||||
|
their policy index. See :doc:`overview_policies` for details on per policy
|
||||||
|
directory information.
|
||||||
|
|
||||||
|
The actual names on disk of EC archives also have one additional piece of data
|
||||||
|
encoded in the filename, the fragment archive index.
|
||||||
|
|
||||||
|
Each storage policy now must include a transformation function that diskfile
|
||||||
|
will use to build the filename to store on disk. The functions are implemented
|
||||||
|
in the diskfile module as policy specific sub classes ``DiskFileManager``.
|
||||||
|
|
||||||
|
This is required for a few reasons. For one, it allows us to store fragment
|
||||||
|
archives of different indexes on the same storage node which is not typical
|
||||||
|
however it is possible in many circumstances. Without unique filenames for the
|
||||||
|
different EC archive files in a set, we would be at risk of overwriting one
|
||||||
|
archive of index n with another of index m in some scenarios.
|
||||||
|
|
||||||
|
The transformation function for the replication policy is simply a NOP. For
|
||||||
|
reconstruction, the index is appended to the filename just before the .data
|
||||||
|
extension. An example filename for a fragment archive storing the 5th fragment
|
||||||
|
would like this this::
|
||||||
|
|
||||||
|
1418673556.92690#5.data
|
||||||
|
|
||||||
|
An additional file is also included for Erasure Code policies called the
|
||||||
|
``.durable`` file. Its meaning will be covered in detail later, however, its on-
|
||||||
|
disk format does not require the name transformation function that was just
|
||||||
|
covered. The .durable for the example above would simply look like this::
|
||||||
|
|
||||||
|
1418673556.92690.durable
|
||||||
|
|
||||||
|
And it would be found alongside every fragment specific .data file following a
|
||||||
|
100% successful PUT operation.
|
||||||
|
|
||||||
|
Proxy Server
|
||||||
|
------------
|
||||||
|
|
||||||
|
High Level
|
||||||
|
==========
|
||||||
|
|
||||||
|
The Proxy Server handles Erasure Coding in a different manner than replication,
|
||||||
|
therefore there are several code paths unique to EC policies either though sub
|
||||||
|
classing or simple conditionals. Taking a closer look at the PUT and the GET
|
||||||
|
paths will help make this clearer. But first, a high level overview of how an
|
||||||
|
object flows through the system:
|
||||||
|
|
||||||
|
.. image:: images/ec_overview.png
|
||||||
|
|
||||||
|
Note how:
|
||||||
|
|
||||||
|
* Incoming objects are buffered into segments at the proxy.
|
||||||
|
* Segments are erasure coded into fragments at the proxy.
|
||||||
|
* The proxy stripes fragments across participating nodes such that the on-disk
|
||||||
|
stored files that we call a fragment archive is appended with each new
|
||||||
|
fragment.
|
||||||
|
|
||||||
|
This scheme makes it possible to minimize the number of on-disk files given our
|
||||||
|
segmenting and fragmenting.
|
||||||
|
|
||||||
|
Multi_Phase Conversation
|
||||||
|
========================
|
||||||
|
|
||||||
|
Multi-part MIME document support is used to allow the proxy to engage in a
|
||||||
|
handshake conversation with the storage node for processing PUT requests. This
|
||||||
|
is required for a few different reasons.
|
||||||
|
|
||||||
|
#. From the perspective of the storage node, a fragment archive is really just
|
||||||
|
another object, we need a mechanism to send down the original object etag
|
||||||
|
after all fragment archives have landed.
|
||||||
|
#. Without introducing strong consistency semantics, the proxy needs a mechanism
|
||||||
|
to know when a quorum of fragment archives have actually made it to disk
|
||||||
|
before it can inform the client of a successful PUT.
|
||||||
|
|
||||||
|
MIME supports a conversation between the proxy and the storage nodes for every
|
||||||
|
PUT. This provides us with the ability to handle a PUT in one connection and
|
||||||
|
assure that we have the essence of a 2 phase commit, basically having the proxy
|
||||||
|
communicate back to the storage nodes once it has confirmation that all fragment
|
||||||
|
archives in the set have been committed. Note that we still require a quorum of
|
||||||
|
data elements of the conversation to complete before signaling status to the
|
||||||
|
client but we can relax that requirement for the commit phase such that only 2
|
||||||
|
confirmations to that phase of the conversation are required for success as the
|
||||||
|
reconstructor will assure propagation of markers that indicate data durability.
|
||||||
|
|
||||||
|
This provides the storage node with a cheap indicator of the last known durable
|
||||||
|
set of fragment archives for a given object on a successful durable PUT, this is
|
||||||
|
known as the ``.durable`` file. The presence of a ``.durable`` file means, to
|
||||||
|
the object server, `there is a set of ts.data files that are durable at
|
||||||
|
timestamp ts.` Note that the completion of the commit phase of the conversation
|
||||||
|
is also a signal for the object server to go ahead and immediately delete older
|
||||||
|
timestamp files for this object. This is critical as we do not want to delete
|
||||||
|
the older object until the storage node has confirmation from the proxy, via the
|
||||||
|
multi-phase conversation, that the other nodes have landed enough for a quorum.
|
||||||
|
|
||||||
|
The basic flow looks like this:
|
||||||
|
|
||||||
|
* The Proxy Server erasure codes and streams the object fragments
|
||||||
|
(ec_ndata + ec_nparity) to the storage nodes.
|
||||||
|
* The storage nodes store objects as EC archives and upon finishing object
|
||||||
|
data/metadata write, send a 1st-phase response to proxy.
|
||||||
|
* Upon quorum of storage nodes responses, the proxy initiates 2nd-phase by
|
||||||
|
sending commit confirmations to object servers.
|
||||||
|
* Upon receipt of commit message, object servers store a 0-byte data file as
|
||||||
|
`<timestamp>.durable` indicating successful PUT, and send a final response to
|
||||||
|
the proxy server.
|
||||||
|
* The proxy waits for a minimal number of two object servers to respond with a
|
||||||
|
success (2xx) status before responding to the client with a successful
|
||||||
|
status. In this particular case it was decided that two responses was
|
||||||
|
the mininum amount to know that the file would be propagated in case of
|
||||||
|
failure from other others and because a greater number would potentially
|
||||||
|
mean more latency, which should be avoided if possible.
|
||||||
|
|
||||||
|
Here is a high level example of what the conversation looks like::
|
||||||
|
|
||||||
|
proxy: PUT /p/a/c/o
|
||||||
|
Transfer-Encoding': 'chunked'
|
||||||
|
Expect': '100-continue'
|
||||||
|
X-Backend-Obj-Multiphase-Commit: yes
|
||||||
|
obj: 100 Continue
|
||||||
|
X-Obj-Multiphase-Commit: yes
|
||||||
|
proxy: --MIMEboundary
|
||||||
|
X-Document: object body
|
||||||
|
<obj_data>
|
||||||
|
--MIMEboundary
|
||||||
|
X-Document: object metadata
|
||||||
|
Content-MD5: <footer_meta_cksum>
|
||||||
|
<footer_meta>
|
||||||
|
--MIMEboundary
|
||||||
|
<object server writes data, metadata>
|
||||||
|
obj: 100 Continue
|
||||||
|
<quorum>
|
||||||
|
proxy: X-Document: put commit
|
||||||
|
commit_confirmation
|
||||||
|
--MIMEboundary--
|
||||||
|
<object server writes ts.durable state>
|
||||||
|
obj: 20x
|
||||||
|
<proxy waits to receive >=2 2xx responses>
|
||||||
|
proxy: 2xx -> client
|
||||||
|
|
||||||
|
A few key points on the .durable file:
|
||||||
|
|
||||||
|
* The .durable file means \"the matching .data file for this has sufficient
|
||||||
|
fragment archives somewhere, committed, to reconstruct the object\".
|
||||||
|
* The Proxy Server will never have knowledge, either on GET or HEAD, of the
|
||||||
|
existence of a .data file on an object server if it does not have a matching
|
||||||
|
.durable file.
|
||||||
|
* The object server will never return a .data that does not have a matching
|
||||||
|
.durable.
|
||||||
|
* When a proxy does a GET, it will only receive fragment archives that have
|
||||||
|
enough present somewhere to be reconstructed.
|
||||||
|
|
||||||
|
Partial PUT Failures
|
||||||
|
====================
|
||||||
|
|
||||||
|
A partial PUT failure has a few different modes. In one scenario the Proxy
|
||||||
|
Server is alive through the entire PUT conversation. This is a very
|
||||||
|
straightforward case. The client will receive a good response if and only if a
|
||||||
|
quorum of fragment archives were successfully landed on their storage nodes. In
|
||||||
|
this case the Reconstructor will discover the missing fragment archives, perform
|
||||||
|
a reconstruction and deliver fragment archives and their matching .durable files
|
||||||
|
to the nodes.
|
||||||
|
|
||||||
|
The more interesting case is what happens if the proxy dies in the middle of a
|
||||||
|
conversation. If it turns out that a quorum had been met and the commit phase
|
||||||
|
of the conversation finished, its as simple as the previous case in that the
|
||||||
|
reconstructor will repair things. However, if the commit didn't get a change to
|
||||||
|
happen then some number of the storage nodes have .data files on them (fragment
|
||||||
|
archives) but none of them knows whether there are enough elsewhere for the
|
||||||
|
entire object to be reconstructed. In this case the client will not have
|
||||||
|
received a 2xx response so there is no issue there, however, it is left to the
|
||||||
|
storage nodes to clean up the stale fragment archives. Work is ongoing in this
|
||||||
|
area to enable the proxy to play a role in reviving these fragment archives,
|
||||||
|
however, for the current release, a proxy failure after the start of a
|
||||||
|
conversation but before the commit message will simply result in a PUT failure.
|
||||||
|
|
||||||
|
GET
|
||||||
|
===
|
||||||
|
|
||||||
|
The GET for EC is different enough from replication that subclassing the
|
||||||
|
`BaseObjectController` to the `ECObjectController` enables an efficient way to
|
||||||
|
implement the high level steps described earlier:
|
||||||
|
|
||||||
|
#. The proxy server makes simultaneous requests to participating nodes.
|
||||||
|
#. As soon as the proxy has the fragments it needs, it calls on PyECLib to
|
||||||
|
decode the data.
|
||||||
|
#. The proxy streams the decoded data it has back to the client.
|
||||||
|
#. Repeat until the proxy is done sending data back to the client.
|
||||||
|
|
||||||
|
The GET path will attempt to contact all nodes participating in the EC scheme,
|
||||||
|
if not enough primaries respond then handoffs will be contacted just as with
|
||||||
|
replication. Etag and content length headers are updated for the client
|
||||||
|
response following reconstruction as the individual fragment archives metadata
|
||||||
|
is valid only for that fragment archive.
|
||||||
|
|
||||||
|
Object Server
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The Object Server, like the Proxy Server, supports MIME conversations as
|
||||||
|
described in the proxy section earlier. This includes processing of the commit
|
||||||
|
message and decoding various sections of the MIME document to extract the footer
|
||||||
|
which includes things like the entire object etag.
|
||||||
|
|
||||||
|
DiskFile
|
||||||
|
========
|
||||||
|
|
||||||
|
Erasure code uses subclassed ``ECDiskFile``, ``ECDiskFileWriter`` and
|
||||||
|
``ECDiskFileManager`` to impement EC specific handling of on disk files. This
|
||||||
|
includes things like file name manipulation to include the fragment index in the
|
||||||
|
filename, determination of valid .data files based on .durable presence,
|
||||||
|
construction of EC specific hashes.pkl file to include fragment index
|
||||||
|
information, etc., etc.
|
||||||
|
|
||||||
|
Metadata
|
||||||
|
--------
|
||||||
|
|
||||||
|
There are few different categories of metadata that are associated with EC:
|
||||||
|
|
||||||
|
System Metadata: EC has a set of object level system metadata that it
|
||||||
|
attaches to each of the EC archives. The metadata is for internal use only:
|
||||||
|
|
||||||
|
* ``X-Object-Sysmeta-EC-Etag``: The Etag of the original object.
|
||||||
|
* ``X-Object-Sysmeta-EC-Content-Length``: The content length of the original
|
||||||
|
object.
|
||||||
|
* ``X-Object-Sysmeta-EC-Frag-Index``: The fragment index for the object.
|
||||||
|
* ``X-Object-Sysmeta-EC-Scheme``: Description of the EC policy used to encode
|
||||||
|
the object.
|
||||||
|
* ``X-Object-Sysmeta-EC-Segment-Size``: The segment size used for the object.
|
||||||
|
|
||||||
|
User Metadata: User metadata is unaffected by EC, however, a full copy of the
|
||||||
|
user metadata is stored with every EC archive. This is required as the
|
||||||
|
reconstructor needs this information and each reconstructor only communicates
|
||||||
|
with its closest neighbors on the ring.
|
||||||
|
|
||||||
|
PyECLib Metadata: PyECLib stores a small amount of metadata on a per fragment
|
||||||
|
basis. This metadata is not documented here as it is opaque to Swift.
|
||||||
|
|
||||||
|
Database Updates
|
||||||
|
----------------
|
||||||
|
|
||||||
|
As account and container rings are not associated with a Storage Policy, there
|
||||||
|
is no change to how these database updates occur when using an EC policy.
|
||||||
|
|
||||||
|
The Reconstructor
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
The Reconstructor performs analogous functions to the replicator:
|
||||||
|
|
||||||
|
#. Recovery from disk drive failure.
|
||||||
|
#. Moving data around because of a rebalance.
|
||||||
|
#. Reverting data back to a primary from a handoff.
|
||||||
|
#. Recovering fragment archives from bit rot discovered by the auditor.
|
||||||
|
|
||||||
|
However, under the hood it operates quite differently. The following are some
|
||||||
|
of the key elements in understanding how the reconstructor operates.
|
||||||
|
|
||||||
|
Unlike the replicator, the work that the reconstructor does is not always as
|
||||||
|
easy to break down into the 2 basic tasks of synchronize or revert (move data
|
||||||
|
from handoff back to primary) because of the fact that one storage node can
|
||||||
|
house fragment archives of various indexes and each index really /"belongs/" to
|
||||||
|
a different node. So, whereas when the replicator is reverting data from a
|
||||||
|
handoff it has just one node to send its data to, the reconstructor can have
|
||||||
|
several. Additionally, its not always the case that the processing of a
|
||||||
|
particular suffix directory means one or the other for the entire directory (as
|
||||||
|
it does for replication). The scenarios that create these mixed situations can
|
||||||
|
be pretty complex so we will just focus on what the reconstructor does here and
|
||||||
|
not a detailed explanation of why.
|
||||||
|
|
||||||
|
Job Construction and Processing
|
||||||
|
===============================
|
||||||
|
|
||||||
|
Because of the nature of the work it has to do as described above, the
|
||||||
|
reconstructor builds jobs for a single job processor. The job itself contains
|
||||||
|
all of the information needed for the processor to execute the job which may be
|
||||||
|
a synchronization or a data reversion and there may be a mix of jobs that
|
||||||
|
perform both of these operations on the same suffix directory.
|
||||||
|
|
||||||
|
Jobs are constructed on a per partition basis and then per fragment index basis.
|
||||||
|
That is, there will be one job for every fragment index in a partition.
|
||||||
|
Performing this construction \"up front\" like this helps minimize the
|
||||||
|
interaction between nodes collecting hashes.pkl information.
|
||||||
|
|
||||||
|
Once a set of jobs for a partition has been constructed, those jobs are sent off
|
||||||
|
to threads for execution. The single job processor then performs the necessary
|
||||||
|
actions working closely with ssync to carry out its instructions. For data
|
||||||
|
reversion, the actual objects themselves are cleaned up via the ssync module and
|
||||||
|
once that partition's set of jobs is complete, the reconstructor will attempt to
|
||||||
|
remove the relevant directory structures.
|
||||||
|
|
||||||
|
The scenarios that job construction has to take into account include:
|
||||||
|
|
||||||
|
#. A partition directory with all fragment indexes matching the local node
|
||||||
|
index. This is the case where everything is where it belongs and we just
|
||||||
|
need to compare hashes and sync if needed, here we sync with our partners.
|
||||||
|
#. A partition directory with one local fragment index and mix of others. Here
|
||||||
|
we need to sync with our partners where fragment indexes matches the
|
||||||
|
local_id, all others are sync'd with their home nodes and then deleted.
|
||||||
|
#. A partition directory with no local fragment index and just one or more of
|
||||||
|
others. Here we sync with just the home nodes for the fragment indexes that
|
||||||
|
we have and then all the local archives are deleted. This is the basic
|
||||||
|
handoff reversion case.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
A \"home node\" is the node where the fragment index encoded in the
|
||||||
|
fragment archive's filename matches the node index of a node in the primary
|
||||||
|
partition list.
|
||||||
|
|
||||||
|
Node Communication
|
||||||
|
==================
|
||||||
|
|
||||||
|
The replicators talk to all nodes who have a copy of their object, typically
|
||||||
|
just 2 other nodes. For EC, having each reconstructor node talk to all nodes
|
||||||
|
would incur a large amount of overhead as there will typically be a much larger
|
||||||
|
number of nodes participating in the EC scheme. Therefore, the reconstructor is
|
||||||
|
built to talk to its adjacent nodes on the ring only. These nodes are typically
|
||||||
|
referred to as partners.
|
||||||
|
|
||||||
|
Reconstruction
|
||||||
|
==============
|
||||||
|
|
||||||
|
Reconstruction can be thought of sort of like replication but with an extra step
|
||||||
|
in the middle. The reconstructor is hard-wired to use ssync to determine what is
|
||||||
|
missing and desired by the other side. However, before an object is sent over
|
||||||
|
the wire it needs to be reconstructed from the remaining fragments as the local
|
||||||
|
fragment is just that - a different fragment index than what the other end is
|
||||||
|
asking for.
|
||||||
|
|
||||||
|
Thus, there are hooks in ssync for EC based policies. One case would be for
|
||||||
|
basic reconstruction which, at a high level, looks like this:
|
||||||
|
|
||||||
|
* Determine which nodes need to be contacted to collect other EC archives needed
|
||||||
|
to perform reconstruction.
|
||||||
|
* Update the etag and fragment index metadata elements of the newly constructed
|
||||||
|
fragment archive.
|
||||||
|
* Establish a connection to the target nodes and give ssync a DiskFileLike class
|
||||||
|
that it can stream data from.
|
||||||
|
|
||||||
|
The reader in this class gathers fragments from the nodes and uses PyECLib to
|
||||||
|
reconstruct each segment before yielding data back to ssync. Essentially what
|
||||||
|
this means is that data is buffered, in memory, on a per segment basis at the
|
||||||
|
node performing reconstruction and each segment is dynamically reconstructed and
|
||||||
|
delivered to `ssync_sender` where the `send_put()` method will ship them on
|
||||||
|
over. The sender is then responsible for deleting the objects as they are sent
|
||||||
|
in the case of data reversion.
|
||||||
|
|
||||||
|
The Auditor
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Because the auditor already operates on a per storage policy basis, there are no
|
||||||
|
specific auditor changes associated with EC. Each EC archive looks like, and is
|
||||||
|
treated like, a regular object from the perspective of the auditor. Therefore,
|
||||||
|
if the auditor finds bit-rot in an EC archive, it simply quarantines it and the
|
||||||
|
reconstructor will take care of the rest just as the replicator does for
|
||||||
|
replication policies.
|
@ -8,22 +8,22 @@ feature is implemented throughout the entire code base so it is an important
|
|||||||
concept in understanding Swift architecture.
|
concept in understanding Swift architecture.
|
||||||
|
|
||||||
As described in :doc:`overview_ring`, Swift uses modified hashing rings to
|
As described in :doc:`overview_ring`, Swift uses modified hashing rings to
|
||||||
determine where data should reside in the cluster. There is a separate ring
|
determine where data should reside in the cluster. There is a separate ring for
|
||||||
for account databases, container databases, and there is also one object
|
account databases, container databases, and there is also one object ring per
|
||||||
ring per storage policy. Each object ring behaves exactly the same way
|
storage policy. Each object ring behaves exactly the same way and is maintained
|
||||||
and is maintained in the same manner, but with policies, different devices
|
in the same manner, but with policies, different devices can belong to different
|
||||||
can belong to different rings with varying levels of replication. By supporting
|
rings. By supporting multiple object rings, Swift allows the application and/or
|
||||||
multiple object rings, Swift allows the application and/or deployer to
|
deployer to essentially segregate the object storage within a single cluster.
|
||||||
essentially segregate the object storage within a single cluster. There are
|
There are many reasons why this might be desirable:
|
||||||
many reasons why this might be desirable:
|
|
||||||
|
|
||||||
* Different levels of replication: If a provider wants to offer, for example,
|
* Different levels of durability: If a provider wants to offer, for example,
|
||||||
2x replication and 3x replication but doesn't want to maintain 2 separate clusters,
|
2x replication and 3x replication but doesn't want to maintain 2 separate
|
||||||
they would setup a 2x policy and a 3x policy and assign the nodes to their
|
clusters, they would setup a 2x and a 3x replication policy and assign the
|
||||||
respective rings.
|
nodes to their respective rings. Furthermore, if a provider wanted to offer a
|
||||||
|
cold storage tier, they could create an erasure coded policy.
|
||||||
|
|
||||||
* Performance: Just as SSDs can be used as the exclusive members of an account or
|
* Performance: Just as SSDs can be used as the exclusive members of an account
|
||||||
database ring, an SSD-only object ring can be created as well and used to
|
or database ring, an SSD-only object ring can be created as well and used to
|
||||||
implement a low-latency/high performance policy.
|
implement a low-latency/high performance policy.
|
||||||
|
|
||||||
* Collecting nodes into group: Different object rings may have different
|
* Collecting nodes into group: Different object rings may have different
|
||||||
@ -36,10 +36,12 @@ many reasons why this might be desirable:
|
|||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Today, choosing a different storage policy allows the use of different
|
Today, Swift supports two different policy types: Replication and Erasure
|
||||||
object rings, but future policies (such as Erasure Coding) will also
|
Code. Erasure Code policy is currently a beta release and should not be
|
||||||
change some of the actual code paths when processing a request. Also note
|
used in a Production cluster. See :doc:`overview_erasure_code` for details.
|
||||||
that Diskfile refers to backend object storage plug-in architecture.
|
|
||||||
|
Also note that Diskfile refers to backend object storage plug-in
|
||||||
|
architecture. See :doc:`development_ondisk_backends` for details.
|
||||||
|
|
||||||
-----------------------
|
-----------------------
|
||||||
Containers and Policies
|
Containers and Policies
|
||||||
@ -61,31 +63,33 @@ Policy-0 is considered the default). We will be covering the difference
|
|||||||
between default and Policy-0 in the next section.
|
between default and Policy-0 in the next section.
|
||||||
|
|
||||||
Policies are assigned when a container is created. Once a container has been
|
Policies are assigned when a container is created. Once a container has been
|
||||||
assigned a policy, it cannot be changed (unless it is deleted/recreated). The implications
|
assigned a policy, it cannot be changed (unless it is deleted/recreated). The
|
||||||
on data placement/movement for large datasets would make this a task best left for
|
implications on data placement/movement for large datasets would make this a
|
||||||
applications to perform. Therefore, if a container has an existing policy of,
|
task best left for applications to perform. Therefore, if a container has an
|
||||||
for example 3x replication, and one wanted to migrate that data to a policy that specifies
|
existing policy of, for example 3x replication, and one wanted to migrate that
|
||||||
a different replication level, the application would create another container
|
data to an Erasure Code policy, the application would create another container
|
||||||
specifying the other policy name and then simply move the data from one container
|
specifying the other policy parameters and then simply move the data from one
|
||||||
to the other. Policies apply on a per container basis allowing for minimal application
|
container to the other. Policies apply on a per container basis allowing for
|
||||||
awareness; once a container has been created with a specific policy, all objects stored
|
minimal application awareness; once a container has been created with a specific
|
||||||
in it will be done so in accordance with that policy. If a container with a
|
policy, all objects stored in it will be done so in accordance with that policy.
|
||||||
specific name is deleted (requires the container be empty) a new container may
|
If a container with a specific name is deleted (requires the container be empty)
|
||||||
be created with the same name without any restriction on storage policy
|
a new container may be created with the same name without any restriction on
|
||||||
enforced by the deleted container which previously shared the same name.
|
storage policy enforced by the deleted container which previously shared the
|
||||||
|
same name.
|
||||||
|
|
||||||
Containers have a many-to-one relationship with policies meaning that any number
|
Containers have a many-to-one relationship with policies meaning that any number
|
||||||
of containers can share one policy. There is no limit to how many containers can use
|
of containers can share one policy. There is no limit to how many containers
|
||||||
a specific policy.
|
can use a specific policy.
|
||||||
|
|
||||||
The notion of associating a ring with a container introduces an interesting scenario:
|
The notion of associating a ring with a container introduces an interesting
|
||||||
What would happen if 2 containers of the same name were created with different
|
scenario: What would happen if 2 containers of the same name were created with
|
||||||
Storage Policies on either side of a network outage at the same time? Furthermore,
|
different Storage Policies on either side of a network outage at the same time?
|
||||||
what would happen if objects were placed in those containers, a whole bunch of them,
|
Furthermore, what would happen if objects were placed in those containers, a
|
||||||
and then later the network outage was restored? Well, without special care it would
|
whole bunch of them, and then later the network outage was restored? Well,
|
||||||
be a big problem as an application could end up using the wrong ring to try and find
|
without special care it would be a big problem as an application could end up
|
||||||
an object. Luckily there is a solution for this problem, a daemon known as the
|
using the wrong ring to try and find an object. Luckily there is a solution for
|
||||||
Container Reconciler works tirelessly to identify and rectify this potential scenario.
|
this problem, a daemon known as the Container Reconciler works tirelessly to
|
||||||
|
identify and rectify this potential scenario.
|
||||||
|
|
||||||
--------------------
|
--------------------
|
||||||
Container Reconciler
|
Container Reconciler
|
||||||
@ -184,9 +188,9 @@ this case we would not use the default as it might not have the same
|
|||||||
policy as legacy containers. When no other policies are defined, Swift
|
policy as legacy containers. When no other policies are defined, Swift
|
||||||
will always choose ``Policy-0`` as the default.
|
will always choose ``Policy-0`` as the default.
|
||||||
|
|
||||||
In other words, default means "create using this policy if nothing else is specified"
|
In other words, default means "create using this policy if nothing else is
|
||||||
and ``Policy-0`` means "use the legacy policy if a container doesn't have one" which
|
specified" and ``Policy-0`` means "use the legacy policy if a container doesn't
|
||||||
really means use ``object.ring.gz`` for lookups.
|
have one" which really means use ``object.ring.gz`` for lookups.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@ -244,17 +248,19 @@ not mark the policy as deprecated to all nodes.
|
|||||||
Configuring Policies
|
Configuring Policies
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
Policies are configured in ``swift.conf`` and it is important that the deployer have a solid
|
Policies are configured in ``swift.conf`` and it is important that the deployer
|
||||||
understanding of the semantics for configuring policies. Recall that a policy must have
|
have a solid understanding of the semantics for configuring policies. Recall
|
||||||
a corresponding ring file, so configuring a policy is a two-step process. First, edit
|
that a policy must have a corresponding ring file, so configuring a policy is a
|
||||||
your ``/etc/swift/swift.conf`` file to add your new policy and, second, create the
|
two-step process. First, edit your ``/etc/swift/swift.conf`` file to add your
|
||||||
corresponding policy object ring file.
|
new policy and, second, create the corresponding policy object ring file.
|
||||||
|
|
||||||
See :doc:`policies_saio` for a step by step guide on adding a policy to the SAIO setup.
|
See :doc:`policies_saio` for a step by step guide on adding a policy to the SAIO
|
||||||
|
setup.
|
||||||
|
|
||||||
Note that each policy has a section starting with ``[storage-policy:N]`` where N is the
|
Note that each policy has a section starting with ``[storage-policy:N]`` where N
|
||||||
policy index. There's no reason other than readability that these be sequential but there
|
is the policy index. There's no reason other than readability that these be
|
||||||
are a number of rules enforced by Swift when parsing this file:
|
sequential but there are a number of rules enforced by Swift when parsing this
|
||||||
|
file:
|
||||||
|
|
||||||
* If a policy with index 0 is not declared and no other policies defined,
|
* If a policy with index 0 is not declared and no other policies defined,
|
||||||
Swift will create one
|
Swift will create one
|
||||||
@ -269,9 +275,11 @@ are a number of rules enforced by Swift when parsing this file:
|
|||||||
* The policy name 'Policy-0' can only be used for the policy with index 0
|
* The policy name 'Policy-0' can only be used for the policy with index 0
|
||||||
* If any policies are defined, exactly one policy must be declared default
|
* If any policies are defined, exactly one policy must be declared default
|
||||||
* Deprecated policies cannot be declared the default
|
* Deprecated policies cannot be declared the default
|
||||||
|
* If no ``policy_type`` is provided, ``replication`` is the default value.
|
||||||
|
|
||||||
The following is an example of a properly configured ``swift.conf`` file. See :doc:`policies_saio`
|
The following is an example of a properly configured ``swift.conf`` file. See
|
||||||
for full instructions on setting up an all-in-one with this example configuration.::
|
:doc:`policies_saio` for full instructions on setting up an all-in-one with this
|
||||||
|
example configuration.::
|
||||||
|
|
||||||
[swift-hash]
|
[swift-hash]
|
||||||
# random unique strings that can never change (DO NOT LOSE)
|
# random unique strings that can never change (DO NOT LOSE)
|
||||||
@ -280,10 +288,12 @@ for full instructions on setting up an all-in-one with this example configuratio
|
|||||||
|
|
||||||
[storage-policy:0]
|
[storage-policy:0]
|
||||||
name = gold
|
name = gold
|
||||||
|
policy_type = replication
|
||||||
default = yes
|
default = yes
|
||||||
|
|
||||||
[storage-policy:1]
|
[storage-policy:1]
|
||||||
name = silver
|
name = silver
|
||||||
|
policy_type = replication
|
||||||
deprecated = yes
|
deprecated = yes
|
||||||
|
|
||||||
Review :ref:`default-policy` and :ref:`deprecate-policy` for more
|
Review :ref:`default-policy` and :ref:`deprecate-policy` for more
|
||||||
@ -300,11 +310,14 @@ There are some other considerations when managing policies:
|
|||||||
the desired policy section, but a deprecated policy may not also
|
the desired policy section, but a deprecated policy may not also
|
||||||
be declared the default, and you must specify a default - so you
|
be declared the default, and you must specify a default - so you
|
||||||
must have policy which is not deprecated at all times.
|
must have policy which is not deprecated at all times.
|
||||||
|
* The option ``policy_type`` is used to distinguish between different
|
||||||
|
policy types. The default value is ``replication``. When defining an EC
|
||||||
|
policy use the value ``erasure_coding``.
|
||||||
|
* The EC policy has additional required parameters. See
|
||||||
|
:doc:`overview_erasure_code` for details.
|
||||||
|
|
||||||
There will be additional parameters for policies as new features are added
|
Once ``swift.conf`` is configured for a new policy, a new ring must be created.
|
||||||
(e.g., Erasure Code), but for now only a section name/index and name are
|
The ring tools are not policy name aware so it's critical that the
|
||||||
required. Once ``swift.conf`` is configured for a new policy, a new ring must be
|
|
||||||
created. The ring tools are not policy name aware so it's critical that the
|
|
||||||
correct policy index be used when creating the new policy's ring file.
|
correct policy index be used when creating the new policy's ring file.
|
||||||
Additional object rings are created in the same manner as the legacy ring
|
Additional object rings are created in the same manner as the legacy ring
|
||||||
except that '-N' is appended after the word ``object`` where N matches the
|
except that '-N' is appended after the word ``object`` where N matches the
|
||||||
@ -404,43 +417,47 @@ Middleware
|
|||||||
----------
|
----------
|
||||||
|
|
||||||
Middleware can take advantage of policies through the :data:`.POLICIES` global
|
Middleware can take advantage of policies through the :data:`.POLICIES` global
|
||||||
and by importing :func:`.get_container_info` to gain access to the policy
|
and by importing :func:`.get_container_info` to gain access to the policy index
|
||||||
index associated with the container in question. From the index it
|
associated with the container in question. From the index it can then use the
|
||||||
can then use the :data:`.POLICIES` singleton to grab the right ring. For example,
|
:data:`.POLICIES` singleton to grab the right ring. For example,
|
||||||
:ref:`list_endpoints` is policy aware using the means just described. Another
|
:ref:`list_endpoints` is policy aware using the means just described. Another
|
||||||
example is :ref:`recon` which will report the md5 sums for all of the rings.
|
example is :ref:`recon` which will report the md5 sums for all of the rings.
|
||||||
|
|
||||||
Proxy Server
|
Proxy Server
|
||||||
------------
|
------------
|
||||||
|
|
||||||
The :ref:`proxy-server` module's role in Storage Policies is essentially to make sure the
|
The :ref:`proxy-server` module's role in Storage Policies is essentially to make
|
||||||
correct ring is used as its member element. Before policies, the one object ring
|
sure the correct ring is used as its member element. Before policies, the one
|
||||||
would be instantiated when the :class:`.Application` class was instantiated and could
|
object ring would be instantiated when the :class:`.Application` class was
|
||||||
be overridden by test code via init parameter. With policies, however, there is
|
instantiated and could be overridden by test code via init parameter. With
|
||||||
no init parameter and the :class:`.Application` class instead depends on the :data:`.POLICIES`
|
policies, however, there is no init parameter and the :class:`.Application`
|
||||||
global singleton to retrieve the ring which is instantiated the first time it's
|
class instead depends on the :data:`.POLICIES` global singleton to retrieve the
|
||||||
needed. So, instead of an object ring member of the :class:`.Application` class, there is
|
ring which is instantiated the first time it's needed. So, instead of an object
|
||||||
an accessor function, :meth:`~.Application.get_object_ring`, that gets the ring from :data:`.POLICIES`.
|
ring member of the :class:`.Application` class, there is an accessor function,
|
||||||
|
:meth:`~.Application.get_object_ring`, that gets the ring from
|
||||||
|
:data:`.POLICIES`.
|
||||||
|
|
||||||
In general, when any module running on the proxy requires an object ring, it
|
In general, when any module running on the proxy requires an object ring, it
|
||||||
does so via first getting the policy index from the cached container info. The
|
does so via first getting the policy index from the cached container info. The
|
||||||
exception is during container creation where it uses the policy name from the
|
exception is during container creation where it uses the policy name from the
|
||||||
request header to look up policy index from the :data:`.POLICIES` global. Once the
|
request header to look up policy index from the :data:`.POLICIES` global. Once
|
||||||
proxy has determined the policy index, it can use the :meth:`~.Application.get_object_ring` method
|
the proxy has determined the policy index, it can use the
|
||||||
described earlier to gain access to the correct ring. It then has the responsibility
|
:meth:`~.Application.get_object_ring` method described earlier to gain access to
|
||||||
of passing the index information, not the policy name, on to the back-end servers
|
the correct ring. It then has the responsibility of passing the index
|
||||||
via the header ``X-Backend-Storage-Policy-Index``. Going the other way, the proxy also
|
information, not the policy name, on to the back-end servers via the header ``X
|
||||||
strips the index out of headers that go back to clients, and makes sure they only
|
-Backend-Storage-Policy-Index``. Going the other way, the proxy also strips the
|
||||||
see the friendly policy names.
|
index out of headers that go back to clients, and makes sure they only see the
|
||||||
|
friendly policy names.
|
||||||
|
|
||||||
On Disk Storage
|
On Disk Storage
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
Policies each have their own directories on the back-end servers and are identified by
|
Policies each have their own directories on the back-end servers and are
|
||||||
their storage policy indexes. Organizing the back-end directory structures by policy
|
identified by their storage policy indexes. Organizing the back-end directory
|
||||||
index helps keep track of things and also allows for sharing of disks between policies
|
structures by policy index helps keep track of things and also allows for
|
||||||
which may or may not make sense depending on the needs of the provider. More
|
sharing of disks between policies which may or may not make sense depending on
|
||||||
on this later, but for now be aware of the following directory naming convention:
|
the needs of the provider. More on this later, but for now be aware of the
|
||||||
|
following directory naming convention:
|
||||||
|
|
||||||
* ``/objects`` maps to objects associated with Policy-0
|
* ``/objects`` maps to objects associated with Policy-0
|
||||||
* ``/objects-N`` maps to storage policy index #N
|
* ``/objects-N`` maps to storage policy index #N
|
||||||
@ -466,19 +483,19 @@ policy index and leaves the actual directory naming/structure mechanisms to
|
|||||||
:class:`.Diskfile` being used will assure that data is properly located in the
|
:class:`.Diskfile` being used will assure that data is properly located in the
|
||||||
tree based on its policy.
|
tree based on its policy.
|
||||||
|
|
||||||
For the same reason, the :ref:`object-updater` also is policy aware. As previously
|
For the same reason, the :ref:`object-updater` also is policy aware. As
|
||||||
described, different policies use different async pending directories so the
|
previously described, different policies use different async pending directories
|
||||||
updater needs to know how to scan them appropriately.
|
so the updater needs to know how to scan them appropriately.
|
||||||
|
|
||||||
The :ref:`object-replicator` is policy aware in that, depending on the policy, it may have to
|
The :ref:`object-replicator` is policy aware in that, depending on the policy,
|
||||||
do drastically different things, or maybe not. For example, the difference in
|
it may have to do drastically different things, or maybe not. For example, the
|
||||||
handling a replication job for 2x versus 3x is trivial; however, the difference in
|
difference in handling a replication job for 2x versus 3x is trivial; however,
|
||||||
handling replication between 3x and erasure code is most definitely not. In
|
the difference in handling replication between 3x and erasure code is most
|
||||||
fact, the term 'replication' really isn't appropriate for some policies
|
definitely not. In fact, the term 'replication' really isn't appropriate for
|
||||||
like erasure code; however, the majority of the framework for collecting and
|
some policies like erasure code; however, the majority of the framework for
|
||||||
processing jobs is common. Thus, those functions in the replicator are
|
collecting and processing jobs is common. Thus, those functions in the
|
||||||
leveraged for all policies and then there is policy specific code required for
|
replicator are leveraged for all policies and then there is policy specific code
|
||||||
each policy, added when the policy is defined if needed.
|
required for each policy, added when the policy is defined if needed.
|
||||||
|
|
||||||
The ssync functionality is policy aware for the same reason. Some of the
|
The ssync functionality is policy aware for the same reason. Some of the
|
||||||
other modules may not obviously be affected, but the back-end directory
|
other modules may not obviously be affected, but the back-end directory
|
||||||
@ -487,25 +504,26 @@ parameter. Therefore ssync being policy aware really means passing the
|
|||||||
policy index along. See :class:`~swift.obj.ssync_sender` and
|
policy index along. See :class:`~swift.obj.ssync_sender` and
|
||||||
:class:`~swift.obj.ssync_receiver` for more information on ssync.
|
:class:`~swift.obj.ssync_receiver` for more information on ssync.
|
||||||
|
|
||||||
For :class:`.Diskfile` itself, being policy aware is all about managing the back-end
|
For :class:`.Diskfile` itself, being policy aware is all about managing the
|
||||||
structure using the provided policy index. In other words, callers who get
|
back-end structure using the provided policy index. In other words, callers who
|
||||||
a :class:`.Diskfile` instance provide a policy index and :class:`.Diskfile`'s job is to keep data
|
get a :class:`.Diskfile` instance provide a policy index and
|
||||||
separated via this index (however it chooses) such that policies can share
|
:class:`.Diskfile`'s job is to keep data separated via this index (however it
|
||||||
the same media/nodes if desired. The included implementation of :class:`.Diskfile`
|
chooses) such that policies can share the same media/nodes if desired. The
|
||||||
lays out the directory structure described earlier but that's owned within
|
included implementation of :class:`.Diskfile` lays out the directory structure
|
||||||
:class:`.Diskfile`; external modules have no visibility into that detail. A common
|
described earlier but that's owned within :class:`.Diskfile`; external modules
|
||||||
function is provided to map various directory names and/or strings
|
have no visibility into that detail. A common function is provided to map
|
||||||
based on their policy index. For example :class:`.Diskfile` defines :func:`.get_data_dir`
|
various directory names and/or strings based on their policy index. For example
|
||||||
which builds off of a generic :func:`.get_policy_string` to consistently build
|
:class:`.Diskfile` defines :func:`.get_data_dir` which builds off of a generic
|
||||||
policy aware strings for various usage.
|
:func:`.get_policy_string` to consistently build policy aware strings for
|
||||||
|
various usage.
|
||||||
|
|
||||||
Container Server
|
Container Server
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
The :ref:`container-server` plays a very important role in Storage Policies, it is
|
The :ref:`container-server` plays a very important role in Storage Policies, it
|
||||||
responsible for handling the assignment of a policy to a container and the
|
is responsible for handling the assignment of a policy to a container and the
|
||||||
prevention of bad things like changing policies or picking the wrong policy
|
prevention of bad things like changing policies or picking the wrong policy to
|
||||||
to use when nothing is specified (recall earlier discussion on Policy-0 versus
|
use when nothing is specified (recall earlier discussion on Policy-0 versus
|
||||||
default).
|
default).
|
||||||
|
|
||||||
The :ref:`container-updater` is policy aware, however its job is very simple, to
|
The :ref:`container-updater` is policy aware, however its job is very simple, to
|
||||||
@ -538,19 +556,19 @@ migrated to be fully compatible with the post-storage-policy queries without
|
|||||||
having to fall back and retry queries with the legacy schema to service
|
having to fall back and retry queries with the legacy schema to service
|
||||||
container read requests.
|
container read requests.
|
||||||
|
|
||||||
The :ref:`container-sync-daemon` functionality only needs to be policy aware in that it
|
The :ref:`container-sync-daemon` functionality only needs to be policy aware in
|
||||||
accesses the object rings. Therefore, it needs to pull the policy index
|
that it accesses the object rings. Therefore, it needs to pull the policy index
|
||||||
out of the container information and use it to select the appropriate
|
out of the container information and use it to select the appropriate object
|
||||||
object ring from the :data:`.POLICIES` global.
|
ring from the :data:`.POLICIES` global.
|
||||||
|
|
||||||
Account Server
|
Account Server
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
The :ref:`account-server`'s role in Storage Policies is really limited to reporting.
|
The :ref:`account-server`'s role in Storage Policies is really limited to
|
||||||
When a HEAD request is made on an account (see example provided earlier),
|
reporting. When a HEAD request is made on an account (see example provided
|
||||||
the account server is provided with the storage policy index and builds
|
earlier), the account server is provided with the storage policy index and
|
||||||
the ``object_count`` and ``byte_count`` information for the client on a per
|
builds the ``object_count`` and ``byte_count`` information for the client on a
|
||||||
policy basis.
|
per policy basis.
|
||||||
|
|
||||||
The account servers are able to report per-storage-policy object and byte
|
The account servers are able to report per-storage-policy object and byte
|
||||||
counts because of some policy specific DB schema changes. A policy specific
|
counts because of some policy specific DB schema changes. A policy specific
|
||||||
@ -564,23 +582,23 @@ pre-storage-policy accounts by altering the DB schema and populating the
|
|||||||
point in time.
|
point in time.
|
||||||
|
|
||||||
The per-storage-policy object and byte counts are not updated with each object
|
The per-storage-policy object and byte counts are not updated with each object
|
||||||
PUT and DELETE request, instead container updates to the account server are performed
|
PUT and DELETE request, instead container updates to the account server are
|
||||||
asynchronously by the ``swift-container-updater``.
|
performed asynchronously by the ``swift-container-updater``.
|
||||||
|
|
||||||
.. _upgrade-policy:
|
.. _upgrade-policy:
|
||||||
|
|
||||||
Upgrading and Confirming Functionality
|
Upgrading and Confirming Functionality
|
||||||
--------------------------------------
|
--------------------------------------
|
||||||
|
|
||||||
Upgrading to a version of Swift that has Storage Policy support is not difficult,
|
Upgrading to a version of Swift that has Storage Policy support is not
|
||||||
in fact, the cluster administrator isn't required to make any special configuration
|
difficult, in fact, the cluster administrator isn't required to make any special
|
||||||
changes to get going. Swift will automatically begin using the existing object
|
configuration changes to get going. Swift will automatically begin using the
|
||||||
ring as both the default ring and the Policy-0 ring. Adding the declaration of
|
existing object ring as both the default ring and the Policy-0 ring. Adding the
|
||||||
policy 0 is totally optional and in its absence, the name given to the implicit
|
declaration of policy 0 is totally optional and in its absence, the name given
|
||||||
policy 0 will be 'Policy-0'. Let's say for testing purposes that you wanted to take
|
to the implicit policy 0 will be 'Policy-0'. Let's say for testing purposes
|
||||||
an existing cluster that already has lots of data on it and upgrade to Swift with
|
that you wanted to take an existing cluster that already has lots of data on it
|
||||||
Storage Policies. From there you want to go ahead and create a policy and test a
|
and upgrade to Swift with Storage Policies. From there you want to go ahead and
|
||||||
few things out. All you need to do is:
|
create a policy and test a few things out. All you need to do is:
|
||||||
|
|
||||||
#. Upgrade all of your Swift nodes to a policy-aware version of Swift
|
#. Upgrade all of your Swift nodes to a policy-aware version of Swift
|
||||||
#. Define your policies in ``/etc/swift/swift.conf``
|
#. Define your policies in ``/etc/swift/swift.conf``
|
||||||
|
@ -111,11 +111,53 @@ Another improvement planned all along the way is separating the local disk
|
|||||||
structure from the protocol path structure. This separation will allow ring
|
structure from the protocol path structure. This separation will allow ring
|
||||||
resizing at some point, or at least ring-doubling.
|
resizing at some point, or at least ring-doubling.
|
||||||
|
|
||||||
FOR NOW, IT IS NOT RECOMMENDED TO USE SSYNC ON PRODUCTION CLUSTERS. Some of us
|
Note that for objects being stored with an Erasure Code policy, the replicator
|
||||||
will be in a limited fashion to look for any subtle issues, tuning, etc. but
|
daemon is not involved. Instead, the reconstructor is used by Erasure Code
|
||||||
generally ssync is an experimental feature. In its current implementation it is
|
policies and is analogous to the replicator for Replication type policies.
|
||||||
probably going to be a bit slower than RSync, but if all goes according to plan
|
See :doc:`overview_erasure_code` for complete information on both Erasure Code
|
||||||
it will end up much faster.
|
support as well as the reconstructor.
|
||||||
|
|
||||||
|
----------
|
||||||
|
Hashes.pkl
|
||||||
|
----------
|
||||||
|
|
||||||
|
The hashes.pkl file is a key element for both replication and reconstruction
|
||||||
|
(for Erasure Coding). Both daemons use this file to determine if any kind of
|
||||||
|
action is required between nodes that are participating in the durability
|
||||||
|
scheme. The file itself is a pickled dictionary with slightly different
|
||||||
|
formats depending on whether the policy is Replication or Erasure Code. In
|
||||||
|
either case, however, the same basic information is provided between the
|
||||||
|
nodes. The dictionary contains a dictionary where the key is a suffix
|
||||||
|
directory name and the value is the MD5 hash of the directory listing for
|
||||||
|
that suffix. In this manner, the daemon can quickly identify differences
|
||||||
|
between local and remote suffix directories on a per partition basis as the
|
||||||
|
scope of any one hashes.pkl file is a partition directory.
|
||||||
|
|
||||||
|
For Erasure Code policies, there is a little more information required. An
|
||||||
|
object's hash directory may contain multiple fragments of a single object in
|
||||||
|
the event that the node is acting as a handoff or perhaps if a rebalance is
|
||||||
|
underway. Each fragment of an object is stored with a fragment index, so
|
||||||
|
the hashes.pkl for an Erasure Code partition will still be a dictionary
|
||||||
|
keyed on the suffix directory name, however, the value is another dictionary
|
||||||
|
keyed on the fragment index with subsequent MD5 hashes for each one as
|
||||||
|
values. Some files within an object hash directory don't require a fragment
|
||||||
|
index so None is used to represent those. Below are examples of what these
|
||||||
|
dictionaries might look like.
|
||||||
|
|
||||||
|
Replication hashes.pkl::
|
||||||
|
|
||||||
|
{'a43': '72018c5fbfae934e1f56069ad4425627',
|
||||||
|
'b23': '12348c5fbfae934e1f56069ad4421234'}
|
||||||
|
|
||||||
|
Erasure Code hashes.pkl::
|
||||||
|
|
||||||
|
{'a43': {None: '72018c5fbfae934e1f56069ad4425627',
|
||||||
|
2: 'b6dd6db937cb8748f50a5b6e4bc3b808'},
|
||||||
|
'b23': {None: '12348c5fbfae934e1f56069ad4421234',
|
||||||
|
1: '45676db937cb8748f50a5b6e4bc34567'}}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
Loading…
x
Reference in New Issue
Block a user