From bb8c4eab41803d59e1d52ebc12ff50f29fba1171 Mon Sep 17 00:00:00 2001 From: Florian Hines Date: Tue, 18 Oct 2011 21:10:50 +0000 Subject: [PATCH] Add documentation for Swift Recon. Change-Id: I37f4fb624bdc5b8bbf2e691d29aa6b15cd648aa8 --- doc/source/admin_guide.rst | 95 +++++++++++++++++++++++++++++++++++ doc/source/misc.rst | 9 ++++ etc/object-server.conf-sample | 9 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index d4033c7430..77eb267b0c 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -222,6 +222,101 @@ place and then rerun the dispersion report:: Sample represents 1.00% of the object partition space +-------------------------------- +Cluster Telemetry and Monitoring +-------------------------------- + +Various metrics and telemetry can be obtained from the object servers using +the recon server middleware and the swift-recon cli. To do so, update your +object-server.conf to enable the recon middleware by adding a pipeline entry +and setting its one option:: + + [pipeline:main] + pipeline = recon object-server + + [filter:recon] + use = egg:swift#recon + recon_cache_path = /var/cache/swift + +The recon_cache_path simply sets the directory where stats for a few items will +be stored. Depending on the method of deployment, you may need to create this +directory manually and ensure that swift has read/write access. + +If you wish to enable reporting of replication times, you can enable recon +support in the object-replicator section of the object-server.conf:: + + [object-replicator] + ... 
+ recon_enable = yes + recon_cache_path = /var/cache/swift + +Finally, if you also wish to track asynchronous pendings you will need to set up +a cron job to run the swift-recon-cron script periodically:: + + */5 * * * * swift /usr/bin/swift-recon-cron /etc/swift/object-server.conf + +Once enabled, a GET request for "/recon/<metric>" to the object server will +return a JSON-formatted response:: + + fhines@ubuntu:~$ curl -i http://localhost:6030/recon/async + HTTP/1.1 200 OK + Content-Type: application/json + Content-Length: 20 + Date: Tue, 18 Oct 2011 21:03:01 GMT + + {"async_pending": 0} + +The following metrics and telemetry are currently exposed: + +================== ==================================================== +Request URI Description +------------------ ---------------------------------------------------- +/recon/load returns 1, 5, and 15 minute load averages +/recon/async returns count of async pending +/recon/mem returns /proc/meminfo +/recon/replication returns last logged object replication time +/recon/mounted returns *ALL* currently mounted filesystems +/recon/unmounted returns all unmounted drives if mount_check = True +/recon/diskusage returns disk utilization for storage devices +/recon/ringmd5 returns object/container/account ring md5sums +/recon/quarantined returns # of quarantined objects/accounts/containers +================== ==================================================== + +This information can also be queried via the swift-recon command line utility:: + + fhines@ubuntu:~$ swift-recon -h + =============================================================================== + Usage: + usage: swift-recon [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5] + + + Options: + -h, --help show this help message and exit + -v, --verbose Print verbose info + --suppress Suppress most connection related errors + -a, --async Get async stats + -r, --replication Get replication stats + -u, --unmounted Check cluster for unmounted devices + -d, --diskusage Get 
disk usage stats + -l, --loadstats Get cluster load average stats + -q, --quarantined Get cluster quarantine stats + --objmd5 Get md5sums of object.ring.gz and compare to local + copy + --all Perform all checks. Equivalent to -arudlq --objmd5 + -z ZONE, --zone=ZONE Only query servers in specified zone + --swiftdir=SWIFTDIR Default = /etc/swift + +For example, to obtain quarantine stats from all hosts in zone "3":: + + fhines@ubuntu:~$ swift-recon -q --zone 3 + =============================================================================== + [2011-10-18 19:36:00] Checking quarantine dirs on 1 hosts... + [Quarantined objects] low: 4, high: 4, avg: 4, total: 4 + [Quarantined accounts] low: 0, high: 0, avg: 0, total: 0 + [Quarantined containers] low: 0, high: 0, avg: 0, total: 0 + =============================================================================== + + ------------------------ Debugging Tips and Tools ------------------------ diff --git a/doc/source/misc.rst b/doc/source/misc.rst index 9505870ce4..29486b15f0 100644 --- a/doc/source/misc.rst +++ b/doc/source/misc.rst @@ -98,6 +98,15 @@ Healthcheck :members: :show-inheritance: +.. _recon: + +Recon +=========== + +.. automodule:: swift.common.middleware.recon + :members: + :show-inheritance: + .. 
_memecached: MemCacheD diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample index 8831048e43..77258ab27b 100644 --- a/etc/object-server.conf-sample +++ b/etc/object-server.conf-sample @@ -13,7 +13,7 @@ # log_level = INFO [pipeline:main] -pipeline = object-server +pipeline = recon object-server [app:object-server] use = egg:swift#object @@ -35,6 +35,10 @@ use = egg:swift#object # Content-Type, etag, Content-Length, or deleted # allowed_headers = Content-Encoding, Content-Disposition, X-Object-Manifest +[filter:recon] +use = egg:swift#recon +recon_cache_path = /var/cache/swift + [object-replicator] # You can override the default log routing for this app here (don't use set!): # log_name = object-replicator @@ -55,6 +59,9 @@ use = egg:swift#object # lockup_timeout = 1800 # The replicator also performs reclamation # reclaim_age = 604800 +# enable logging of replication stats for recon +# recon_enable = no +# recon_cache_path = /var/cache/swift [object-updater] # You can override the default log routing for this app here (don't use set!):