Influx HA as an external storage for Prometheus
Change-Id: I01bdfabb4189bcf35f0872350b78feba0f762eef
@ -12,3 +12,4 @@ Methodologies
|
|||||||
tools
|
tools
|
||||||
hyper-scale
|
hyper-scale
|
||||||
monitoring/index
|
monitoring/index
|
||||||
|
monitoring/influxha
|
||||||
|
BIN
doc/source/methodologies/monitoring/images/db/1_heap_usage.png
Normal file
After Width: | Height: | Size: 42 KiB |
BIN
doc/source/methodologies/monitoring/images/db/1_http_errors.png
Normal file
After Width: | Height: | Size: 21 KiB |
BIN
doc/source/methodologies/monitoring/images/db/1_point_intake.png
Normal file
After Width: | Height: | Size: 32 KiB |
BIN
doc/source/methodologies/monitoring/images/db/2_heap_usage.png
Normal file
After Width: | Height: | Size: 67 KiB |
BIN
doc/source/methodologies/monitoring/images/db/2_http_errors.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
doc/source/methodologies/monitoring/images/db/2_point_intake.png
Normal file
After Width: | Height: | Size: 32 KiB |
BIN
doc/source/methodologies/monitoring/images/influxdb-relay.png
Normal file
After Width: | Height: | Size: 326 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/cpu_idle.png
Normal file
After Width: | Height: | Size: 17 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/cpu_system.png
Normal file
After Width: | Height: | Size: 92 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/cpu_user.png
Normal file
After Width: | Height: | Size: 98 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/disk_rate.png
Normal file
After Width: | Height: | Size: 61 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/la.png
Normal file
After Width: | Height: | Size: 58 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/mem_free.png
Normal file
After Width: | Height: | Size: 18 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/lb/mem_used.png
Normal file
After Width: | Height: | Size: 23 KiB |
After Width: | Height: | Size: 34 KiB |
After Width: | Height: | Size: 31 KiB |
After Width: | Height: | Size: 74 KiB |
After Width: | Height: | Size: 68 KiB |
After Width: | Height: | Size: 62 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/node1/la.png
Normal file
After Width: | Height: | Size: 59 KiB |
After Width: | Height: | Size: 20 KiB |
After Width: | Height: | Size: 33 KiB |
After Width: | Height: | Size: 62 KiB |
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 144 KiB |
After Width: | Height: | Size: 78 KiB |
After Width: | Height: | Size: 67 KiB |
BIN
doc/source/methodologies/monitoring/images/sys/node2/la.png
Normal file
After Width: | Height: | Size: 65 KiB |
After Width: | Height: | Size: 21 KiB |
After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 90 KiB |
@ -0,0 +1,42 @@
|
|||||||
|
|
||||||
|
client_max_body_size 20M;
|
||||||
|
|
||||||
|
upstream influxdb {
|
||||||
|
server influx1_ip:8086;
|
||||||
|
server influx2_ip:8086;
|
||||||
|
}
|
||||||
|
upstream relay {
|
||||||
|
server influx1_ip:9096;
|
||||||
|
server influx2_ip:9096;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 7076;
|
||||||
|
location /query {
|
||||||
|
limit_except GET {
|
||||||
|
deny all;
|
||||||
|
}
|
||||||
|
proxy_pass http://influxdb;
|
||||||
|
}
|
||||||
|
location /write {
|
||||||
|
limit_except POST {
|
||||||
|
deny all;
|
||||||
|
}
|
||||||
|
proxy_pass http://relay;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# stream {
|
||||||
|
# upstream test {
|
||||||
|
# server server1:8003;
|
||||||
|
# server server2:8003;
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# server {
|
||||||
|
# listen 7003 udp;
|
||||||
|
# proxy_pass test;
|
||||||
|
# proxy_timeout 1s;
|
||||||
|
# proxy_responses 1;
|
||||||
|
# }
|
||||||
|
# }
|
129
doc/source/methodologies/monitoring/influx_ha/conf/influxdb.conf
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
reporting-disabled = false
|
||||||
|
bind-address = ":8088"
|
||||||
|
|
||||||
|
[meta]
|
||||||
|
dir = "/var/lib/influxdb/meta"
|
||||||
|
retention-autocreate = true
|
||||||
|
logging-enabled = true
|
||||||
|
|
||||||
|
[data]
|
||||||
|
dir = "/var/lib/influxdb/data"
|
||||||
|
wal-dir = "/var/lib/influxdb/wal"
|
||||||
|
query-log-enabled = true
|
||||||
|
cache-max-memory-size = 1073741824
|
||||||
|
cache-snapshot-memory-size = 26214400
|
||||||
|
cache-snapshot-write-cold-duration = "10m0s"
|
||||||
|
compact-full-write-cold-duration = "4h0m0s"
|
||||||
|
max-series-per-database = 0
|
||||||
|
max-values-per-tag = 100000
|
||||||
|
trace-logging-enabled = false
|
||||||
|
|
||||||
|
[coordinator]
|
||||||
|
write-timeout = "10s"
|
||||||
|
max-concurrent-queries = 0
|
||||||
|
query-timeout = "0s"
|
||||||
|
log-queries-after = "0s"
|
||||||
|
max-select-point = 0
|
||||||
|
max-select-series = 0
|
||||||
|
max-select-buckets = 0
|
||||||
|
|
||||||
|
[retention]
|
||||||
|
enabled = true
|
||||||
|
check-interval = "30m0s"
|
||||||
|
|
||||||
|
[shard-precreation]
|
||||||
|
enabled = true
|
||||||
|
check-interval = "10m0s"
|
||||||
|
advance-period = "30m0s"
|
||||||
|
|
||||||
|
[admin]
|
||||||
|
enabled = false
|
||||||
|
bind-address = ":8083"
|
||||||
|
https-enabled = false
|
||||||
|
https-certificate = "/etc/ssl/influxdb.pem"
|
||||||
|
|
||||||
|
[monitor]
|
||||||
|
store-enabled = true
|
||||||
|
store-database = "_internal"
|
||||||
|
store-interval = "10s"
|
||||||
|
|
||||||
|
[subscriber]
|
||||||
|
enabled = true
|
||||||
|
http-timeout = "30s"
|
||||||
|
insecure-skip-verify = false
|
||||||
|
ca-certs = ""
|
||||||
|
write-concurrency = 40
|
||||||
|
write-buffer-size = 1000
|
||||||
|
|
||||||
|
[http]
|
||||||
|
enabled = true
|
||||||
|
bind-address = ":8086"
|
||||||
|
auth-enabled = false
|
||||||
|
log-enabled = true
|
||||||
|
write-tracing = false
|
||||||
|
pprof-enabled = true
|
||||||
|
https-enabled = false
|
||||||
|
https-certificate = "/etc/ssl/influxdb.pem"
|
||||||
|
https-private-key = ""
|
||||||
|
max-row-limit = 10000
|
||||||
|
max-connection-limit = 0
|
||||||
|
shared-secret = ""
|
||||||
|
realm = "InfluxDB"
|
||||||
|
unix-socket-enabled = false
|
||||||
|
bind-socket = "/var/run/influxdb.sock"
|
||||||
|
|
||||||
|
[[graphite]]
|
||||||
|
enabled = false
|
||||||
|
bind-address = ":2003"
|
||||||
|
database = "graphite"
|
||||||
|
retention-policy = ""
|
||||||
|
protocol = "tcp"
|
||||||
|
batch-size = 5000
|
||||||
|
batch-pending = 10
|
||||||
|
batch-timeout = "1s"
|
||||||
|
consistency-level = "one"
|
||||||
|
separator = "."
|
||||||
|
udp-read-buffer = 0
|
||||||
|
|
||||||
|
[[collectd]]
|
||||||
|
enabled = false
|
||||||
|
bind-address = ":25826"
|
||||||
|
database = "collectd"
|
||||||
|
retention-policy = ""
|
||||||
|
batch-size = 5000
|
||||||
|
batch-pending = 10
|
||||||
|
batch-timeout = "10s"
|
||||||
|
read-buffer = 0
|
||||||
|
typesdb = "/usr/share/collectd/types.db"
|
||||||
|
security-level = "none"
|
||||||
|
auth-file = "/etc/collectd/auth_file"
|
||||||
|
|
||||||
|
[[opentsdb]]
|
||||||
|
enabled = false
|
||||||
|
bind-address = ":4242"
|
||||||
|
database = "opentsdb"
|
||||||
|
retention-policy = ""
|
||||||
|
consistency-level = "one"
|
||||||
|
tls-enabled = false
|
||||||
|
certificate = "/etc/ssl/influxdb.pem"
|
||||||
|
batch-size = 1000
|
||||||
|
batch-pending = 5
|
||||||
|
batch-timeout = "1s"
|
||||||
|
log-point-errors = true
|
||||||
|
|
||||||
|
[[udp]]
|
||||||
|
enabled = false
|
||||||
|
bind-address = ":8089"
|
||||||
|
database = "udp"
|
||||||
|
retention-policy = ""
|
||||||
|
batch-size = 5000
|
||||||
|
batch-pending = 10
|
||||||
|
read-buffer = 0
|
||||||
|
batch-timeout = "1s"
|
||||||
|
precision = ""
|
||||||
|
|
||||||
|
[continuous_queries]
|
||||||
|
log-enabled = true
|
||||||
|
enabled = true
|
||||||
|
run-interval = "1s"
|
||||||
|
|
@ -0,0 +1,38 @@
|
|||||||
|
# Name of the HTTP server, used for display purposes only
|
||||||
|
[[http]]
|
||||||
|
name = "influx-http"
|
||||||
|
|
||||||
|
# TCP address to bind to, for HTTP server
|
||||||
|
bind-addr = "influx1_ip:9096"
|
||||||
|
|
||||||
|
# Array of InfluxDB instances to use as backends for Relay
|
||||||
|
# name: name of the backend, used for display purposes only.
|
||||||
|
# location: full URL of the /write endpoint of the backend
|
||||||
|
# timeout: Go-parseable time duration. Fail writes if incomplete in this time.
|
||||||
|
# skip-tls-verification: skip verification for HTTPS location. WARNING: it's insecure. Don't use in production.
|
||||||
|
output = [
|
||||||
|
{ name="local-influx1", location = "http://127.0.0.1:8086/write", timeout="10s" },
|
||||||
|
{ name="remote-influx2", location = "http://influx2_ip:8086/write", timeout="10s" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[udp]]
|
||||||
|
# Name of the UDP server, used for display purposes only
|
||||||
|
name = "influx-udp"
|
||||||
|
|
||||||
|
# UDP address to bind to
|
||||||
|
bind-addr = "127.0.0.1:9096"
|
||||||
|
|
||||||
|
# Socket buffer size for incoming connections
|
||||||
|
read-buffer = 0 # default
|
||||||
|
|
||||||
|
# Precision to use for timestamps
|
||||||
|
precision = "n" # Can be n, u, ms, s, m, h
|
||||||
|
|
||||||
|
# Array of InfluxDB UDP instances to use as backends for Relay
|
||||||
|
# name: name of the backend, used for display purposes only.
|
||||||
|
# location: host and port of backend.
|
||||||
|
# mtu: maximum output payload size
|
||||||
|
output = [
|
||||||
|
{ name="local-influx1-udp", location="127.0.0.1:8089", mtu=512 },
|
||||||
|
{ name="remote-influx2-udp", location="influx2_ip:8089", mtu=512 },
|
||||||
|
]
|
@ -0,0 +1,38 @@
|
|||||||
|
# Name of the HTTP server, used for display purposes only
|
||||||
|
[[http]]
|
||||||
|
name = "influx-http"
|
||||||
|
|
||||||
|
# TCP address to bind to, for HTTP server
|
||||||
|
bind-addr = "influx2_ip:9096"
|
||||||
|
|
||||||
|
# Array of InfluxDB instances to use as backends for Relay
|
||||||
|
# name: name of the backend, used for display purposes only.
|
||||||
|
# location: full URL of the /write endpoint of the backend
|
||||||
|
# timeout: Go-parseable time duration. Fail writes if incomplete in this time.
|
||||||
|
# skip-tls-verification: skip verification for HTTPS location. WARNING: it's insecure. Don't use in production.
|
||||||
|
output = [
|
||||||
|
{ name="local-influx2", location = "http://127.0.0.1:8086/write", timeout="10s" },
|
||||||
|
{ name="remote-influx1", location = "http://influx1_ip:8086/write", timeout="10s" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[udp]]
|
||||||
|
# Name of the UDP server, used for display purposes only
|
||||||
|
name = "influx-udp"
|
||||||
|
|
||||||
|
# UDP address to bind to
|
||||||
|
bind-addr = "127.0.0.1:9096"
|
||||||
|
|
||||||
|
# Socket buffer size for incoming connections
|
||||||
|
read-buffer = 0 # default
|
||||||
|
|
||||||
|
# Precision to use for timestamps
|
||||||
|
precision = "n" # Can be n, u, ms, s, m, h
|
||||||
|
|
||||||
|
# Array of InfluxDB UDP instances to use as backends for Relay
|
||||||
|
# name: name of the backend, used for display purposes only.
|
||||||
|
# location: host and port of backend.
|
||||||
|
# mtu: maximum output payload size
|
||||||
|
output = [
|
||||||
|
{ name="local-influx2-udp", location="127.0.0.1:8089", mtu=512 },
|
||||||
|
{ name="remote-influx1-udp", location="influx1_ip:8089", mtu=512 },
|
||||||
|
]
|
@ -0,0 +1,56 @@
|
|||||||
|
#!/bin/bash -xe
|
||||||
|
|
||||||
|
INFLUX1=${INFLUX1:-172.20.9.29}
|
||||||
|
INFLUX2=${INFLUX2:-172.20.9.19}
|
||||||
|
BALANCER=${BALANCER:-172.20.9.27}
|
||||||
|
SSH_PASSWORD="r00tme"
|
||||||
|
SSH_USER="root"
|
||||||
|
SSH_OPTIONS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
|
||||||
|
|
||||||
|
type sshpass || (echo "sshpass is not installed" && exit 1)
|
||||||
|
|
||||||
|
ssh_exec() {
|
||||||
|
node=$1
|
||||||
|
shift
|
||||||
|
sshpass -p ${SSH_PASSWORD} ssh ${SSH_OPTIONS} ${SSH_USER}@${node} "$@"
|
||||||
|
}
|
||||||
|
|
||||||
|
scp_exec() {
|
||||||
|
node=$1
|
||||||
|
src=$2
|
||||||
|
dst=$3
|
||||||
|
sshpass -p ${SSH_PASSWORD} scp ${SSH_OPTIONS} ${2} ${SSH_USER}@${node}:${3}
|
||||||
|
}
|
||||||
|
|
||||||
|
# prepare influx1:
|
||||||
|
ssh_exec $INFLUX1 "echo 'deb https://repos.influxdata.com/ubuntu xenial stable' > /etc/apt/sources.list.d/influxdb.list"
|
||||||
|
ssh_exec $INFLUX1 "apt-get update && apt-get install -y influxdb"
|
||||||
|
scp_exec $INFLUX1 conf/influxdb.conf /etc/influxdb/influxdb.conf
|
||||||
|
ssh_exec $INFLUX1 "service influxdb restart"
|
||||||
|
ssh_exec $INFLUX1 "echo 'GOPATH=/root/gocode' >> /etc/environment"
|
||||||
|
ssh_exec $INFLUX1 "apt-get install -y golang-go && mkdir /root/gocode"
|
||||||
|
ssh_exec $INFLUX1 "source /etc/environment && go get -u github.com/influxdata/influxdb-relay"
|
||||||
|
scp_exec $INFLUX1 conf/relay_1.toml /root/relay.toml
|
||||||
|
ssh_exec $INFLUX1 "sed -i -e 's/influx1_ip/${INFLUX1}/g' -e 's/influx2_ip/${INFLUX2}/g' /root/relay.toml"
|
||||||
|
ssh_exec $INFLUX1 "influxdb-relay -config relay.toml &"
|
||||||
|
|
||||||
|
# prepare influx2:
|
||||||
|
ssh_exec $INFLUX2 "echo 'deb https://repos.influxdata.com/ubuntu xenial stable' > /etc/apt/sources.list.d/influxdb.list"
|
||||||
|
ssh_exec $INFLUX2 "apt-get update && apt-get install -y influxdb"
|
||||||
|
scp_exec $INFLUX2 conf/influxdb.conf /etc/influxdb/influxdb.conf
|
||||||
|
ssh_exec $INFLUX2 "service influxdb restart"
|
||||||
|
ssh_exec $INFLUX2 "echo 'GOPATH=/root/gocode' >> /etc/environment"
|
||||||
|
ssh_exec $INFLUX2 "apt-get install -y golang-go && mkdir /root/gocode"
|
||||||
|
ssh_exec $INFLUX2 "source /etc/environment && go get -u github.com/influxdata/influxdb-relay"
|
||||||
|
scp_exec $INFLUX2 conf/relay_2.toml /root/relay.toml
|
||||||
|
ssh_exec $INFLUX2 "sed -i -e 's/influx1_ip/${INFLUX1}/g' -e 's/influx2_ip/${INFLUX2}/g' /root/relay.toml"
|
||||||
|
ssh_exec $INFLUX2 "influxdb-relay -config relay.toml &"
|
||||||
|
|
||||||
|
# prepare balancer:
|
||||||
|
ssh_exec $BALANCER "apt-get install -y nginx"
|
||||||
|
scp_exec $BALANCER conf/influx-loadbalancer.conf /etc/nginx/sites-enabled/influx-loadbalancer.conf
|
||||||
|
ssh_exec $BALANCER "sed -i -e 's/influx1_ip/${INFLUX1}/g' -e 's/influx2_ip/${INFLUX2}/g' /etc/nginx/sites-enabled/influx-loadbalancer.conf"
|
||||||
|
ssh_exec $BALANCER "service nginx reload"
|
||||||
|
|
||||||
|
echo "INFLUX HA SERVICE IS AVAILABLE AT http://${BALANCER}:7076"
|
||||||
|
|
281
doc/source/methodologies/monitoring/influxha.rst
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
|
||||||
|
.. _HA_InfluxDB_as_an_external_strorage_for_Prometheus:
|
||||||
|
|
||||||
|
**************************************************
|
||||||
|
HA InfluxDB as an external storage for Prometheus
|
||||||
|
**************************************************
|
||||||
|
|
||||||
|
:Abstract:
|
||||||
|
|
||||||
|
This document describes a way to provide high-available InfluxDB storage
|
||||||
|
based on Influx-relay and Nginx.
|
||||||
|
|
||||||
|
|
||||||
|
Prometheus storage issue and solutions
|
||||||
|
======================================
|
||||||
|
|
||||||
|
`Prometheus`_ native storage was designed only for short period data and needs
|
||||||
|
to be shortened in order to stay responsible and operational. For us to store
|
||||||
|
persistent data for longer periods the 'external storage' mechanism was
|
||||||
|
used. In this mode Prometheus duplicating its own data to external storage,
|
||||||
|
only external writes are available. Several options were possible but we
|
||||||
|
chose InfluxDB high-available solution. InfluxDB is a reliable and robust
|
||||||
|
storage with many features. Also, it's perfect in supplying monitoring data to
|
||||||
|
`Grafana`_ dashboard.
|
||||||
|
|
||||||
|
.. table:: Monitoring software version
|
||||||
|
|
||||||
|
+-------------+--------------------+
|
||||||
|
|Software |Version |
|
||||||
|
+-------------+--------------------+
|
||||||
|
|Prometheus | 1.4.0 |
|
||||||
|
+-------------+--------------------+
|
||||||
|
|Grafana | 4.0.1 |
|
||||||
|
+-------------+--------------------+
|
||||||
|
|
||||||
|
InfluxDB installation overview
|
||||||
|
==============================
|
||||||
|
|
||||||
|
During our deployment we were following `Influx-Relay Offical Documentation`_.
|
||||||
|
The installation comprises three nodes:
|
||||||
|
- first and second are InfluxDB instances with running Influx-relay daemon
|
||||||
|
- third is a load-balancer node with running Nginx
|
||||||
|
|
||||||
|
Influx-Relay working scheme taken from InfluxDB web site describes 5-nodes
|
||||||
|
installation (four InfluxDB instances + Loadbalancer node), but three nodes
|
||||||
|
were sufficient for our working load.
|
||||||
|
|
||||||
|
.. image:: images/influxdb-relay.png
|
||||||
|
:alt: HA InfluxDB scheme
|
||||||
|
:scale: 80
|
||||||
|
|
||||||
|
|
||||||
|
Ubuntu Xenial were used on each node. See software version table below:
|
||||||
|
|
||||||
|
.. table::
|
||||||
|
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|Software |Version |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|Ubuntu |Ubuntu 16.04.1 LTS |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|Kernel |4.4.0-47-generic |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|`InfluxDB`_ |1.2.0-17 |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|`Influx-Relay`_ |adaa2ea7bf97af592884fcfa57df1a2a77adb571 |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|`Nginx`_ |nginx/1.10.0 (Ubuntu) |
|
||||||
|
+--------------------+-----------------------------------------+
|
||||||
|
|
||||||
|
In order to deploy InfluxDB HA deployment `InfluxdbHA deployment script`_
|
||||||
|
was used.
|
||||||
|
|
||||||
|
InfluxDB HA mechanism realization
|
||||||
|
=================================
|
||||||
|
|
||||||
|
Native HA mechanisms were moved away from InfluxDB (since version 1.x.x) and
|
||||||
|
now provided only as an enterprise option. Open-source third-party software
|
||||||
|
Influx-relay is considered as one of the available substitutions for previous
|
||||||
|
native replication mechanisms.
|
||||||
|
|
||||||
|
Influx-Relay
|
||||||
|
------------
|
||||||
|
|
||||||
|
Influx-relay is written in Golang and its operation boils down to
|
||||||
|
proxying incoming write queries to multiple destinations (InfluxDB
|
||||||
|
instances).
|
||||||
|
Influx-Relay runs on every InfluxDB node, thus any writes requests coming
|
||||||
|
to any InfluxDB instance are mirrored across all other nodes.
|
||||||
|
Influx-Relay is light and robust and it doesn't consume much of the system
|
||||||
|
resources.
|
||||||
|
See Influx-Relay configuration in the `Influx-Relay configuration`_ section.
|
||||||
|
|
||||||
|
Nginx
|
||||||
|
-----
|
||||||
|
|
||||||
|
Nginx daemon runs on a separate node and acts as load-balancer (upstream proxy mode).
|
||||||
|
It redirects '/query' queries directly to an each InfluxDB instance and '/write' queries
|
||||||
|
to an each Influx-relay daemon. Round-robin algorithm is scheduled for both query and write.
|
||||||
|
This way, incoming reads and writes are balanced equally across the whole InfluxDB cluster.
|
||||||
|
See Nginx configuration in the `Nginx configuration`_ section.
|
||||||
|
|
||||||
|
InfluxDB Monitoring
|
||||||
|
===================
|
||||||
|
|
||||||
|
InfluxDB HA installation was tested with Prometheus that polls 200-nodes environment
|
||||||
|
and generates huge data flows towards its external storage. In order to test InfluxDB
|
||||||
|
performance '_internal' database counters were used and visualized with the help of
|
||||||
|
Grafana. We figured out that 3-nodes InfluxDB HA installation can handle 200-nodes
|
||||||
|
Prometheus load and total performance doesn't degrade.
|
||||||
|
Grafana dashboards for InfluxDB monitoring can be found at `Grafana InfluxDB dashboard`_
|
||||||
|
section.
|
||||||
|
|
||||||
|
InfluxDB HA performance data
|
||||||
|
============================
|
||||||
|
|
||||||
|
InfluxDB database performance data
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
|
These graphs were built with Grafana based on the metrics that are natively stored
|
||||||
|
inside the InfluxDB '_internal' database. To create the visualization we used
|
||||||
|
`Grafana InfluxDB dashboard`_.
|
||||||
|
|
||||||
|
+---------------------------------------+----------------------------------------+
|
||||||
|
|InfluxDB node1 database performance |InfluxDB node2 database performance |
|
||||||
|
| | |
|
||||||
|
+=======================================+========================================+
|
||||||
|
|.. image:: images/db/1_heap_usage.png |.. image:: images/db/2_heap_usage.png |
|
||||||
|
| :alt: heap_usage(gb) | :alt: heap_usage(gb) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+----------------------------------------+
|
||||||
|
|.. image:: images/db/1_point_intake.png|.. image:: images/db/2_point_intake.png |
|
||||||
|
| :alt: point_intake(ops/sec) | :alt: point_intake(ops/sec) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+----------------------------------------+
|
||||||
|
|.. image:: images/db/1_http_errors.png |.. image:: images/db/2_http_errors.png |
|
||||||
|
| :alt: http_errors((ops/sec) | :alt: http_errors((ops/sec) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+----------------------------------------+
|
||||||
|
|
||||||
|
OS performance data
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Operation System performance metrics were gathered using Telegraf agent
|
||||||
|
that was started on each cluster node with appropriate plugins. See the `Telegraf system`_
|
||||||
|
configuration file from `Containerized Openstack Monitoring`_ documentation.
|
||||||
|
|
||||||
|
|
||||||
|
InfluxDB node1 OS performance
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------------------------+
|
||||||
|
|.. image:: images/sys/node1/la.png |.. image:: images/sys/node1/mem_free.png |
|
||||||
|
| :alt: load_average(%) | :alt: mem_free(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node1/cpu_user.png |.. image:: images/sys/node1/mem_used.png |
|
||||||
|
| :alt: cpu_user(%) | :alt: mem_used(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node1/cpu_system.png |.. image:: images/sys/node1/disk_rate.png |
|
||||||
|
| :alt: cpu_system(%) | :alt: disk_rate(MBps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node1/cpu_idle.png |.. image:: images/sys/node1/network_load.png|
|
||||||
|
| :alt: cpu_idle(%) | :alt: network_load(Mbps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|
||||||
|
InfluxDB node2 OS performance
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------------------------+
|
||||||
|
|.. image:: images/sys/node2/la.png |.. image:: images/sys/node2/mem_free.png |
|
||||||
|
| :alt: load_average(%) | :alt: mem_free(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node2/cpu_user.png |.. image:: images/sys/node2/mem_used.png |
|
||||||
|
| :alt: cpu_user(%) | :alt: mem_used(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node2/cpu_system.png |.. image:: images/sys/node2/disk_rate.png |
|
||||||
|
| :alt: cpu_system(%) | :alt: disk_rate(MBps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/node2/cpu_idle.png |.. image:: images/sys/node2/network_load.png|
|
||||||
|
| :alt: cpu_idle(%) | :alt: network_load(Mbps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+--------------------------------------------+--------------------------------------------+
|
||||||
|
|
||||||
|
Load-balancer node OS performance
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
+------------------------------------------------------------------------------------+
|
||||||
|
|.. image:: images/sys/lb/la.png |.. image:: images/sys/lb/mem_free.png |
|
||||||
|
| :alt: load_average(%) | :alt: mem_free(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/lb/cpu_user.png |.. image:: images/sys/lb/mem_used.png |
|
||||||
|
| :alt: cpu_user(%) | :alt: mem_used(GB) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/lb/cpu_system.png|.. image:: images/sys/lb/disk_rate.png |
|
||||||
|
| :alt: cpu_system(%) | :alt: disk_rate(MBps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+--------------------------------------------+
|
||||||
|
|.. image:: images/sys/lb/cpu_idle.png |.. image:: images/sys/lb/network_load.png |
|
||||||
|
| :alt: cpu_idle(%) | :alt: network_load(Mbps) |
|
||||||
|
| :scale: 32 | :scale: 32 |
|
||||||
|
+---------------------------------------+--------------------------------------------+
|
||||||
|
|
||||||
|
How to deploy
|
||||||
|
=============
|
||||||
|
|
||||||
|
- Prepare three Ubuntu Xenial nodes with working network and Internet access
|
||||||
|
- Temporarily allow ssh access for root user
|
||||||
|
- Untar influx_ha_deployment.tar
|
||||||
|
- Set appropriate SSH_PASSWORD variable in the influx_ha/deploy_influx_ha.sh
|
||||||
|
- Start deployment script preceding it with node ip variables, e.g.
|
||||||
|
|
||||||
|
.. code:: bash
|
||||||
|
|
||||||
|
INFLUX1=172.20.9.29 INFLUX2=172.20.9.19 BALANCER=172.20.9.27 bash -xe influx_ha/deploy_influx_ha.sh
|
||||||
|
|
||||||
|
Applications
|
||||||
|
============
|
||||||
|
|
||||||
|
InfluxdbHA deployment script
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
.. literalinclude:: influx_ha/deploy_influx_ha.sh
|
||||||
|
:language: bash
|
||||||
|
|
||||||
|
Configuration tarball (for deployment script)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
:download:`influx_ha_deployment.tar <influx_ha/influx_ha_deployment.tar>`
|
||||||
|
|
||||||
|
InfluxDB configuration
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
.. literalinclude:: influx_ha/conf/influxdb.conf
|
||||||
|
:language: bash
|
||||||
|
|
||||||
|
Influx-Relay configuration
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
first instance
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
.. literalinclude:: influx_ha/conf/relay_1.toml
|
||||||
|
:language: bash
|
||||||
|
|
||||||
|
second instance
|
||||||
|
^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
.. literalinclude:: influx_ha/conf/relay_2.toml
|
||||||
|
:language: bash
|
||||||
|
|
||||||
|
Nginx configuration
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. literalinclude:: influx_ha/conf/influx-loadbalancer.conf
|
||||||
|
:language: bash
|
||||||
|
|
||||||
|
Grafana InfluxDB dashboard
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
:download:`InfluxDB_Dashboard.json <influx_ha/InfluxDB_Dashboard.json>`
|
||||||
|
|
||||||
|
.. references:
|
||||||
|
|
||||||
|
.. _Prometheus: https://prometheus.io/
|
||||||
|
.. _Grafana: http://grafana.org/
|
||||||
|
.. _InfluxDB: https://www.influxdata.com/open-source/#influxdb
|
||||||
|
.. _Influx-Relay Offical Documentation: https://github.com/influxdata/influxdb-relay/blob/master/README.md
|
||||||
|
.. _Influx-Relay: https://github.com/influxdata/influxdb-relay
|
||||||
|
.. _Nginx: https://www.nginx.com/
|
||||||
|
.. _Telegraf system: https://docs.openstack.org/developer/performance-docs/methodologies/monitoring/index.html#telegraf-sys-conf
|
||||||
|
.. _Containerized Openstack Monitoring: https://docs.openstack.org/developer/performance-docs/methodologies/monitoring/index.html
|
||||||
|
|