Containerized OpenStack Monitoring Solution

Change-Id: I66ea0711dd0319c1153a13b159dc5be6f7a7016c
Oleg Basov 2016-12-29 04:12:01 +03:00
parent f812999925
commit 013d072f2b
45 changed files with 14200 additions and 1 deletion

View File

@@ -7,7 +7,8 @@ Methodologies
 =======================
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 4
 
    tools
    hyper-scale
+   monitoring/index

View File

@@ -0,0 +1,15 @@
builder:
  push: true
  no_cache: false
registry:
  address: "172.20.8.35:5000/env-1"
repositories:
  skip_empty: True
kubernetes:
  server: http://172.20.9.234:8080
---
!include
- versions.yaml
- topology.yaml
- configs.yaml
- repos.yaml
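The deploy script further down copies this file to /root/.ccp.yaml on the first environment node, where the ccp CLI reads it by default; an equivalent explicit call (a sketch, assuming the standard oslo.config --config-file option) would be:

    ccp --config-file /root/.ccp.yaml deploy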

View File

@@ -0,0 +1,38 @@
configs:
  private_interface: p1p1.602
  public_interface: p1p1.602
  ingress:
    enabled: true
  glance:
    bootstrap:
      enable: true
#  nova:
#    allocation_ratio:
#      cpu: 16.0
  neutron:
    physnets:
      - name: "physnet1"
        bridge_name: "br-ex"
        interface: "p1p1.649"
        flat: true
        vlan_range: false
    bootstrap:
      internal:
        enable: true
      external:
        enable: true
        net_name: ext-net
        subnet_name: ext-subnet
        physnet: physnet1
        network: 10.144.0.0/12
        gateway: 10.144.0.1
        nameserver: 10.144.0.1
        pool:
          start: 10.144.1.0
          end: 10.159.255.250
  keystone:
    debug: true
  heat:
    debug: true
  memcached:
    ram: 30720

View File

@@ -0,0 +1,78 @@
#!/bin/bash
set -ex
if [ -z "$1" ]; then
echo "Please set number of env as argument"
exit 1
fi
DEPLOY_TIMEOUT=1200
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))
NODE1="172.20.8.6${1}"
SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
SSH_CMD="sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${NODE1}"
SCP_CMD="sshpass -p ${SSH_PASS} scp ${SSH_OPTS}"
if [ ! -d ./env-${1} ]; then
echo "Yaml files for env-${1} is not found"
echo "Please, create and commit deployment/ccp/rackspace/env-${1}/configs with correct yaml files"
echo "Main file should be deployment/ccp/rackspace/env-${1}/configs/ccp.yaml"
exit 1
fi
$SCP_CMD ./env-${1}/configs/ccp.yaml ${SSH_USER}@${NODE1}:/root/.ccp.yaml
for i in $(ls -1 ./env-${1}/configs/ | grep -v ccp.yaml ); do
$SCP_CMD ./env-${1}/configs/${i} ${SSH_USER}@${NODE1}:/root/
done
$SSH_CMD "rm -rf /root/fuel-ccp; cd /root; git clone https://git.openstack.org/openstack/fuel-ccp"
$SSH_CMD "apt-get -y install python-pip"
$SSH_CMD "/usr/bin/pip install --upgrade pip"
$SSH_CMD "/usr/bin/pip install /root/fuel-ccp/"
CCP_STATUS=$($SSH_CMD "/usr/local/bin/ccp status")
if [ -n "$CCP_STATUS" ]; then
echo "Active deployment was found"
echo "$CCP_STATUS"
echo "Please execute 'ccp cleanup' and 'rm -rf /var/lib/mysql/*' on the ${NODE1} manually"
exit 1
fi
$SSH_CMD "echo '172.20.8.6${1} cloudformation.ccp.external console.ccp.external identity.ccp.external object-store.ccp.external compute.ccp.external orchestration.ccp.external network.ccp.external image.ccp.external volume.ccp.external horizon.ccp.external' >> /etc/hosts"
# $SSH_CMD kubectl delete configmaps traefik-conf -n kube-system
# $SSH_CMD kubectl delete service traefik -n kube-system
# $SSH_CMD kubectl delete secret traefik-cert -n kube-system
# $SSH_CMD kubectl delete deployment traefik -n kube-system
$SSH_CMD "/root/fuel-ccp/tools/ingress/deploy-ingress-controller.sh -i 172.20.8.6${1}" || echo "Already configured"
$SSH_CMD "echo 172.20.8.6${1} \$(ccp domains list -f value) >> /etc/hosts"
$SSH_CMD "openssl s_client -status -connect identity.ccp.external:8443 < /dev/null 2>&1 | awk 'BEGIN {pr=0;} /-----BEGIN CERTIFICATE-----/ {pr=1;} {if (pr) print;} /-----END CERTIFICATE-----/ {exit;}' >> /usr/local/lib/python2.7/dist-packages/requests/cacert.pem"
$SSH_CMD "openssl s_client -status -connect identity.ccp.external:8443 < /dev/null 2>&1 | awk 'BEGIN {pr=0;} /-----BEGIN CERTIFICATE-----/ {pr=1;} {if (pr) print;} /-----END CERTIFICATE-----/ {exit;}' > /usr/share/ca-certificates/ingress.crt"
$SSH_CMD "cp /usr/share/ca-certificates/ingress.crt /usr/local/share/ca-certificates/"
$SSH_CMD "update-ca-certificates"
if [ $($SSH_CMD "curl -s 'https://identity.ccp.external:8443/' > /dev/null; echo \$?") != 0 ]
then
echo "keystone is unreachable check https://identity.ccp.external:8443"
exit 1
fi
#$SSH_CMD "/root/fuel-ccp/tools/registry/deploy-registry.sh" &&
$SSH_CMD "/usr/local/bin/ccp fetch"
$SSH_CMD "/usr/local/bin/ccp build"
$SSH_CMD "/usr/local/bin/ccp deploy"
DEPLOY_TIME=0
while [ "$($SSH_CMD '/usr/local/bin/ccp status -s -f value' 2>/dev/null)" != "ok" ]
do
sleep 5
DEPLOY_TIME=$((${DEPLOY_TIME} + 5))
if [ $DEPLOY_TIME -ge $DEPLOY_TIMEOUT ]; then
echo "Deployment timeout"
exit 1
fi
done
$SSH_CMD "/usr/local/bin/ccp status"

View File

@@ -0,0 +1,7 @@
export OS_PROJECT_DOMAIN_NAME=default
export OS_USER_DOMAIN_NAME=default
export OS_PROJECT_NAME=admin
export OS_USERNAME=admin
export OS_PASSWORD=password
export OS_IDENTITY_API_VERSION=3
export OS_AUTH_URL=https://identity.ccp.external:8443/v3
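Sourcing this file and requesting a token is a quick smoke test, assuming python-openstackclient is installed and the ingress certificate was trusted as in the deploy script:

    source openrc
    openstack token issue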

View File

@@ -0,0 +1,44 @@
repositories:
  repos:
    - git_url: https://git.openstack.org/openstack/fuel-ccp-ceph
      name: fuel-ccp-ceph
    - git_url: https://git.openstack.org/openstack/fuel-ccp-cinder
      name: fuel-ccp-cinder
    - git_url: https://git.openstack.org/openstack/fuel-ccp-debian-base
      name: fuel-ccp-debian-base
    - git_url: https://git.openstack.org/openstack/fuel-ccp-entrypoint
      name: fuel-ccp-entrypoint
    - git_url: https://git.openstack.org/openstack/fuel-ccp-etcd
      name: fuel-ccp-etcd
    - git_url: https://git.openstack.org/openstack/fuel-ccp-glance
      name: fuel-ccp-glance
    - git_url: https://git.openstack.org/openstack/fuel-ccp-heat
      name: fuel-ccp-heat
    - git_url: https://git.openstack.org/openstack/fuel-ccp-horizon
      name: fuel-ccp-horizon
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-ironic
#      name: fuel-ccp-ironic
    - git_url: https://git.openstack.org/openstack/fuel-ccp-keystone
      name: fuel-ccp-keystone
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-mariadb
#      name: fuel-ccp-mariadb
    - git_url: https://git.openstack.org/openstack/fuel-ccp-galera
      name: fuel-ccp-galera
    - git_url: https://git.openstack.org/openstack/fuel-ccp-memcached
      name: fuel-ccp-memcached
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-murano
#      name: fuel-ccp-murano
    - git_url: https://git.openstack.org/openstack/fuel-ccp-neutron
      name: fuel-ccp-neutron
    - git_url: https://git.openstack.org/openstack/fuel-ccp-nova
      name: fuel-ccp-nova
    - git_url: https://git.openstack.org/openstack/fuel-ccp-openstack-base
      name: fuel-ccp-openstack-base
    - git_url: https://git.openstack.org/openstack/fuel-ccp-rabbitmq
      name: fuel-ccp-rabbitmq
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-sahara
#      name: fuel-ccp-sahara
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-searchlight
#      name: fuel-ccp-searchlight
#    - git_url: https://git.openstack.org/openstack/fuel-ccp-stacklight
#      name: fuel-ccp-stacklight

View File

@@ -0,0 +1,77 @@
nodes:
  # node[1-3]: Kubernetes
  node([4-6])$:        # 4-6
    roles:
      - controller
      - openvswitch
  node[7-9]$:          # 7-9
    roles:
      - rabbitmq
  node10$:             # 10
    roles:
      - galera
  node11$:             # 11
    roles:
      - heat
  node(1[2-9])$:       # 12-19
    roles:
      - compute
      - openvswitch
  node[2-9][0-9]$:     # 20-99
    roles:
      - compute
      - openvswitch
  node(1[0-9][0-9])$:  # 100-199
    roles:
      - compute
      - openvswitch
  node200$:
    roles:
      - backup
replicas:
  glance-api: 1
  glance-registry: 1
  keystone: 3
  nova-api: 3
  nova-scheduler: 3
  nova-conductor: 3
  neutron-server: 3
  neutron-metadata-agent: 3
  horizon: 3
  heat-api: 1
  heat-api-cfn: 1
  heat-engine: 1
roles:
  galera:
    - galera
  rabbitmq:
    - rabbitmq
  controller:
    - etcd
    - glance-api
    - glance-registry
    - horizon
    - keystone
    - memcached
    - neutron-dhcp-agent
    - neutron-l3-agent
    - neutron-metadata-agent
    - neutron-server
    - nova-api
    - nova-conductor
    - nova-consoleauth
    - nova-novncproxy
    - nova-scheduler
  compute:
    - nova-compute
    - nova-libvirt
  openvswitch:
    - neutron-openvswitch-agent
    - openvswitch-db
    - openvswitch-vswitchd
  backup:
    - backup
  heat:
    - heat-api
    - heat-api-cfn
    - heat-engine
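Worked examples of the hostname-to-role mapping above: node4 matches node([4-6])$ and gets the controller and openvswitch role sets, node10 runs only galera, node42 and node150 become compute/openvswitch nodes, and node200 is the backup node.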

View File

@@ -0,0 +1,71 @@
images:
  tag: newton
#  image_specs:
#    keystone:
#      tag: newton
#    horizon:
#      tag: newton
#    nova-upgrade:
#      tag: newton
#    nova-api:
#      tag: newton
#    nova-conductor:
#      tag: newton
#    nova-consoleauth:
#      tag: newton
#    nova-novncproxy:
#      tag: newton
#    nova-scheduler:
#      tag: newton
#    nova-compute:
#      tag: newton
#    nova-libvirt:
#      tag: newton
#    neutron-dhcp-agent:
#      tag: newton
#    neutron-l3-agent:
#      tag: newton
#    neutron-metadata-agent:
#      tag: newton
#    neutron-server:
#      tag: newton
#    neutron-openvswitch-agent:
#      tag: newton
#    glance-api:
#      tag: newton
#    glance-registry:
#      tag: newton
#    glance-upgrade:
#      tag: newton
sources:
  openstack/cinder:
    git_ref: stable/newton
    git_url: https://github.com/openstack/cinder.git
  openstack/glance:
    git_ref: stable/newton
    git_url: https://github.com/openstack/glance.git
  openstack/heat:
    git_ref: stable/newton
    git_url: https://github.com/openstack/heat.git
  openstack/horizon:
    git_ref: stable/newton
    git_url: https://github.com/openstack/horizon.git
  openstack/keystone:
    git_ref: stable/newton
    git_url: https://github.com/openstack/keystone.git
  openstack/neutron:
    git_ref: stable/newton
    git_url: https://github.com/openstack/neutron.git
  openstack/nova:
    git_ref: stable/newton
    git_url: https://github.com/openstack/nova.git
  openstack/requirements:
    git_ref: stable/newton
    git_url: https://git.openstack.org/openstack/requirements.git
  openstack/sahara-dashboard:
    git_ref: stable/newton
    git_url: https://git.openstack.org/openstack/sahara-dashboard.git

File diff suppressed because it is too large

View File

@@ -0,0 +1,103 @@
[
{
"_id": "Response-Time-Dashboard",
"_type": "dashboard",
"_source": {
"title": "Response Time Dashboard",
"hits": 0,
"description": "",
"panelsJSON": "[{\"id\":\"Env-1-Response-Time\",\"type\":\"visualization\",\"panelIndex\":1,\"size_x\":3,\"size_y\":2,\"col\":1,\"row\":1},{\"id\":\"Env-2-Response-Time\",\"type\":\"visualization\",\"panelIndex\":2,\"size_x\":3,\"size_y\":2,\"col\":4,\"row\":1},{\"id\":\"Env-3-Response-Time\",\"type\":\"visualization\",\"panelIndex\":3,\"size_x\":3,\"size_y\":2,\"col\":7,\"row\":1},{\"id\":\"Env-4-Response-Time\",\"type\":\"visualization\",\"panelIndex\":4,\"size_x\":3,\"size_y\":2,\"col\":1,\"row\":3},{\"id\":\"Env-5-Response-Time\",\"type\":\"visualization\",\"panelIndex\":5,\"size_x\":3,\"size_y\":2,\"col\":4,\"row\":3},{\"id\":\"Env-6-Response-Time\",\"type\":\"visualization\",\"panelIndex\":6,\"size_x\":3,\"size_y\":2,\"col\":7,\"row\":3}]",
"optionsJSON": "{\"darkTheme\":true}",
"uiStateJSON": "{}",
"version": 1,
"timeRestore": false,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"filter\":[{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}}}]}"
}
}
},
{
"_id": "Env-1-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-1 Response Time",
"visState": "{\"title\":\"New Visualization\",\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"showCircles\":true,\"smoothLines\":false,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-1\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
},
{
"_id": "Env-4-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-4 Response Time",
"visState": "{\"title\":\"Env-3 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-4\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
},
{
"_id": "Env-5-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-5 Response Time",
"visState": "{\"title\":\"Env-4 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-5\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
},
{
"_id": "Env-6-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-6 Response Time",
"visState": "{\"title\":\"Env-5 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-6\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
},
{
"_id": "Env-3-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-3 Response Time",
"visState": "{\"aggs\":[{\"id\":\"1\",\"params\":{\"customLabel\":\"Avg Response Time ms\",\"field\":\"ResponseTime\"},\"schema\":\"metric\",\"type\":\"avg\"},{\"id\":\"2\",\"params\":{\"customInterval\":\"2h\",\"extended_bounds\":{},\"field\":\"Timestamp\",\"interval\":\"auto\",\"min_doc_count\":1},\"schema\":\"segment\",\"type\":\"date_histogram\"}],\"listeners\":{},\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"title\":\"Env-2 Response Time\",\"type\":\"line\"}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-3\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
},
{
"_id": "Env-2-Response-Time",
"_type": "visualization",
"_source": {
"title": "Env-2 Response Time",
"visState": "{\"aggs\":[{\"id\":\"1\",\"params\":{\"customLabel\":\"Avg Response Time ms\",\"field\":\"ResponseTime\"},\"schema\":\"metric\",\"type\":\"avg\"},{\"id\":\"2\",\"params\":{\"customInterval\":\"2h\",\"extended_bounds\":{},\"field\":\"Timestamp\",\"interval\":\"auto\",\"min_doc_count\":1},\"schema\":\"segment\",\"type\":\"date_histogram\"}],\"listeners\":{},\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"title\":\"Env-1 Response Time\",\"type\":\"line\"}",
"uiStateJSON": "{}",
"description": "",
"version": 1,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-2\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
}
}
}
]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env bash
: ${DB_CONNECTION_STRING:?"You need to specify DB_CONNECTION_STRING parameter"}
: ${ENV_NAME:?"You need to specify ENV_NAME parameter"}
: ${MANAGEMENT_INTERFACE:="p1p1.602"}
: ${COBBLER_ADDRESS:="172.20.8.34"}
: ${CUSTOM_YAML}
: ${KARGO_REPO}
: ${KARGO_COMMIT}
: ${FUEL_CCP_COMMIT}
: ${ADMIN_USER}
: ${ADMIN_PASSWORD}
: ${ADMIN_NODE_CLEANUP}
DEPLOY_METHOD="kargo"
WORKSPACE="${HOME}/kargo_workspace_${ENV_NAME}"
SSH_OPTIONS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
get_env_nodes ()
{
ENV_NODES_NAMES=$(echo $(psql ${DB_CONNECTION_STRING} -c "select name from servers where environment_id in (select id from environments where name='${ENV_NAME}')" -P format=unaligned -t))
if [ -z "${ENV_NODES_NAMES}" ]
then
echo "No nodes in environment with name ${ENV_NAME}"
exit 1
fi
}
get_env_nodes_ips ()
{
ENV_NODES_IPS=$(echo $(ssh ${SSH_OPTIONS} root@${COBBLER_ADDRESS} bash -ex << EOF
for COBBLER_SYSTEM_NAME in ${ENV_NODES_NAMES}
do
NODE_IP=\$(cobbler system dumpvars --name=\${COBBLER_SYSTEM_NAME} | grep ^ip_address_${MANAGEMENT_INTERFACE} | awk '{print \$3}')
NODE_IPS+=\${NODE_IP}" "
done
echo \${NODE_IPS}
EOF
))
}
main ()
{
get_env_nodes
get_env_nodes_ips
export ADMIN_IP=$(echo ${ENV_NODES_IPS} | awk '{print $1}')
export SLAVE_IPS=$(echo ${ENV_NODES_IPS})
# for SLAVE_IP in ${SLAVE_IPS}
# do
# ssh ${SSH_OPTIONS} root@${SLAVE_IP} bash -ex << EOF
#echo "deb https://apt.dockerproject.org/repo ubuntu-\$(grep DISTRIB_CODENAME /etc/lsb-release | awk -F"=" '{print \$2}') main" >> /etc/apt/sources.list
#apt-get update && apt-get install -y --allow-unauthenticated -o Dpkg::Options::="--force-confdef" docker-engine
#EOF
# done
if [ -d "$WORKSPACE" ] ; then
rm -rf $WORKSPACE
fi
mkdir -p $WORKSPACE
cd $WORKSPACE
if [ -d './fuel-ccp-installer' ] ; then
rm -rf ./fuel-ccp-installer
fi
git clone https://review.openstack.org/openstack/fuel-ccp-installer
cd ./fuel-ccp-installer
if [ "$FUEL_CCP_COMMIT" ]; then
git fetch git://git.openstack.org/openstack/fuel-ccp-installer $FUEL_CCP_COMMIT && git checkout FETCH_HEAD
fi
echo "Running on $NODE_NAME: $ENV_NAME"
bash -xe "./utils/jenkins/run_k8s_deploy_test.sh"
}
main
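A hedged invocation sketch (the psql connection string, environment name, and script filename are all illustrative; the diff does not show the script's name):

    DB_CONNECTION_STRING="host=172.20.8.34 user=cobbler dbname=servers" \
    ENV_NAME="env-1" ./deploy-k8s-kargo.sh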

View File

@@ -0,0 +1,46 @@
---
- hosts: main-kuber
  remote_user: root
  tasks:
    - name: Fetch heka package
      get_url:
        url: "{{ heka_package_url }}"
        dest: /tmp/heka_amd64.deb
        mode: 0664
        force: yes
    - name: Download heka package locally
      fetch:
        src: /tmp/heka_amd64.deb
        dest: ./heka_amd64.deb
        fail_on_missing: yes
        flat: yes
- hosts: cluster-nodes
  remote_user: root
  tasks:
    - name: Propagate heka package across cluster nodes
      copy:
        src: ./heka_amd64.deb
        dest: /tmp/heka_amd64.deb
- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Install heka package
      apt: deb=/tmp/heka_amd64.deb
    - name: Add heka user to docker group
      user: name='heka' groups=docker append=yes
    - name: Copy heka conf
      template: src=heka/00-hekad.toml.j2 dest=/etc/heka/conf.d/00-hekad.toml
      notify: restart heka
    - name: Copy heka lua scripts
      template: src=heka/kubeapi_to_int.lua.j2 dest=/usr/share/heka/lua_filters/kubeapi_to_int.lua
      register: heka_lua
      notify: restart heka
    - name: Ensure heka is running
      systemd: state=started name=heka enabled=yes
  handlers:
    - name: restart heka
      systemd: state=restarted name=heka

View File

@@ -0,0 +1,71 @@
#!/bin/bash -xe
HOSTNAME=`hostname`
ELASTICSEARCH_NODE=${ELASTICSEARCH_NODE:-172.20.9.3}
# install java
sudo add-apt-repository -y ppa:webupd8team/java
sudo apt-get update
sudo apt-get -y install oracle-java8-installer
# install elastic by adding extra repository
wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
echo "deb http://packages.elastic.co/elasticsearch/2.x/debian stable main" | sudo tee -a /etc/apt/sources.list.d/elasticsearch-2.x.list
sudo apt-get update
sudo apt-get -y install elasticsearch
# edit configuration:
sed -i -E -e 's/^.*cluster.name: .*$/cluster.name: elasticsearch_k8s/g' /etc/elasticsearch/elasticsearch.yml
sed -i -E -e "s/^.*node.name: .*$/node.name: ${HOSTNAME}/g" /etc/elasticsearch/elasticsearch.yml
sed -i -E -e "s/^.*network.host: .*$/network.host: ${ELASTICSEARCH_NODE}/g" /etc/elasticsearch/elasticsearch.yml
# increase memory limits:
sed -i -E -e "s/^.*ES_HEAP_SIZE=.*$/ES_HEAP_SIZE=10g/g" /etc/default/elasticsearch
# start service:
sudo systemctl daemon-reload
sudo systemctl enable elasticsearch
sudo systemctl restart elasticsearch
# install kibana from extra repository:
echo "deb http://packages.elastic.co/kibana/4.5/debian stable main" | sudo tee -a /etc/apt/sources.list
sudo apt-get update
sudo apt-get -y install kibana
sed -i -E -e "s/^.*elasticsearch.url:.*$/ elasticsearch.url: \"http://${ELASTICSEARCH_NODE}:9200\"/g" /opt/kibana/config/kibana.yml
# enable kibana service:
sudo systemctl daemon-reload
sudo systemctl enable kibana
sudo systemctl start kibana
# install nginx:
sudo apt-get -y install nginx
# set kibana admin:password (admin:admin)
echo "admin:`openssl passwd admin`" | sudo tee -a /etc/nginx/htpasswd.users
# prepare nginx config:
cat << EOF >> /etc/nginx/sites-available/default
server {
listen 80;
server_name ${HOSTNAME};
auth_basic "Restricted Access";
auth_basic_user_file /etc/nginx/htpasswd.users;
location / {
proxy_pass http://localhost:5601;
proxy_http_version 1.1;
proxy_set_header Upgrade \$http_upgrade;
proxy_set_header Connection 'upgrade';
proxy_set_header Host \$host;
proxy_cache_bypass \$http_upgrade;
}
}
EOF
# check and start nginx service:
sudo nginx -t
sudo systemctl restart nginx
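Assuming this runs as root on the dedicated ELK node (the filename install-elk.sh is illustrative), Kibana is afterwards reachable through nginx on port 80 behind the admin:admin basic auth set above:

    ELASTICSEARCH_NODE=172.20.9.3 bash install-elk.sh
    curl -u admin:admin http://$(hostname)/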

View File

@@ -0,0 +1,60 @@
#!/bin/bash
set -e
export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))
ENV=${1}
if [ -z "${ENV}" ]; then
echo "Please provide env number $(basename $0) [1|2|3|4|5|6]"
exit 1
fi
# elastic for k8s at rackspace as default
ELASTICSEARCH_NODE=${ELASTICSEARCH_NODE:-172.20.9.3}
# heka 0.10.0 as default
HEKA_PACKAGE_URL=${HEKA_PACKAGE_URL:-https://github.com/mozilla-services/heka/releases/download/v0.10.0/heka_0.10.0_amd64.deb}
KUBE_MAIN_NODE="172.20.8.6${ENV}"
SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
echo "Get clusters nodes ..."
NODES_TMP=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} 'kubectl get nodes -o jsonpath='"'"'{.items[*].status.addresses[?(@.type=="InternalIP")].address}'"'"'')
ALL_IP_ON_KUBER_NODE=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} ip addr | grep 172.20 | awk '{print $2}' | awk -F'/' '{print $1}')
GREP_STRING_TMP=""
for i in $ALL_IP_ON_KUBER_NODE; do
GREP_STRING_TMP="${GREP_STRING_TMP}${i}|"
done
GREP_STRING=${GREP_STRING_TMP:0:-1}
SSH_AUTH="ansible_ssh_user=${SSH_USER} ansible_ssh_pass=${SSH_PASS}"
echo "[main-kuber]" > cluster-hosts
echo "${KUBE_MAIN_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[cluster-nodes]" >> cluster-hosts
set +e
# Remove IP of kuber node
for i in ${NODES_TMP} ; do
TMP_VAR=$(echo $i | grep -vE "(${GREP_STRING})")
NODES="${NODES} ${TMP_VAR}"
done
set -e
for i in ${NODES} ; do
if [ "$i" != "${KUBE_MAIN_NODE}" ]; then
echo "${i} ${SSH_AUTH}" >> cluster-hosts
fi
done
echo "[all-cluster-nodes:children]" >> cluster-hosts
echo "main-kuber" >> cluster-hosts
echo "cluster-nodes" >> cluster-hosts
# Calculate parallel ansible execution
NODES_IPS=( $NODES )
if [[ "${#NODES_IPS[@]}" -lt 50 ]] && [[ "${#NODES_IPS[@]}" -gt 5 ]]; then
ANSIBLE_FORKS="${#NODES_IPS[@]}"
elif [[ "${#NODES_IPS[@]}" -ge 50 ]]; then
ANSIBLE_FORKS=50
else
ANSIBLE_FORKS=10
fi
echo "Starting ansible ..."
ansible-playbook -v --ssh-extra-args "-o\ StrictHostKeyChecking=no" -f ${ANSIBLE_FORKS} -i ./cluster-hosts -e env_num=${ENV} -e elasticsearch_node="${ELASTICSEARCH_NODE}" -e heka_package_url=${HEKA_PACKAGE_URL} ./deploy-heka.yaml --diff
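For example, instrumenting environment 2 with a non-default Elasticsearch endpoint (deploy-heka.sh is the assumed filename for this script):

    ELASTICSEARCH_NODE=172.20.9.3 ./deploy-heka.sh 2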

View File

@@ -0,0 +1,69 @@
# vim: set syntax=yaml
[hekad]
maxprocs = 2
[DockerLogInput]
endpoint = "unix:///var/run/docker.sock"
#decoder = "KubeAPI_decoder"
decoder = "MultiDecoder"
[MultiDecoder]
type = "MultiDecoder"
subs = ["KubeAPI_decoder", "EnvironmentScribbler"]
cascade_strategy = "all"
#log_sub_errors = true
{% raw %}
[KubeAPI_decoder]
type = "PayloadRegexDecoder"
match_regex = '\S+ \S+ .+ (?P<Code>\S+)\] (?P<Method>[A-Z]+) (?P<Url>\S+)\: \((?P<ResponseTime>\S+)ms\) (?P<StatusCode>\d+) \[\[(?P<Agent>.+)\] (?P<RemoteIP>\S+)\:(?P<RemotePort>\d+)\]'
[KubeAPI_decoder.message_fields]
Type = "KubeAPIlog"
Logger = "Docker"
Code = "%Code%"
Method = "%Method%"
Url|uri = "%Url%"
ResponseTime = "%ResponseTime%"
StatusCode = "%StatusCode%"
Agent = "%Agent%"
RemoteIP|ipv4 = "%RemoteIP%"
RemotePort = "%RemotePort%"
{% endraw %}
[EnvironmentScribbler]
type = "ScribbleDecoder"
[EnvironmentScribbler.message_fields]
Environment = "env-{{ env_num }}"
[KubeAPI_to_int]
type = "SandboxFilter"
filename = "lua_filters/kubeapi_to_int.lua"
message_matcher = "Type == 'KubeAPIlog'"
[ESJsonEncoder]
index = "env-{{ env_num }}-{{ '%{Type}-%{%Y.%m.%d}' }}"
#es_index_from_timestamp = true
type_name = "%{Type}"
[ElasticSearchOutput]
message_matcher = "Type == 'heka.sandbox.KubeAPIlog' || Type == 'DockerLog'"
server = "http://{{ elasticsearch_node }}:9200"
flush_interval = 5000
flush_count = 10
encoder = "ESJsonEncoder"
[PayloadEncoder]
append_newlines = false
#
[LogOutput]
message_matcher = "Type == 'heka.sandbox.KubeAPIlog' || Type == 'DockerLog'"
#message_matcher = "TRUE"
encoder = "ESJsonEncoder"
#encoder = "PayloadEncoder"
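For reference, a sample kube-apiserver line of the shape the match_regex above is written against, with the fields it would extract (values illustrative; the exact apiserver log format varies between versions):

    # I1229 04:12:01.123456    1 handlers.go:160] GET /api/v1/nodes: (2.123ms) 200 [[kubectl/v1.4.0 (linux/amd64)] 172.20.8.61:43210]
    # -> Code=handlers.go:160, Method=GET, Url=/api/v1/nodes, ResponseTime=2.123,
    #    StatusCode=200, Agent=kubectl/v1.4.0 (linux/amd64), RemoteIP=172.20.8.61, RemotePort=43210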

View File

@@ -0,0 +1,30 @@
{% raw %}
-- Convert ResponseTime and a few more fields to integer type
local fields = {["ResponseTime"] = 0, ["RemotePort"] = 0, ["StatusCode"] = 0}
local msg = {
Type = "KubeAPIlog",
Severity = 6,
Fields = fields
}
function process_message ()
fields["ResponseTime"] = tonumber(read_message("Fields[ResponseTime]"))
fields["RemotePort"] = tonumber(read_message("Fields[RemotePort]"))
fields["StatusCode"] = tonumber(read_message("Fields[StatusCode]"))
msg.Payload = read_message("Payload")
fields["Code"] = read_message("Fields[Code]")
fields["ContainerID"] = read_message("Fields[ContainerID]")
fields["ContainerName"] = read_message("Fields[ContainerName]")
fields["Environment"] = read_message("Fields[Environment]")
fields["Method"] = read_message("Fields[Method]")
fields["RemoteIP"] = read_message("Fields[RemoteIP]")
fields["Url"] = read_message("Fields[Url]")
local ok, msg = pcall(inject_message, msg)
if not ok then
inject_payload("txt", "error", msg)
end
return 0
end
{% endraw %}

View File

@@ -0,0 +1,124 @@
---
- hosts: common
  remote_user: root
  tasks:
    - name: Install common packages
      apt: name={{ item }} state=installed
      with_items:
        - python-pip
      tags: [ 'always' ]
    - name: Install docker for Ubuntu 14.04
      apt: name=docker.io state=installed
      when: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '14.04'
      tags: [ 'always' ]
    - name: Install docker for Ubuntu 16.04
      apt: name=docker.io state=installed
      when: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '16.04'
      tags: [ 'always' ]
    - name: Install python deps
      pip: name={{ item }}
      with_items:
        - docker-py
        - docker-compose
      tags: [ 'always' ]
- hosts: grafana
  remote_user: root
  vars:
    postgresql_root_user: root
    postgresql_root_password: aijoom1Shiex
    grafana_postgresql_user: grafana
    grafana_postgresql_password: sHskdhos6se
    grafana_postgresql_db: grafana
    grafana_user: admin
    grafana_password: admin
  tasks:
    - name: Install packages for grafana
      apt: name={{ item }} state=installed
      with_items:
        - postgresql-client-9.3
        - python-psycopg2
    - name: Create postgres data dir
      file: path=/var/lib/postgres/data/db state=directory
      tags: [ 'grafana' ]
    - name: Run postgres in docker
      docker_container:
        name: postgres
        image: 'postgres:latest'
        ports: 5432:5432
        volumes: '/var/lib/postgres/data:/var/lib/postgres/data'
        env:
          POSTGRES_USER: "{{ postgresql_root_user }}"
          POSTGRES_PASSWORD: "{{ postgresql_root_password }}"
          PGDATA: /var/lib/postgres/data/db
      tags: [ 'grafana' ]
    - name: Create DB for grafana
      postgresql_db:
        name: "{{ grafana_postgresql_db }}"
        login_user: "{{ postgresql_root_user }}"
        login_password: "{{ postgresql_root_password }}"
        login_host: localhost
        encoding: 'UTF-8'
      tags: [ 'grafana' ]
    - name: Create user for grafana in postgres
      postgresql_user:
        name: "{{ grafana_postgresql_user }}"
        login_user: "{{ postgresql_root_user }}"
        login_password: "{{ postgresql_root_password }}"
        login_host: localhost
        password: "{{ grafana_postgresql_password }}"
        db: grafana
        priv: ALL
      tags: [ 'grafana' ]
    - name: Create data dir for Grafana
      file: path=/var/lib/grafana state=directory
      tags: [ 'grafana' ]
    - name: Start Grafana container
      docker_container:
        name: grafana
        image: 'grafana/grafana:4.0.1'
        volumes: '/var/lib/grafana:/var/lib/grafana'
        ports: 3000:3000
        env:
          GF_SECURITY_ADMIN_USER: "{{ grafana_user }}"
          GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_password }}"
          GF_DATABASE_TYPE: postgres
          GF_DATABASE_HOST: "{{ ansible_default_ipv4.address }}"
          GF_DATABASE_NAME: "{{ grafana_postgresql_db }}"
          GF_DATABASE_USER: "{{ grafana_postgresql_user }}"
          GF_DATABASE_PASSWORD: "{{ grafana_postgresql_password }}"
          GF_INSTALL_PLUGINS: grafana-piechart-panel
      tags: [ 'grafana' ]
- hosts: prometheuses
  remote_user: root
  tasks:
    - name: Data dir for prometheus
      file: path=/var/lib/prometheus state=directory
      tags: [ 'prometheus' ]
    - include: docker_prometheus.yaml
- hosts: prometheus-kuber
  remote_user: root
  tasks:
    - name: Copy prometheus config
      template: src=prometheus/prometheus-kuber.yml.j2 dest=/var/lib/prometheus/prometheus.yml
      register: prometheus_yml
      tags: [ 'prometheus', 'prometheus-conf' ]
    - include: docker_prometheus.yaml
    - name: Send kill -1 to prometheus if prometheus.yml changed
      command: pkill -1 prometheus
      when: prometheus_yml.changed
      tags: [ 'prometheus', 'prometheus-conf' ]
- hosts: prometheus-system
  remote_user: root
  tasks:
    - name: Copy prometheus config
      template: src=prometheus/prometheus-system.yml.j2 dest=/var/lib/prometheus/prometheus.yml
      register: prometheus_yml
      tags: [ 'prometheus', 'prometheus-conf' ]
    - include: docker_prometheus.yaml
    - name: Send kill -1 to prometheus if prometheus.yml changed
      command: pkill -1 prometheus
      when: prometheus_yml.changed
      tags: [ 'prometheus', 'prometheus-conf' ]

View File

@@ -0,0 +1,118 @@
---
- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Create user telegraf
      user: name=telegraf home=/opt/telegraf
    - name: Create /opt/telegraf
      file: path=/opt/telegraf state=directory owner=telegraf
    - name: Create bin dir for telegraf
      file: path=/opt/telegraf/bin state=directory owner=telegraf
    - name: Create etc dir for telegraf
      file: path=/opt/telegraf/etc state=directory owner=telegraf
    - name: Copy telegraf to server
      copy: src=../../telegraf/opt/bin/telegraf dest=/opt/telegraf/bin/telegraf mode=0755
      register: telegraf_bin
    - name: Copy telegraf.service
      copy: src=telegraf/telegraf.service dest=/etc/systemd/system/telegraf.service
      register: telegraf_service
    - name: Start and enable telegraf
      systemd: state=started enabled=yes daemon_reload=yes name=telegraf
    - name: Delete allmetrics.tmp.lock
      file: path=/opt/telegraf/bin/data/allmetrics.tmp.lock state=absent
      when: telegraf_service.changed or telegraf_bin.changed
    - name: Restart telegraf if telegraf binary has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_bin.changed
    - name: Install software
      apt: name={{ item }} state=installed
      with_items:
        - sysstat
        - numactl
    - name: Copy system metric scripts
      copy: src=../../telegraf/opt/system_stats/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - entropy.sh
        - iostat_per_device.sh
        - memory_bandwidth.sh
        - numa_stat_per_pid.sh
        - per_process_cpu_usage.sh
        - list_openstack_processes.sh
        - network_tcp_queue.sh
    - name: Copy pcm-memory-one-line.x
      copy: src=../../telegraf/opt/system_stats/intel_pcm_mem/pcm-memory-one-line.x dest=/opt/telegraf/bin/pcm-memory-one-line.x mode=0755
    - name: Add sysctl for pcm
      sysctl: name=kernel.nmi_watchdog value=0 state=present reload=yes
    - name: Load kernel module msr
      modprobe: name=msr state=present
    - name: Add module autoload
      lineinfile: dest=/etc/modules line='msr'
    - name: Add user telegraf to sudoers
      lineinfile:
        dest: /etc/sudoers
        state: present
        line: "telegraf ALL=(ALL) NOPASSWD: ALL"
- hosts: cluster-nodes
  remote_user: root
  tasks:
    - name: Copy telegraf config
      copy: src=./telegraf/telegraf-sys.conf dest=/opt/telegraf/etc/telegraf.conf
      register: telegraf_conf
    - name: Restart telegraf if config has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_conf.changed
- hosts: main-kuber
  remote_user: root
  tasks:
    - name: Copy openstack scripts
      copy: src=../../telegraf/opt/osapi/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - glog.sh
        - osapitime.sh
        - vmtime.sh
      tags: [ 'openstack' ]
    - name: Copy etcd scripts
      copy: src=../../telegraf/opt/k8s_etcd/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - etcd_get_metrics.sh
        - k8s_get_metrics.sh
    - name: Install software for scripts
      apt: name={{ item }} state=installed
      with_items:
        - mysql-client
        - bc
        - jq
      tags: [ 'openstack' ]
    - name: Create dirs for scripts
      file: path=/opt/telegraf/bin/{{ item }} state=directory owner=telegraf
      with_items:
        - log
        - data
    - name: Copy telegraf config
      template: src=telegraf/telegraf-openstack.conf.j2 dest=/opt/telegraf/etc/telegraf.conf
      register: telegraf_conf
      tags: [ 'openstack' ]
    - name: Delete allmetrics.tmp.lock
      file: path=/opt/telegraf/bin/data/allmetrics.tmp.lock state=absent
      when: telegraf_conf.changed
    - name: Restart telegraf if config has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_conf.changed
      tags: [ 'openstack' ]
- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Reload telegraf if service file has been changed
      systemd: daemon_reload=yes state=reloaded name=telegraf
      when: telegraf_service.changed
- hosts: main
  remote_user: root
  tasks:
    - name: Update prometheus config
      template: src=./prometheus/targets.yml.j2 dest=/var/lib/prometheus/targets-{{ cluster_tag }}.yml
      tags: [ 'prometheus' ]

View File

@@ -0,0 +1,46 @@
#!/bin/bash
CLUSTER=${1}
TMP_YAML=$(mktemp -u)
export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))
ENV=${1}
if [ -z "${ENV}" ]; then
echo "Please provide env number $(basename $0) [1|2|3|4|5|6]"
exit 1
fi
PROMETHEUS_HOST="172.20.9.115"
KUBE_MAIN_NODE="172.20.8.6${ENV}"
CLUSTER_TAG="env-${ENV}"
ETCD=""
SSH_OPTS="-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
TARGETS=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} curl -ks https://127.0.0.1:2379/v2/members | python -m json.tool | grep 2379)
if [ -z "$TARGETS" ]; then
echo "No etcd found"
exit 1
fi
for i in ${TARGETS}; do
TEMP_TARGET=${i#\"https://}
ETCD="$ETCD ${TEMP_TARGET%\"}"
done
echo "- targets:" > ${TMP_YAML}
for i in ${ETCD}; do
echo " - $i" >> ${TMP_YAML}
done
echo " labels:" >> ${TMP_YAML}
echo " env: ${CLUSTER_TAG}" >> ${TMP_YAML}
echo "Targets file is ready"
cat ${TMP_YAML}
sshpass -p ${SSH_PASS} scp ${SSH_OPTS} ${TMP_YAML} root@${PROMETHEUS_HOST}:/var/lib/prometheus/etcd-env-${1}.yml
rm ${TMP_YAML}
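The pushed file becomes Prometheus file_sd input for the etcd-env-N job; member addresses below are illustrative:

    - targets:
      - 172.20.9.15:2379
      - 172.20.9.16:2379
      - 172.20.9.17:2379
      labels:
        env: env-1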

View File

@@ -0,0 +1,2 @@
#!/bin/bash
ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "grafana"

View File

@@ -0,0 +1,2 @@
#!/bin/bash
ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "prometheus"

View File

@@ -0,0 +1,65 @@
#!/bin/bash
set -e
export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))
ENV=${1}
if [ -z "${ENV}" ]; then
echo "Please provide env number $(basename $0) [1|2|3|4|5|6]"
exit 1
fi
PROMETHEUS_NODE="172.20.124.25"
KUBE_MAIN_NODE="172.20.8.6${ENV}"
CLUSTER_TAG="env-${ENV}"
# Secret option
ANSIBLE_TAG=$2
SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
echo "Get clusters nodes"
NODES_TMP=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} 'kubectl get nodes -o jsonpath='"'"'{.items[*].status.addresses[?(@.type=="InternalIP")].address}'"'"'')
ALL_IP_ON_KUBER_NODE=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} ip addr | grep 172.20 | awk '{print $2}' | awk -F'/' '{print $1}')
GREP_STRING_TMP=""
for i in $ALL_IP_ON_KUBER_NODE; do
GREP_STRING_TMP="${GREP_STRING_TMP}${i}|"
done
GREP_STRING=${GREP_STRING_TMP:0:-1}
SSH_AUTH="ansible_ssh_user=${SSH_USER} ansible_ssh_pass=${SSH_PASS}"
echo "[main]" > cluster-hosts
echo "${PROMETHEUS_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[main-kuber]" >> cluster-hosts
echo "${KUBE_MAIN_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[cluster-nodes]" >> cluster-hosts
set +e
# Remove IP of kuber node
for i in ${NODES_TMP} ; do
TMP_VAR=$(echo $i | grep -vE "(${GREP_STRING})")
NODES="${NODES} ${TMP_VAR}"
done
set -e
for i in ${NODES} ; do
if [ "$i" != "${KUBE_MAIN_NODE}" ]; then
echo "${i} ${SSH_AUTH}" >> cluster-hosts
fi
done
echo "[all-cluster-nodes:children]" >> cluster-hosts
echo "main-kuber" >> cluster-hosts
echo "cluster-nodes" >> cluster-hosts
LINES=$(wc -l cluster-hosts | awk '{print $1}')
NUM_NODES=$(($LINES - 7))
if [ ${NUM_NODES} -le 0 ]; then
echo "Something wrong, $NUM_NODES nodes found"
exit 1
else
echo "${NUM_NODES} nodes found"
fi
if [ -z "${ANSIBLE_TAG}" ]; then
ansible-playbook -f 40 -i ./cluster-hosts -e cluster_tag=${CLUSTER_TAG} ./deploy-telegraf.yaml
else
ansible-playbook -f 40 -i ./cluster-hosts -e cluster_tag=${CLUSTER_TAG} -t ${ANSIBLE_TAG} ./deploy-telegraf.yaml
fi
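Example runs (the optional second argument selects ansible tags defined in deploy-telegraf.yaml):

    ./deploy-telegraf.sh 1            # full rollout against env-1
    ./deploy-telegraf.sh 1 openstack  # only the openstack-tagged tasks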

View File

@@ -0,0 +1,10 @@
---
- name: Deploy prometheus in docker
  docker_container:
    name: prometheus
    image: 'prom/prometheus:v1.4.0'
    ports: 9090:9090
    state: started
    volumes: ['/var/lib/prometheus:/prometheus']
    command: '-config.file=/prometheus/prometheus.yml -storage.local.retention 168h0m0s -storage.local.max-chunks-to-persist 3024288 -storage.local.memory-chunks=50502740 -storage.local.num-fingerprint-mutexes=300960'
  tags: [ 'prometheus' ]

View File

@@ -0,0 +1,58 @@
global:
  scrape_interval: 15s      # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds.
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'
rule_files:
  # - "first.rules"
  # - "second.rules"
scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['172.20.9.115:9090']
{% for env_num in range(1,7) %}
  - job_name: 'k8-env-{{env_num}}'
    scrape_interval: 30s
    scrape_timeout: 30s
    scheme: https
    tls_config:
      insecure_skip_verify: true
    kubernetes_sd_configs:
      - api_server: 'https://172.20.8.6{{env_num}}:443'
        role: node
        tls_config:
          insecure_skip_verify: true
        basic_auth:
          username: kube
          password: changeme
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - source_labels: [__address__]
        target_label: env
        regex: .*
        replacement: env-{{env_num}}
  - job_name: 'etcd-env-{{env_num}}'
    scrape_interval: 5s
    scrape_timeout: 5s
    scheme: https
    tls_config:
      insecure_skip_verify: true
    file_sd_configs:
      - files:
          - etcd-env-{{env_num}}.yml
{% endfor %}
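Rendered, each loop pass emits one Kubernetes job and one etcd job; for env_num=1 the etcd job, for instance, expands to:

    - job_name: 'etcd-env-1'
      scrape_interval: 5s
      scrape_timeout: 5s
      scheme: https
      tls_config:
        insecure_skip_verify: true
      file_sd_configs:
        - files:
            - etcd-env-1.yml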

View File

@@ -0,0 +1,33 @@
global:
  scrape_interval: 15s      # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds.
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'
rule_files:
  # - "first.rules"
  # - "second.rules"
scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['172.20.124.25:9090']
{% for env_num in range(1,7) %}
  - job_name: 'telegraf-systems-env-{{env_num}}'
    scrape_interval: 30s
    scrape_timeout: 30s
    file_sd_configs:
      - files:
          - targets-env-{{env_num}}.yml
{% endfor %}

View File

@@ -0,0 +1,6 @@
- targets:
{% for host in groups['all-cluster-nodes'] %}
  - {{ hostvars[host]['inventory_hostname'] }}:9126
{% endfor %}
  labels:
    env: {{ cluster_tag }}
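Rendered against a two-node inventory with cluster_tag=env-1 (addresses hypothetical), this produces:

    - targets:
      - 172.20.9.41:9126
      - 172.20.9.42:9126
      labels:
        env: env-1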

View File

@@ -0,0 +1,5 @@
#!/bin/bash
export LANG=C
set -o nounset # Treat unset variables as an error
echo "system entropy=$(cat /proc/sys/kernel/random/entropy_avail)"

View File

@@ -0,0 +1,33 @@
#!/bin/bash -e
ETCD=/usr/local/bin/etcdctl
type jq >/dev/null 2>&1 || { echo "jq is not installed"; exit 1; }
type curl >/dev/null 2>&1 || { echo "curl is not installed"; exit 1; }
# get etcd members credentials
MEMBERS="${ETCD} --endpoints https://127.0.0.1:2379 member list"
LEADER_ID=$(eval "$MEMBERS" | awk -F ':' '/isLeader=true/ {print $1}')
LEADER_ENDPOINT=$(eval "$MEMBERS" | awk '/isLeader=true/ {print $4}' | cut -d"=" -f2)
SLAVE_ID=$(eval "$MEMBERS" | grep 'isLeader=false' | head -n 1 | awk -F ":" '{print $1}')
SLAVE_ENDPOINT=$(eval "$MEMBERS" | grep 'isLeader=false' | head -n 1 | awk '{print $4}' | cut -d"=" -f2)
# member count:
metric_members_count=`curl -s -k https://172.20.9.15:2379/v2/members | jq -c '.members | length'`
metric_total_keys_count=`${ETCD} --endpoints https://127.0.0.1:2379 ls -r --sort | wc -l`
metric_total_size_dataset=`pidof etcd | xargs ps -o rss | awk '{rss=+$1} END {print rss}'`
metric_store_stats=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/store| tr -d \"\{\} | sed -e 's/:/=/g'`
metric_latency_from_leader_avg=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/leader | \
jq -c ".followers.\"${SLAVE_ID}\".latency.average"`
metric_leader_stats=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/self | \
jq -c "{ sendBandwidthRate: .sendBandwidthRate, sendAppendRequestCnt: \
.sendAppendRequestCnt, sendPkgRate: .sendPkgRate }"| tr -d \"\{\} | sed -e 's/:/=/g'`
metric_slave_stats=`curl -s -k ${SLAVE_ENDPOINT}/v2/stats/self | \
jq -c "{ recvBandwidthRate: .recvBandwidthRate, recvAppendRequestCnt: \
.recvAppendRequestCnt, recvPkgRate: .recvPkgRate }"| tr -d \"\{\} | sed -e 's/:/=/g'`
cat << EOF
etcd_general_stats,group=etcd_cluster_metrics members_count=${metric_members_count},dataset_size=${metric_total_size_dataset},total_keys_count=${metric_total_keys_count}
etcd_leader_stats,group=etcd_cluster_metrics $metric_leader_stats
etcd_follower_stats,group=etcd_cluster_metrics ${metric_slave_stats},latency_from_leader_avg=${metric_latency_from_leader_avg}
etcd_store_stats,group=etcd_cluster_metrics $metric_store_stats
EOF
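The heredoc above emits InfluxDB line protocol, so the script plugs into telegraf as an exec input; a minimal sketch of the matching telegraf.conf fragment (the interval is an assumption):

    [[inputs.exec]]
      commands = ["/opt/telegraf/bin/etcd_get_metrics.sh"]
      data_format = "influx"
      interval = "30s"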

View File

@@ -0,0 +1,105 @@
#!/bin/bash
# Logs extractor / parser
# checking that we are good
if [[ -z "${TMP_DIR}" || -z "${POD}" || -z "${CONTAINER}" || -z "${K8S_NS}" || -z "${OS_LOG_FIELDS}" || -z ${CONTID} ]]; then
echo "Required variables are not set, exiting!"
exit 1
fi
# Variables declaration
SSH_USER="${SSH_USER:-root}"
SSH_PASS="${SSH_PASS:-r00tme}"
LOG_ENTRIES_NUMBER=${LOG_ENTRIES_NUMBER:-1000}
LAST_TIME_STAMP_FILE="${TMP_DIR}/timestamp.tmp"
# get | set last timestamp for log entries
function last_ts_data()
{
local action
action=${1}
shift
if [ "${action}" == "get" ]; then
if [ -e ${LAST_TIME_STAMP_FILE} ]; then
cat ${LAST_TIME_STAMP_FILE}
fi
else
echo "$*" > ${LAST_TIME_STAMP_FILE}
fi
}
function print_out()
{
if [ -z "${TMP_METRICS}" ];then
echo "$@"
else
echo "$@" >> ${TMP_METRICS}
fi
}
function micro_to_seconds()
{
local input
local output
input="${1}"
output=$(echo "scale=4;${input}/1000000" | bc)
if echo ${output} | grep -q '^\..'; then
output="0${output}"
fi
echo "${output}"
}
# extract container logs from k8s
function get_logs()
{
local sdate
local stime
local scalltime
local lasttimestamp
local is_foundlast
local tmpdata
tmpdata="${TMP_DIR}/tmpdata.log"
if [ -e "${tmpdata}" ]; then rm -f ${tmpdata}; fi
if [ "${CONTAINER}" == "keystone" ];then
sshpass -p ${SSH_PASS} ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${SSH_USER}@${HOST} "tail -n${LOG_ENTRIES_NUMBER} /var/log/ccp/keystone/keystone-access.log | cut -d' ' -f${OS_LOG_FIELDS} | sed -e 's#\[##g' -e 's#\]##g'" 2>/dev/null > ${tmpdata}
else
sshpass -p ${SSH_PASS} ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${SSH_USER}@${HOST} "docker logs --tail ${LOG_ENTRIES_NUMBER} ${CONTID} 2>&1 | grep 'INFO' | grep 'GET /' | cut -d' ' -f${OS_LOG_FIELDS}" 2>/dev/null > ${tmpdata}
fi
is_foundlast=false
lasttimestamp=$(last_ts_data "get")
if [ -z "${lasttimestamp}" ]; then
while read log
do
sdate=$(echo ${log} | cut -d' ' -f1)
stime=$(echo ${log} | cut -d' ' -f2)
scalltime=$(echo ${log} | cut -d' ' -f3)
if [ "${CONTAINER}" == "keystone" ];then scalltime=$(micro_to_seconds ${scalltime});fi
if [ ! -z "${scalltime}" ]; then
print_out "os_api_response_time,container=${CONTAINER},pod=${POD},instance=${HOST},requestdate=${sdate},requesttime=${stime} processingtime=${scalltime}"
fi
done < <(cat ${tmpdata})
sdate=$(tail -n 1 ${tmpdata} | cut -d' ' -f1)
stime=$(tail -n 1 ${tmpdata} | cut -d' ' -f2)
last_ts_data "set" "${sdate}${stime}"
else
while read log
do
sdate=$(echo ${log} | cut -d' ' -f1)
stime=$(echo ${log} | cut -d' ' -f2)
scalltime=$(echo ${log} | cut -d' ' -f3)
if [ "${CONTAINER}" == "keystone" ];then scalltime=$(micro_to_seconds ${scalltime});fi
if [[ "${is_foundlast}" = "false" && "${lasttimestamp}" = "${sdate}${stime}" ]]; then
#echo "FOUND: ${sdate}${stime} ${scalltime}"
is_foundlast=true
continue
fi
if [ "${is_foundlast}" == "true" ]; then
if [ ! -z "${scalltime}" ]; then
print_out "os_api_response_time,container=${CONTAINER},pod=${POD},instance=${HOST},requestdate=${sdate},requesttime=${stime} processingtime=${scalltime}"
fi
fi
done < <(cat ${tmpdata})
if [ "${is_foundlast}" == "true" ]; then
sdate=$(tail -n 1 ${tmpdata} | cut -d' ' -f1)
stime=$(tail -n 1 ${tmpdata} | cut -d' ' -f2)
last_ts_data "set" "${sdate}${stime}"
fi
fi
rm -f ${tmpdata}
}
# Main logic
get_logs

View File

@@ -0,0 +1,6 @@
#!/bin/bash
# output from iostat -Ndx is
# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
export LANG=C
iostat -Ndx | tail -n +4 | head -n -1 | awk '{print "system_per_device_iostat,device="$1" read_merge="$2",write_merge="$3",await="$10",read_await="$11",write_await="$12",util="$14",average_queue="$9}'

View File

@@ -0,0 +1,75 @@
#!/bin/bash -e
K8S_MASTER=127.0.0.1
if [[ $1 ]] ; then
K8S_MASTER=$1
fi
type jq >/dev/null 2>&1 || { echo "jq is not installed"; exit 1; }
type curl >/dev/null 2>&1 || { echo "curl is not installed"; exit 1; }
curl_get() {
    url="https://${K8S_MASTER}$1"
    curl -k -s -u kube:changeme "$url" || { echo "Curl failed at: $url" 1>&2; exit 1; }
}
# gathering frequent API calls output to separate file(in order to avoid long timeouts):
node_file=`mktemp /tmp/XXXXX`
pods_file=`mktemp /tmp/XXXXX`
endpoints_file=`mktemp /tmp/XXXXX`
curl_get "/api/v1/nodes" > $node_file
curl_get "/api/v1/pods" > $pods_file
curl_get "/api/v1/endpoints" > $endpoints_file
# metrics withdrawal:
number_of_namespaces_total=`curl_get "/api/v1/namespaces" | jq '[ .items[] .metadata.name ] | length'`
number_of_services_total=`curl_get "/api/v1/services" | jq -c '[ .items[] .metadata.name ] | length'`
number_of_nodes_total=`jq -c '[ .items[] .metadata.name ] | length' $node_file`
number_of_unsched=`jq -c '[ .items[] | select(.spec.unschedulable != null) .metadata.name ] | length' $node_file`
number_in_each_status=`jq -c '[ .items[] | .status.conditions[] | select(.type == "Ready") .status \
| gsub("(?<a>.+)"; "number_of_status_\(.a)" ) ] | group_by(.) | map({(.[0]): length}) | add ' $node_file \
| tr -d \"\{\} | sed -e 's/:/=/g'`
number_of_pods_total=`jq -c '[ .items[] .metadata.name ] | length' $pods_file`
number_of_pods_state_Pending=`jq -c '[ .items[] .status.phase | select(. == "Pending")] | length' $pods_file`
number_of_pods_state_Running=`jq -c '[ .items[] .status.phase | select(. == "Running")] | length' $pods_file`
number_of_pods_state_Succeeded=`jq -c '[ .items[] .status.phase | select(. == "Succeeded")] | length' $pods_file`
number_of_pods_state_Failed=`jq -c '[ .items[] .status.phase | select(. == "Failed")] | length' $pods_file`
number_of_pods_state_Unknown=`jq -c '[ .items[] .status.phase | select(. == "Unknown")] | length' $pods_file`
number_of_pods_per_node=`jq -c '[ .items[] | .spec.nodeName ] | group_by(.) | \
map("k8s_pods_per_node,group=k8s_cluster_metrics,pod_node=\(.[0]) value=\(length)")' $pods_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_pods_per_ns=`jq -c '[ .items[] | .metadata.namespace ] | group_by(.) | \
map("k8s_pods_per_namespace,group=k8s_cluster_metrics,ns=\(.[0]) value=\(length)")' $pods_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_endpoints_each_service=`jq -c '[ .items[] | { service: .metadata.name, endpoints: .subsets[] } | \
. as { service: $svc, endpoints: $endp } | $endp.addresses | length | . as $addr | $endp.ports | length | \
. as $prts | "k8s_services,group=k8s_cluster_metrics,service=\($svc) endpoints_number=\($addr * $prts)" ] ' $endpoints_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_endpoints_total=`jq -c '[ .items[] | .subsets[] | { addrs: .addresses, ports: .ports } \
| map (length ) | .[0] * .[1] ] | add' $endpoints_file`
number_of_API_instances=`curl_get "/api/" | jq -c '.serverAddressByClientCIDRs | length'`
number_of_controllers=`curl_get "/api/v1/replicationcontrollers" | jq '.items | length'`
number_of_scheduler_instances=`curl_get /api/v1/namespaces/kube-system/pods?labelSelector='k8s-app=kube-scheduler' \
| jq -c '.items | length' `
cluster_resources_CPU=`jq -c '[ .items[] .status.capacity.cpu | tonumber ] | add' $node_file`
cluster_resources_RAM=`jq -c '[ .items[] .status.capacity.memory| gsub("[a-z]+$"; "" ; "i") | tonumber] | add' $node_file`
# output:
cat << EOF
k8s_nodes,group=k8s_cluster_metrics number_of_nodes_total=${number_of_nodes_total},number_of_unsched=${number_of_unsched}
k8s_nodes_states,group=k8s_cluster_metrics ${number_in_each_status}
k8s_namespaces,group=k8s_cluster_metrics number_of_namespaces_total=${number_of_namespaces_total}
k8s_pods,group=k8s_cluster_metrics number_of_pods_total=${number_of_pods_total}
k8s_pods_states,group=k8s_cluster_metrics number_of_pods_state_Pending=${number_of_pods_state_Pending},number_of_pods_state_Running=${number_of_pods_state_Running},number_of_pods_state_Succeeded=${number_of_pods_state_Succeeded},number_of_pods_state_Failed=${number_of_pods_state_Failed},number_of_pods_state_Unknown=${number_of_pods_state_Unknown}
${number_of_pods_per_node}
${number_of_pods_per_ns}
${number_of_endpoints_each_service}
k8s_services,group=k8s_cluster_metrics number_of_services_total=${number_of_services_total},number_of_endpoints_total=${number_of_endpoints_total}
k8s_number_of_API_instances,group=k8s_cluster_metrics value=${number_of_API_instances}
k8s_number_of_controllers,group=k8s_cluster_metrics value=${number_of_controllers}
k8s_number_of_scheduler_instances,group=k8s_cluster_metrics value=${number_of_scheduler_instances}
k8s_cluster_resources,group=k8s_cluster_metrics cpu_total=${cluster_resources_CPU},ram_total=${cluster_resources_RAM}
EOF
# cleanup
rm -f $node_file $pods_file $endpoints_file

View File

@@ -0,0 +1,15 @@
#!/bin/bash
export LANG=C
PS_ALL=$(ps --no-headers -A -o command | grep -vE '(sh|bash)')
M_NAME=system_openstack_list
MARIADB=$(echo "${PS_ALL}" | grep 'mariadb' | wc -l)
RABBITMQ=$(echo "${PS_ALL}" | grep 'rabbitmq' | wc -l)
KEYSTONE=$(echo "${PS_ALL}" | grep 'keystone' | wc -l)
GLANCE=$(echo "${PS_ALL}" | grep -E '(glance-api|glance-registry)' | wc -l)
CINDER=$(echo "${PS_ALL}" | grep 'cinder' | wc -l)
NOVA=$(echo "${PS_ALL}" | grep -E '(nova-api|nova-conductor|nova-consoleauth|nova-scheduler)' | wc -l)
NEUTRON=$(echo "${PS_ALL}" | grep -E '(neutron-server|neutron-metadata-agent|neutron-dhcp-agent|neutron-l3-agent|neutron-openvswitch-agent)' | wc -l)
OPENVSWITCH=$(echo "${PS_ALL}" | grep -E '(ovsdb-server|ovs-vswitchd|ovsdb-client)' | wc -l)
echo "${M_NAME} mariadb=${MARIADB},rabbitmq=${RABBITMQ},keystone=${KEYSTONE},glance=${GLANCE},cinder=${CINDER},nova=${NOVA},neutron=${NEUTRON},openvswitch=${OPENVSWITCH}"

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Output in MB/s
# echo 0 > /proc/sys/kernel/nmi_watchdog
# modprobe msr
export LANG=C
MEM_BW=$(sudo /opt/telegraf/bin/pcm-memory-one-line.x /csv 1 2>/dev/null | tail -n 1 | awk '{print $28}')
echo "system_memory bandwidth=${MEM_BW}"

View File

@@ -0,0 +1,13 @@
#!/bin/bash
export LANG=C
IFS='
'
SUM_RESV_Q=0
SUM_SEND_Q=0
# -t limits output to TCP sockets; tail skips netstat's two header lines
for i in $(netstat -4 -n -t | tail -n +3); do
RESV_Q=$(echo $i | awk '{print $2}')
SEND_Q=$(echo $i | awk '{print $3}')
SUM_RESV_Q=$((${SUM_RESV_Q} + ${RESV_Q}))
SUM_SEND_Q=$((${SUM_SEND_Q} + ${SEND_Q}))
done
echo "system_tcp_queue sum_recv=${SUM_RESV_Q},sum_send=${SUM_SEND_Q}"

View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -o nounset # Treat unset variables as an error
#set -x
export LANG=C
if [ ! -d '/sys/devices/system/node' ]; then
# This host does not have NUMA
exit 44
fi
ALL_PROCESS="$(ps --no-headers -A -o pid,ucomm)"
for i in $(echo "${ALL_PROCESS}" | awk '{print $1}'); do
if [ -f "/proc/$i/numa_maps" ]; then
NUM_STAT=$(numastat -p $i)
PROC_NAME=$(echo "${ALL_PROCESS}" | grep -E "( $i |^$i )" | awk '{print $2}')
echo "${NUM_STAT}" | grep Huge | awk -v p=$i -v n=$PROC_NAME \
'{printf "system_numa_memory_per_pid,pid="p",name="n" memory_huge="$NF","}'
echo "${NUM_STAT}" | grep Heap | awk '{printf "memory_heap="$NF","}'
echo "${NUM_STAT}" | grep Stack | awk '{printf "memory_stack="$NF","}'
echo "${NUM_STAT}" | grep Private | awk '{print "memory_private="$NF}'
fi
done

View File

@@ -0,0 +1,215 @@
#!/bin/bash
# Variables declaration
WORKDIR="$(cd "$(dirname ${0})" && pwd)"
OS_LOG_PARSER="${WORKDIR}/glog.sh"
TMPDATADIR="${WORKDIR}/data"
TMP_METRICS="${TMPDATADIR}/allmetrics.tmp"
MODE="${MODE:-bg}"
SCRIPT_LOG_DIR="${WORKDIR}/logs"
SCRIPT_LOG_FILE="${SCRIPT_LOG_DIR}/run_results_$(date +%Y-%m-%d).log"
SCRIPT_LOG_LVL=2
K8S_NS="${K8S_NS:-ccp}"
declare -a OSCONTROLLER=(
'cinder-api:1,2,21'
'glance-api:1,2,22'
'heat-api:1,2,22'
'neutron-metadata-agent:1,2,17'
'neutron-server:1,2,22'
'nova-api:1,2,21'
'keystone:4,5,11'
)
declare -a OSCOMPUTE=(
'nova-compute:'
)
# create a subfolder under the working directory
function mk_dir()
{
local newdir="${TMPDATADIR}/${1}"
if [ ! -d "${newdir}" ]; then
mkdir -p ${newdir}
fi
}
# log function
function log()
{
local input
local dtstamp
input="$*"
dtstamp="$(date +%Y-%m-%d_%H%M%S)"
if [ ! -d "${SCRIPT_LOG_DIR}" ]; then
mkdir -p "${SCRIPT_LOG_DIR}"
fi
case "${SCRIPT_LOG_LVL}" in
3)
if [ ! -z "${input}" ]; then
echo "${dtstamp}: ${input}" | tee -a "${SCRIPT_LOG_FILE}"
fi
;;
2)
if [ ! -z "${input}" ]; then
echo "${dtstamp}: ${input}" >> "${SCRIPT_LOG_FILE}"
fi
;;
1)
if [ ! -z "${input}" ]; then
echo "${dtstamp}: ${input}"
fi
;;
*)
;;
esac
}
# get role log-field lists as predefined in OSCONTROLLER & OSCOMPUTE
function get_role()
{
local role
local input
local arr_name
local arr_name_fields
role=${1}
shift
input=$*
case ${role} in
"controller")
for i in $(seq 0 $(( ${#OSCONTROLLER[@]} - 1)))
do
arr_name=$(echo ${OSCONTROLLER[${i}]} | cut -d":" -f1)
arr_name_fields=$(echo ${OSCONTROLLER[${i}]} | cut -d":" -f2)
if [[ "${arr_name}" == "${input}" ]]; then
echo "${arr_name_fields}"
return 0
fi
done
;;
"compute")
for i in $(seq 0 $(( ${#OSCOMPUTE[@]} - 1)))
do
arr_name=$(echo ${OSCOMPUTE[${i}]} | cut -d":" -f1)
arr_name_fields=$(echo ${OSCOMPUTE[${i}]} | cut -d":" -f2)
if [ "${arr_name}" == "${input}" ]; then
echo "${arr_name_fields}"
return 0
fi
done
;;
esac
return 1
}
# diff in seconds
function tdiff()
{
local now
local datetime
local result
datetime="$(date -d "${1}" +%s)"
now="$(date +%s)"
result=$(( ${now} - ${datetime} ))
echo ${result}
}
# lock file function
function glock()
{
local action
local lockfile
local accessdate
local old_in_sec=120
action="${1}"
# lockfile="${TMP_METRICS}.lock"
lockfile="${TMPDATADIR}/allmetrics.tmp.lock"
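# a lock file older than ${old_in_sec}s is treated as stale and silently replaced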
if [[ "${action}" == "lock" && ! -e "${lockfile}" ]]; then
touch "${lockfile}"
elif [[ "${action}" == "lock" && -e "${lockfile}" ]]; then
accessdate="$(stat ${lockfile} | grep Modify | cut -d' ' -f2,3)"
if [ "$(tdiff "${accessdate}")" -ge "${old_in_sec}" ]; then
rm "${lockfile}"
touch "${lockfile}"
else
log "Lock file ${lockfile} exists!"
return 1
fi
else
rm "${lockfile}"
fi
return 0
}
# wait for parsers launched in background mode
function gatherchildren()
{
local childrencount
while true
do
childrencount=$(ps axf | grep ${OS_LOG_PARSER} | grep -v grep | wc -l)
if [ "${childrencount}" -eq 0 ]; then
return
fi
log "Children running ${childrencount}."
sleep 1
done
}
# iterate over the list of running containers
function get_k8s_containers()
{
local cont_host
local cont_pod
local cont_name
local cont_id
local os_log_fields
local cont_tmp_dir
local _raw_data
glock "lock"
if [ "$?" -ne 0 ]; then exit 1;fi
#echo '[' > ${TMP_METRICS}
_raw_data="${TMPDATADIR}/._raw_data"
rm -rf ${_raw_data}
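# one CSV line per non-completed pod: nodeName,podName,containerName(s),containerID(s)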
kubectl get pods -n "${K8S_NS}" -o 'go-template={{range .items}}{{if or (ne .status.phase "Succeeded") (eq .status.phase "Running")}}{{.spec.nodeName}},{{.metadata.name}},{{range .status.containerStatuses}}{{.name}},{{.containerID}}{{end}}{{"\n"}}{{end}}{{end}}' > ${_raw_data}
for data in $(cat ${_raw_data})
do
cont_host=$(echo ${data} | cut -d',' -f1)
cont_pod=$(echo ${data} | cut -d',' -f2)
cont_name=$(echo ${data} | cut -d',' -f3)
cont_id=$(echo ${data} | cut -d',' -f4 | sed 's|^docker://||')
cont_tmp_dir="${cont_host}_${cont_pod}_${cont_name}"
os_log_fields=$(get_role "controller" "${cont_name}")
if [ "$?" -eq 0 ]; then
mk_dir "${cont_tmp_dir}"
export K8S_NS=${K8S_NS}
export TMP_DIR=${TMPDATADIR}/${cont_tmp_dir}
# export TMP_METRICS=${TMP_METRICS}
export TMP_METRICS="${TMPDATADIR}/results/${cont_pod}.tmp"
export CONTID=${cont_id}
export CONTAINER=${cont_name}
export HOST=${cont_host}
export POD=${cont_pod}
export OS_LOG_FIELDS=${os_log_fields}
log "MODE=${MODE} CONTID=${cont_id} TMP_METRICS=${TMP_METRICS} ROLE=controller HOST=${cont_host} POD=${cont_pod} CONTAINER=${cont_name} OS_LOG_FIELDS=${os_log_fields} TMP_DIR=${TMPDATADIR}/${cont_tmp_dir} K8S_NS=${K8S_NS} ${OS_LOG_PARSER}"
if [[ "${MODE}" == "bg" ]]; then
log "${cont_pod} ${cont_name} ${cont_id}"
${OS_LOG_PARSER} &
else
${OS_LOG_PARSER}
fi
unset TMP_METRICS
unset CONTID
unset CONTAINER
unset POD
unset OS_LOG_FIELDS
unset HOST
fi
# os_log_fields=$(get_role "compute" "${cont_name}")
# if [ "$?" -eq 0 ]; then
# mk_dir "${cont_tmp_dir}"
# log "ROLE=compute HOST=${cont_host} POD=${cont_pod} CONTAINER=${cont_name} OS_LOG_FIELDS=${os_log_fields} TMP_DIR=${TMPDATADIR}/${cont_tmp_dir} K8S_NS=${K8S_NS} ${OS_LOG_PARSER}"
# fi
done
gatherchildren
if [ "$(ls ${TMPDATADIR}/results/ | wc -l)" -gt 0 ]; then
cat ${TMPDATADIR}/results/*.tmp
log "Resulting lines $(cat ${TMPDATADIR}/results/*.tmp | wc -l)"
rm -rf ${TMPDATADIR}/results/*
fi
glock "unlock"
}
# Main logic
mk_dir
mk_dir "results"
get_k8s_containers

View File

@@ -0,0 +1,6 @@
#!/bin/bash
export LANG=C
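# pidstat one-shot sample: with LANG=C, field 4 is %usr, field 5 is %system and
# field 9 is the command name (field positions may shift between sysstat versions)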
for i in $(ps --no-headers -A -o pid); do
pidstat -p $i | tail -n 1 | grep -v PID | awk '{print "system_per_process_cpu_usage,process="$9" user="$4",system="$5}'
done

View File

@@ -0,0 +1,12 @@
#!/bin/bash
#
WORKDIR="$(cd "$(dirname ${0})" && pwd)"
SCRIPT="${WORKDIR}/$(basename ${0})"
MYSQLUSER="nova"
MYSQLPASSWD="password"
MYSQLHOST="mariadb.ccp"
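# average (launched_at - created_at) in seconds across instances launched during
# the last 30 seconds, excluding errored VMs; printed below in influx line protocol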
avgdata=$(mysql -u${MYSQLUSER} -p${MYSQLPASSWD} -h ${MYSQLHOST} -D nova --skip-column-names --batch -e "select diff from (select avg(unix_timestamp(launched_at) - unix_timestamp(created_at)) as diff from instances where vm_state != 'error' and launched_at >= subtime(now(),'30')) t1 where diff IS NOT NULL;" 2>/dev/null | sed 's/\t/,/g';)
if [ ! -z "${avgdata}" ]; then
echo "vm_spawn_avg_time timediffinsec=${avgdata}"
fi

View File

@@ -0,0 +1,116 @@
[global_tags]
metrics_source="system_openstack"
[agent]
interval = "10s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "15s"
flush_jitter = "5s"
precision = ""
debug = false
quiet = false
hostname = ""
omit_hostname = false
[[outputs.prometheus_client]]
listen = ":9126"
[[inputs.cpu]]
percpu = true
totalcpu = true
fielddrop = ["time_*"]
[[inputs.disk]]
ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.kernel_vmstat]]
[[inputs.net]]
[[inputs.netstat]]
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/vmtime.sh",
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "30s"
commands = [
"/opt/telegraf/bin/osapitime.sh",
]
timeout = "60s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/etcd_get_metrics.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/k8s_get_metrics.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.openstack]]
interval = '40s'
identity_endpoint = "http://keystone.ccp.svc.cluster.local:5000/v3"
domain = "default"
project = "admin"
username = "admin"
password = "password"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/iostat_per_device.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/per_process_cpu_usage.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/entropy.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "60s"
commands = [
"/opt/telegraf/bin/numa_stat_per_pid.sh"
]
timeout = "60s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/memory_bandwidth.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/list_openstack_processes.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/network_tcp_queue.sh"
]
timeout = "30s"
data_format = "influx"

View File

@@ -0,0 +1,81 @@
[global_tags]
metrics_source="system"
[agent]
interval = "10s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "15s"
flush_jitter = "5s"
precision = ""
debug = false
quiet = false
hostname = ""
omit_hostname = false
[[outputs.prometheus_client]]
listen = ":9126"
[[inputs.cpu]]
percpu = true
totalcpu = true
fielddrop = ["time_*"]
[[inputs.disk]]
ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.kernel_vmstat]]
[[inputs.net]]
[[inputs.netstat]]
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/iostat_per_device.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/per_process_cpu_usage.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/entropy.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "60s"
commands = [
"/opt/telegraf/bin/numa_stat_per_pid.sh"
]
timeout = "60s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/memory_bandwidth.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/list_openstack_processes.sh"
]
timeout = "30s"
data_format = "influx"
[[inputs.exec]]
interval = "15s"
commands = [
"/opt/telegraf/bin/network_tcp_queue.sh"
]
timeout = "30s"
data_format = "influx"

View File

@@ -0,0 +1,948 @@
.. _Methodology_for_Containerized_Openstack_Monitoring:
**************************************************
Methodology for Containerized Openstack Monitoring
**************************************************
:Abstract:
This document describes a Containerized Openstack monitoring solution with a
scalable, comprehensive architecture that captures all crucial performance
metrics at each layer of the stack.
Containerized Openstack Monitoring Architecture
===============================================
This part of the documentation describes the required performance metrics at each
distinct Containerized Openstack layer.
Containerized Openstack comprises three layers at which the monitoring system
must be able to query all necessary counters:
- OS layer
- Kubernetes layer
- Openstack layer
Monitoring instruments must be logically divided into two groups:
- Monitoring Server Side
- Node Client Side
Operating System Layer
----------------------
We used Ubuntu Xenial on top of bare-metal servers for both the server and the node side.
Baremetal hardware description
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We deployed everything on a 200-server environment with the following hardware characteristics:
.. table::
+-------+----------------+------------------------+
|server |vendor,model |HP,DL380 Gen9 |
+-------+----------------+------------------------+
|CPU |vendor,model |Intel,E5-2680 v3 |
| +----------------+------------------------+
| |processor_count |2 |
| +----------------+------------------------+
| |core_count |12 |
| +----------------+------------------------+
| |frequency_MHz |2500 |
+-------+----------------+------------------------+
|RAM |vendor,model |HP,752369-081 |
| +----------------+------------------------+
| |amount_MB |262144 |
+-------+----------------+------------------------+
|NETWORK|interface_name |p1p1 |
| +----------------+------------------------+
| |vendor,model |Intel,X710 Dual Port |
| +----------------+------------------------+
| |bandwidth |10G |
+-------+----------------+------------------------+
|STORAGE|dev_name |/dev/sda |
| +----------------+------------------------+
| |vendor,model | | raid10 - HP P840 |
| | | | 12 disks EH0600JEDHE |
| +----------------+------------------------+
| |SSD/HDD |HDD |
| +----------------+------------------------+
| |size | 3.6TB |
+-------+----------------+------------------------+
Operating system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Baremetal nodes were provisioned via Cobbler with our in-house preseed scripts.
OS versions we used:
.. table:: Versions Operating Systems
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|Ubuntu |Ubuntu 16.04.1 LTS |
+--------------------+-----------------------------------------+
|Kernel |4.4.0-47-generic |
+--------------------+-----------------------------------------+
You can find the /etc folder contents from one of the typical systems we used:
:download:`etc_tarball <configs/node1.tar.gz>`
Required system metrics
^^^^^^^^^^^^^^^^^^^^^^^
At this layer we must track the following processes:
.. table::
+------------------------+-----------------------------------------+
|List of processes |Mariadb |
| +-----------------------------------------+
| |Rabbitmq |
| +-----------------------------------------+
| |Keystone |
| +-----------------------------------------+
| |Glance |
| +-----------------------------------------+
| |Cinder |
| +-----------------------------------------+
| |Nova |
| +-----------------------------------------+
| |Neutron |
| +-----------------------------------------+
| |Openvswitch |
| +-----------------------------------------+
| |Kubernetes |
+------------------------+-----------------------------------------+
And the following list of metrics:
.. table::
+------------------------+-----------------------------------------+
|Node load average |1min |
| +-----------------------------------------+
| |5min |
| +-----------------------------------------+
| |15min |
+------------------------+-----------------------------------------+
|Global process stats |Running |
| +-----------------------------------------+
| |Stopped |
| +-----------------------------------------+
| |Waiting |
+------------------------+-----------------------------------------+
|Global CPU Usage | Steal |
| +-----------------------------------------+
| | Wait |
| +-----------------------------------------+
| | User |
| +-----------------------------------------+
| | System |
| +-----------------------------------------+
| | Interrupt |
| +-----------------------------------------+
| | Nice |
| +-----------------------------------------+
| | Idle |
+------------------------+-----------------------------------------+
|Per CPU Usage | User |
| +-----------------------------------------+
| | System |
+------------------------+-----------------------------------------+
|Global memory usage |bandwidth |
| +-----------------------------------------+
| |Cached |
| +-----------------------------------------+
| |Buffered |
| +-----------------------------------------+
| |Free |
| +-----------------------------------------+
| |Used |
| +-----------------------------------------+
| |Total |
+------------------------+-----------------------------------------+
|Numa monitoring |Numa_hit |
|For each node +-----------------------------------------+
| |Numa_miss |
| +-----------------------------------------+
| |Numa_foreign |
| +-----------------------------------------+
| |Local_node |
| +-----------------------------------------+
| |Other_node |
+------------------------+-----------------------------------------+
|Numa monitoring |Huge |
|For each pid +-----------------------------------------+
| |Heap |
| +-----------------------------------------+
| |Stack |
| +-----------------------------------------+
| |Private |
+------------------------+-----------------------------------------+
|Global IOSTAT \+ |Merge reads /s |
|Per device IOSTAT +-----------------------------------------+
| |Merge write /s |
| +-----------------------------------------+
| |read/s |
| +-----------------------------------------+
| |write/s |
| +-----------------------------------------+
| |Read transfer |
| +-----------------------------------------+
| |Write transfer |
| +-----------------------------------------+
| |Read latency |
| +-----------------------------------------+
| |Write latency |
| +-----------------------------------------+
| |Write transfer |
| +-----------------------------------------+
| |Queue size |
| +-----------------------------------------+
| |Await |
+------------------------+-----------------------------------------+
|Network per interface |Octets /s (in, out) |
| +-----------------------------------------+
| |Packet /s (in, out) |
| +-----------------------------------------+
| |Dropped /s |
+------------------------+-----------------------------------------+
|Other system metrics |Entropy |
| +-----------------------------------------+
| |DF per device |
+------------------------+-----------------------------------------+
Kubernetes Layer
----------------
`Kargo`_ from `Fuel-CCP-Installer`_ was our main tool to deploy K8S
on top of the provisioned systems (monitored nodes).
Kargo sets up Kubernetes in the following way:
- masters: Calico, Kubernetes API services
- nodes: Calico, Kubernetes minion services
- etcd: etcd service
Kargo deployment parameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can find the Kargo deployment script in the `Kargo deployment script`_ section
.. code:: bash
docker_options: "--insecure-registry 172.20.8.35:5000 -D"
upstream_dns_servers: [172.20.8.34, 8.8.4.4]
nameservers: [172.20.8.34, 8.8.4.4]
kube_service_addresses: 10.224.0.0/12
kube_pods_subnet: 10.240.0.0/12
kube_network_node_prefix: 22
kube_apiserver_insecure_bind_address: "0.0.0.0"
dns_replicas: 3
dns_cpu_limit: "100m"
dns_memory_limit: "512Mi"
dns_cpu_requests: "70m"
dns_memory_requests: "70Mi"
deploy_netchecker: false
.. table::
+----------------------+-----------------------------------------+
|Software |Version |
+----------------------+-----------------------------------------+
|`Fuel-CCP-Installer`_ |6fd81252cb2d2c804f388337aa67d4403700f094 |
| | |
+----------------------+-----------------------------------------+
|`Kargo`_ |2c23027794d7851ee31363c5b6594180741ee923 |
+----------------------+-----------------------------------------+
Required K8S metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here we should get K8S health
metrics and ETCD performance metrics:
.. table::
+------------------------+-----------------------------------------+
|ETCD performance metrics|members count / states |
| +-----------------------------------------+
| |number of keys in a cluster |
| +-----------------------------------------+
| |Size of data set |
| +-----------------------------------------+
| |Avg. latency from leader to followers |
| +-----------------------------------------+
| |Bandwidth rate, send/receive |
| +-----------------------------------------+
| |Create store success/fail |
| +-----------------------------------------+
| |Get success/fail |
| +-----------------------------------------+
| |Set success/fail |
| +-----------------------------------------+
| |Package rate, send/receive |
| +-----------------------------------------+
| |Expire count |
| +-----------------------------------------+
| |Update success/fail |
| +-----------------------------------------+
| |Compare-and-swap success/fail |
| +-----------------------------------------+
| |Watchers |
| +-----------------------------------------+
| |Delete success/fail |
| +-----------------------------------------+
| |Compare-and-delete success/fail |
| +-----------------------------------------+
| |Append requests, send/receive |
+------------------------+-----------------------------------------+
|K8S health metrics |Number of nodes in each state |
| +-----------------------------------------+
| |Total number of namespaces |
| +-----------------------------------------+
| |Total number of PODs per cluster,node,ns |
| +-----------------------------------------+
| |Total number of services |
| +-----------------------------------------+
| |Endpoints in each service |
| +-----------------------------------------+
| |Number of API service instances |
| +-----------------------------------------+
| |Number of controller instances |
| +-----------------------------------------+
| |Number of scheduler instances |
| +-----------------------------------------+
| |Cluster resources, scheduler view |
+------------------------+-----------------------------------------+
|K8S API log analysis |Number of responses (per each HTTP code) |
| +-----------------------------------------+
| |Response Time |
+------------------------+-----------------------------------------+
For the last two metrics we should utilize a log collector to store and parse all
log records within the K8S environment.
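As an illustration of the kind of aggregation this pipeline performs, the following hedged sketch counts kube-apiserver responses per HTTP status code straight from a raw log file (the path and the `code:` field format are assumptions; in our setup Heka ships these records to ElasticSearch, as described in the Implementation part):

.. code-block:: bash

    # hypothetical sketch: tally kube-apiserver responses per HTTP status code
    grep -oE 'code: *[0-9]{3}' /var/log/kube-apiserver.log \
        | awk '{print $NF}' | sort | uniq -c | sort -rn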
Openstack Layer
-----------------
CCP stands for "Containerized Control Plane". CCP aims to build, run and manage
production-ready OpenStack containers on top of Kubernetes cluster.
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Fuel-CCP`_ |8570d0e0e512bd16f8449f0a10b1e3900fd09b2d |
+--------------------+-----------------------------------------+
CCP configuration
^^^^^^^^^^^^^^^^^
CCP was deployed on top of a 200-node K8S cluster in the following configuration:
.. code-block:: yaml
node[1-3]: Kubernetes
node([4-6])$: # 4-6
roles:
- controller
- openvswitch
node[7-9]$: # 7-9
roles:
- rabbitmq
node10$: # 10
roles:
- galera
node11$: # 11
roles:
- heat
node(1[2-9])$: # 12-19
roles:
- compute
- openvswitch
node[2-9][0-9]$: # 20-99
roles:
- compute
- openvswitch
node(1[0-9][0-9])$: # 100-199
roles:
- compute
- openvswitch
node200$:
roles:
- backup
CCP Openstack services list ( `versions.yaml`_ ):
.. code-block:: yaml
openstack/cinder:
git_ref: stable/newton
git_url: https://github.com/openstack/cinder.git
openstack/glance:
git_ref: stable/newton
git_url: https://github.com/openstack/glance.git
openstack/heat:
git_ref: stable/newton
git_url: https://github.com/openstack/heat.git
openstack/horizon:
git_ref: stable/newton
git_url: https://github.com/openstack/horizon.git
openstack/keystone:
git_ref: stable/newton
git_url: https://github.com/openstack/keystone.git
openstack/neutron:
git_ref: stable/newton
git_url: https://github.com/openstack/neutron.git
openstack/nova:
git_ref: stable/newton
git_url: https://github.com/openstack/nova.git
openstack/requirements:
git_ref: stable/newton
git_url: https://git.openstack.org/openstack/requirements.git
openstack/sahara-dashboard:
git_ref: stable/newton
git_url: https://git.openstack.org/openstack/sahara-dashboard.git
`K8S Ingress Resources`_ rules were enabled during CCP deployment to expose Openstack service
endpoints to the external routable network.
See the CCP deployment script and configuration files in the
`CCP deployment and configuration files`_ section.
Required Openstack-related metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
At this layer we should get openstack environment metrics,
API and resources utilization metrics.
.. table::
+------------------------+-----------------------------------------+
|Openstack metrics |Total number of controller nodes |
| +-----------------------------------------+
| |Total number of services |
| +-----------------------------------------+
| |Total number of compute nodes |
| +-----------------------------------------+
| |Total number of nodes |
| +-----------------------------------------+
| |Total number of VMs |
| +-----------------------------------------+
| |Number of VMs per tenant, per node |
| +-----------------------------------------+
| |Resource utilization per project,service |
| +-----------------------------------------+
| |Total number of tenants |
| +-----------------------------------------+
| |API request time |
| +-----------------------------------------+
| |Mean time to spawn VM |
+------------------------+-----------------------------------------+
Implementation
==============
This part of the documentation describes the monitoring system implementation.
Here is the list of software we chose to accomplish all required tasks:
.. table::
+-----------------------------------------+-----------------------------------------+
|Monitoring Node Server Side |Monitored Node Client Side |
+--------------------+--------------------+--------------------+--------------------+
|Metrics server |Log storage |Metrics agent |Log collector |
| | | | |
+--------------------+--------------------+--------------------+--------------------+
| `Prometheus`_ \+ | `ElasticSearch`_ |`Telegraf`_ | `Heka`_ |
| `Grafana`_ | \+ `Kibana`_ | | |
+--------------------+--------------------+--------------------+--------------------+
Server Side Software
---------------------
Prometheus
^^^^^^^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Prometheus GitHub`_|7e369b9318a4d5d97a004586a99f10fa51a46b26 |
+--------------------+-----------------------------------------+
Due to the high load rate we hit Prometheus performance issues once the metrics count approached 15 million,
so we split the Prometheus setup across two standalone nodes. The first node polls API metrics from the
K8S-related services that natively expose them at the `/metrics` URI (the K8S API and the ETCD API do so by default).
The second node stores all other metrics, which are collected and calculated locally on the environment
servers via Telegraf.
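To make the split concrete, here is a hedged sketch of what each node ends up scraping. The `:9126` listener comes from the `outputs.prometheus_client` section of our Telegraf configs; the host names and the other ports below are placeholders:

.. code-block:: bash

    # node 1: K8S API and ETCD expose Prometheus-format metrics natively
    curl -s http://kube-apiserver.example:8080/metrics | head
    curl -s http://etcd-member.example:2379/metrics | head
    # node 2: every monitored host runs Telegraf with outputs.prometheus_client
    curl -s http://monitored-node.example:9126/metrics | head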
Prometheus node deployment scripts and configuration files can be found in the `Prometheus deployment and configuration files`_ section
Grafana
^^^^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Grafana`_ |v4.0.1 |
+--------------------+-----------------------------------------+
Grafana was used as the metrics visualizer, with a separate dashboard built
for each group of metrics:
- System nodes metrics
- Kubernetes metrics
- ETCD metrics
- Openstack metrics
You can find their settings in `Grafana dashboards configuration`_
Grafana server deployment script:
.. code-block:: bash
#!/bin/bash
ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "grafana"
It uses the same YAML configuration file, `deploy-graf-prom.yaml`_, from the `Prometheus deployment and configuration files`_ section.
ElasticSearch
^^^^^^^^^^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`ElasticSearch`_ |2.4.2 |
+--------------------+-----------------------------------------+
ElasticSearch is a well-known, proven log store, and we used it as a standalone
node for collecting the Kubernetes API logs and all other container logs across the environment.
For adequate performance on the 200-node lab we increased `ES_HEAP_SIZE` from the default 1G to 10G
in the /etc/default/elasticsearch configuration file.
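A minimal sketch of that change, assuming the stock commented-out default shipped by the Debian/Ubuntu packaging:

.. code-block:: bash

    # raise the ElasticSearch heap from the default 1g to 10g and apply it
    sudo sed -i 's/^#\?ES_HEAP_SIZE=.*/ES_HEAP_SIZE=10g/' /etc/default/elasticsearch
    sudo service elasticsearch restart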
ElasticSearch and the Kibana dashboard were installed with the
`deploy_elasticsearch_kibana.sh`_ deployment script.
Kibana
^^^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Kibana`_ |4.5.4 |
+--------------------+-----------------------------------------+
We used Kibana as the main visualization tool for ElasticSearch and were able to build charts
based on K8S API log analysis. Kibana was installed on a separate node
with a single dashboard showing the K8S API response time graph.
Dashboard settings:
:download:`Kibana_dashboard.json <configs/dashboards/Kibana_dashboard.json>`
Client Side Software
--------------------
Telegraf
^^^^^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Telegraf`_ |v1.0.0-beta2-235-gbc14ac5 |
| |git: openstack_stats |
| |bc14ac5b9475a59504b463ad8f82ed810feed3ec |
+--------------------+-----------------------------------------+
Telegraf was chosen as the client-side metrics agent. It provides multiple ways to poll and compute metrics from a
variety of sources: thanks to its plugin-driven design, it takes data from different inputs and
exposes the calculated metrics in Prometheus format. We used a forked version of Telegraf with custom patches to
be able to utilize a custom Openstack input plugin:
- `GitHub Telegraf Fork`_
- `Go SDK for OpenStack`_
The following automation scripts and configuration files were used to start the Telegraf agent
across the environment nodes.
`Telegraf deployment and configuration files`_
Below you can see which plugins were used to obtain metrics.
Standard Plugins
""""""""""""""""
.. code:: bash
inputs.cpu
inputs.disk
inputs.diskio
inputs.kernel
inputs.mem
inputs.processes
inputs.swap
inputs.system
inputs.kernel_vmstat
inputs.net
inputs.netstat
inputs.exec
Openstack input plugin
""""""""""""""""""""""
The custom `inputs.openstack` plugin was used to gather most of the required Openstack-related metrics.
settings:
.. code:: bash
interval = '40s'
identity_endpoint = "http://keystone.ccp.svc.cluster.local:5000/v3"
domain = "default"
project = "admin"
username = "admin"
password = "password"
`inputs.exec` plugin
""""""""""""""""""""
The `inputs.exec` plugin was used to trigger the scripts that poll
and calculate all non-standard metrics.
common settings:
.. code:: bash
interval = "15s"
timeout = "30s"
data_format = "influx"
commands:
.. code:: bash
"/opt/telegraf/bin/list_openstack_processes.sh"
"/opt/telegraf/bin/per_process_cpu_usage.sh"
"/opt/telegraf/bin/numa_stat_per_pid.sh"
"/opt/telegraf/bin/iostat_per_device.sh"
"/opt/telegraf/bin/memory_bandwidth.sh"
"/opt/telegraf/bin/network_tcp_queue.sh"
"/opt/telegraf/bin/etcd_get_metrics.sh"
"/opt/telegraf/bin/k8s_get_metrics.sh"
"/opt/telegraf/bin/vmtime.sh"
"/opt/telegraf/bin/osapitime.sh"
You can see the full Telegraf configuration file and its custom input scripts in the
`Telegraf deployment and configuration files`_ section.
Heka
^^^^
.. table::
+--------------------+-----------------------------------------+
|Software |Version |
+--------------------+-----------------------------------------+
|`Heka`_ |0.10.0 |
+--------------------+-----------------------------------------+
We chose Heka as the log-collecting agent for its wide variety of inputs
(including the ability to feed data from the Docker socket), its filters (custom sandboxed filters written in Lua),
and its ability to encode data for ElasticSearch.
With the Heka agent running across the environment servers we were able to ship container logs to the ElasticSearch
server. With a custom Lua filter we extracted the K8S API data and converted it into an appropriate format to
visualize API timing counters (average response time).
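Once the records are in ElasticSearch they can also be inspected directly; a hedged query sketch (the index and field names are assumptions that depend on the Heka ElasticSearch encoder settings):

.. code-block:: bash

    curl -s 'http://elasticsearch.example:9200/heka-*/_search?size=3&sort=Timestamp:desc&q=ProgramName:kube-apiserver'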
Heka deployment scripts and the configuration file with the custom Lua filter are in the
`Heka deployment and configuration`_ section.
Applications
============
Kargo deployment script
-----------------------
deploy_k8s_using_kargo.sh
^^^^^^^^^^^^^^^^^^^^^^^^^
.. literalinclude:: configs/deploy_k8s_using_kargo.sh
:language: bash
CCP deployment and configuration files
---------------------------------------
deploy-ccp.sh
^^^^^^^^^^^^^
.. literalinclude:: configs/ccp/deploy-ccp.sh
:language: bash
ccp.yaml
^^^^^^^^
.. literalinclude:: configs/ccp/ccp.yaml
:language: yaml
configs.yaml
^^^^^^^^^^^^
.. literalinclude:: configs/ccp/configs.yaml
:language: yaml
topology.yaml
^^^^^^^^^^^^^
.. literalinclude:: configs/ccp/topology.yaml
:language: yaml
repos.yaml
^^^^^^^^^^
.. literalinclude:: configs/ccp/repos.yaml
:language: yaml
versions.yaml
^^^^^^^^^^^^^
.. literalinclude:: configs/ccp/versions.yaml
:language: yaml
Prometheus deployment and configuration files
---------------------------------------------
Deployment scripts
^^^^^^^^^^^^^^^^^^
deploy_prometheus.sh
""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_prometheus.sh
:language: bash
deploy-graf-prom.yaml
"""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/deploy-graf-prom.yaml
:language: yaml
docker_prometheus.yaml
""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/docker_prometheus.yaml
:language: yaml
deploy_etcd_collect.sh
""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_etcd_collect.sh
:language: bash
Configuration files
^^^^^^^^^^^^^^^^^^^
prometheus-kuber.yml.j2
"""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/prometheus-kuber.yml.j2
:language: bash
prometheus-system.yml.j2
""""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/prometheus-system.yml.j2
:language: bash
targets.yml.j2
""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/targets.yml.j2
:language: bash
Grafana dashboards configuration
--------------------------------
:download:`Systems_nodes_statistics.json <configs/dashboards/Systems_nodes_statistics.json>`
:download:`Kubernetes_statistics.json <configs/dashboards/Kubernetes_statistics.json>`
:download:`ETCD.json <configs/dashboards/ETCD.json>`
:download:`OpenStack.json <configs/dashboards/OpenStack.json>`
ElasticSearch deployment script
-------------------------------
deploy_elasticsearch_kibana.sh
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. literalinclude:: configs/elasticsearch-heka/deploy_elasticsearch_kibana.sh
:language: bash
Telegraf deployment and configuration files
-------------------------------------------
deploy_telegraf.sh
^^^^^^^^^^^^^^^^^^
.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_telegraf.sh
:language: bash
deploy-telegraf.yaml
^^^^^^^^^^^^^^^^^^^^
.. literalinclude:: configs/prometheus-grafana-telegraf/deploy-telegraf.yaml
:language: yaml
Telegraf system
^^^^^^^^^^^^^^^
telegraf-sys.conf
"""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/telegraf-sys.conf
:language: bash
Telegraf openstack
^^^^^^^^^^^^^^^^^^^
telegraf-openstack.conf.j2
""""""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/telegraf-openstack.conf.j2
:language: bash
Telegraf inputs scripts
^^^^^^^^^^^^^^^^^^^^^^^
list_openstack_processes.sh
"""""""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/list_openstack_processes.sh
:language: bash
per_process_cpu_usage.sh
""""""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/per_process_cpu_usage.sh
:language: bash
numa_stat_per_pid.sh
""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/numa_stat_per_pid.sh
:language: bash
iostat_per_device.sh
""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/iostat_per_device.sh
:language: bash
memory_bandwidth.sh
"""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/memory_bandwidth.sh
:language: bash
network_tcp_queue.sh
""""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/network_tcp_queue.sh
:language: bash
etcd_get_metrics.sh
"""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/etcd_get_metrics.sh
:language: bash
k8s_get_metrics.sh
""""""""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/k8s_get_metrics.sh
:language: bash
vmtime.sh
"""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/vmtime.sh
:language: bash
osapitime.sh
""""""""""""
.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/osapitime.sh
:language: bash
Heka deployment and configuration
---------------------------------
Deployment
^^^^^^^^^^
deploy_heka.sh
""""""""""""""
.. literalinclude:: configs/elasticsearch-heka/deploy_heka.sh
:language: bash
deploy-heka.yaml
""""""""""""""""
.. literalinclude:: configs/elasticsearch-heka/deploy-heka.yaml
:language: yaml
Configuration
^^^^^^^^^^^^^
00-hekad.toml.j2
""""""""""""""""
.. literalinclude:: configs/elasticsearch-heka/heka/00-hekad.toml.j2
:language: bash
kubeapi_to_int.lua.j2
"""""""""""""""""""""
.. literalinclude:: configs/elasticsearch-heka/heka/kubeapi_to_int.lua.j2
:language: bash
.. references:
.. _Fuel-CCP-Installer: https://github.com/openstack/fuel-ccp-installer
.. _Kargo: https://github.com/kubernetes-incubator/kargo.git
.. _Fuel-CCP: https://github.com/openstack/fuel-ccp
.. _Prometheus: https://prometheus.io/
.. _Prometheus GitHub: https://github.com/prometheus/prometheus
.. _Grafana: http://grafana.org/
.. _ElasticSearch: https://www.elastic.co/products/elasticsearch
.. _Kibana: https://www.elastic.co/products/kibana
.. _Telegraf: https://www.influxdata.com/time-series-platform/telegraf/
.. _GitHub Telegraf Fork: https://github.com/spjmurray/telegraf/tree/openstack_stats/plugins/inputs/openstack
.. _Go SDK for OpenStack: https://github.com/rackspace/gophercloud/
.. _Heka: https://hekad.readthedocs.io/en/v0.10.0/
.. _K8S Ingress Resources: http://kubernetes.io/docs/user-guide/ingress/