Pacemaker Monitoring
This patch introduces pacemaker monitoring in Browbeat. The 'pcs status' output is scraped and processed through a collectd python plugin, and metrics are visualized in Grafana panels. Change-Id: I587d186a068f721c65f85e6069e4beaa98bacde7
This commit is contained in:
parent
fde79f74c1
commit
8ad0bce482
@ -319,6 +319,13 @@ ovs_flows_monitoring: false
|
||||
# before enabling this plugin.
|
||||
ovn_monitoring: false
|
||||
|
||||
#######################
|
||||
# Pacemaker Monitoring
|
||||
#######################
|
||||
# Monitors pcs status metrics on controller.
|
||||
pacemaker_monitoring: true
|
||||
pacemaker_controller_collectd_interval: 15
|
||||
|
||||
controller_monitored_ints:
|
||||
- "tap"
|
||||
|
||||
|
@ -230,6 +230,8 @@
|
||||
dest: /usr/local/bin/collectd_rabbitmq_monitoring.py
|
||||
- src: collectd_swift_stat.py
|
||||
dest: /usr/local/bin/collectd_swift_stat.py
|
||||
- src: collectd_pacemaker_monitoring.py
|
||||
dest: /usr/local/bin/collectd_pacemaker_monitoring.py
|
||||
when: "('Controller' in group_names and inventory_hostname == groups['Controller'][0]) or ('Undercloud' in group_names)"
|
||||
|
||||
- name: Copy python plugins
|
||||
|
@ -546,6 +546,19 @@ PreCacheChain "PreCache"
|
||||
</Plugin>
|
||||
|
||||
{% endif %}
|
||||
|
||||
{%if pacemaker_monitoring %}
|
||||
<Plugin python>
|
||||
ModulePath "/usr/local/bin/"
|
||||
Import "collectd_pacemaker_monitoring"
|
||||
|
||||
<Module collectd_pacemaker_monitoring>
|
||||
Interval {{pacemaker_controller_collectd_interval}}
|
||||
</Module>
|
||||
</Plugin>
|
||||
|
||||
{% endif %}
|
||||
|
||||
{%if gnocchi_status_controller_collectd_plugin %}
|
||||
{%if inventory_hostname == groups['Controller'][0] %}
|
||||
<Plugin python>
|
||||
|
@ -83,6 +83,45 @@
|
||||
dest: "{{ browbeat_containers_path }}/collectd-openstack/config/collectd.conf"
|
||||
when: "config_type == 'compute' and ovn_compute_collectd_plugin"
|
||||
|
||||
# Create the FIFO through which the collectd container asks the host to run
# pcs commands.
- name: Create pipe for to use pcs cli inside collectd container
  shell: |
    mkfifo /home/"{{ host_remote_user }}"/collectd_pipe
  args:
    # Skip the command when the FIFO already exists, so re-running the
    # playbook is a no-op instead of an ignored "File exists" failure.
    creates: "/home/{{ host_remote_user }}/collectd_pipe"
  # The task already runs as root through become, so no explicit sudo is
  # needed inside the command.
  become: yes
  become_user: root
  ignore_errors: yes
  when: "config_type == 'controller' and pacemaker_monitoring"
|
||||
|
||||
# Install the host-side helper that evaluates whatever command is written to
# the FIFO. One copy task creates the file, writes its content and sets its
# permissions, so the play no longer reports "changed" on every run the way
# a separate "state: touch" task does.
# NOTE(security): the helper eval's pipe input as root; make sure only
# trusted users can write to (or replace) the FIFO.
- name: Create script to run eval on pcs pipe
  copy:
    dest: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh"
    mode: 0775
    owner: root
    content: |
      #!/bin/sh
      while true; do eval "$(cat /home/{{ host_remote_user }}/collectd_pipe)"; done
  become: yes
  become_user: root
  ignore_errors: yes
  when: "config_type == 'controller' and pacemaker_monitoring"
|
||||
|
||||
# Start the background loop that serves the FIFO. The pidfile guard keeps a
# re-run of the playbook from starting a second loop, which would compete
# for the FIFO and truncate the log file the first loop is still writing to.
- name: Run script to run eval on pcs pipe
  shell: |
    pidfile=/var/run/collectd_pipe_eval.pid
    if [ -f "$pidfile" ] && kill -0 "$(cat "$pidfile")" 2>/dev/null; then
      exit 0
    fi
    nohup ./collectd_pipe_eval.sh > /var/log/containers/stdouts/collectd_pacemaker.out 2>&1 &
    echo $! > "$pidfile"
  become: yes
  become_user: root
  ignore_errors: yes
  args:
    chdir: "/home/{{ host_remote_user }}"
  when: "config_type == 'controller' and pacemaker_monitoring"
|
||||
|
||||
- name: Build and Run container
|
||||
block:
|
||||
- name: Build collectd-openstack container
|
||||
@ -104,10 +143,15 @@
|
||||
{% if ovs_flows_monitoring %}
|
||||
-v /etc/openvswitch/:/etc/openvswitch/ -v /var/run/openvswitch/:/var/run/openvswitch/ \
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if config_type == 'controller' %}
|
||||
{% if ovn_monitoring %}
|
||||
-v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \
|
||||
-v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \
|
||||
{% endif %}
|
||||
{% if pacemaker_monitoring %}
|
||||
-v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
collectd-openstack
|
||||
become: yes
|
||||
|
@ -538,6 +538,19 @@ PreCacheChain "PreCache"
|
||||
</Plugin>
|
||||
|
||||
{% endif %}
|
||||
|
||||
{%if pacemaker_monitoring %}
|
||||
<Plugin python>
|
||||
ModulePath "/usr/local/bin/"
|
||||
Import "collectd_pacemaker_monitoring"
|
||||
|
||||
<Module collectd_pacemaker_monitoring>
|
||||
Interval {{pacemaker_controller_collectd_interval}}
|
||||
</Module>
|
||||
</Plugin>
|
||||
|
||||
{% endif %}
|
||||
|
||||
{%if gnocchi_status_controller_collectd_plugin %}
|
||||
{%if inventory_hostname == groups['Controller'][0] %}
|
||||
<Plugin python>
|
||||
|
@ -166,6 +166,8 @@ dashboard:
|
||||
{% include 'partials/osp_response_times.yaml' %}
|
||||
|
||||
{% include 'partials/ovn_db_tables.yaml' %}
|
||||
|
||||
{% include 'partials/pacemaker_monitoring.yaml' %}
|
||||
{% endif %}
|
||||
|
||||
{% include 'partials/ovn_metrics.yaml' %}
|
||||
|
@ -0,0 +1,102 @@
|
||||
- title: Pacemaker Metrics
|
||||
collapse: true
|
||||
height: 200px
|
||||
showTitle: true
|
||||
panels:
|
||||
- title: $Cloud - $Node - Pacemaker General Metrics
|
||||
type: graph
|
||||
legend:
|
||||
alignAsTable: true
|
||||
avg: false
|
||||
current: true
|
||||
max: true
|
||||
min: true
|
||||
rightSide: true
|
||||
show: true
|
||||
total: false
|
||||
values: true
|
||||
nullPointMode: 'null'
|
||||
targets:
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-total_nodes, 'total_nodes')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_nodes, 'online_hosts')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_guests, 'online_guests')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-resource_instances, 'resource_instances')
|
||||
- title: $Cloud - $Node - Pacemaker Resource Total Count
|
||||
type: graph
|
||||
legend:
|
||||
alignAsTable: true
|
||||
avg: false
|
||||
current: true
|
||||
max: true
|
||||
min: true
|
||||
rightSide: true
|
||||
show: true
|
||||
total: false
|
||||
values: true
|
||||
nullPointMode: 'null'
|
||||
targets:
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_total_count, 'cinder_resource_total_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_total_count, 'galera_resource_total_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_total_count, 'haproxy_resource_total_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_total_count, 'ovn_resource_total_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_total_count, 'rabbitmq_resource_total_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_total_count, 'redis_resource_total_count')
|
||||
- title: $Cloud - $Node - Pacemaker Resource Master Count
|
||||
type: graph
|
||||
legend:
|
||||
alignAsTable: true
|
||||
avg: false
|
||||
current: true
|
||||
max: true
|
||||
min: true
|
||||
rightSide: true
|
||||
show: true
|
||||
total: false
|
||||
values: true
|
||||
nullPointMode: 'null'
|
||||
targets:
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_master_count, 'cinder_resource_master_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_master_count, 'galera_resource_master_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_master_count, 'haproxy_resource_master_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_master_count, 'ovn_resource_master_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_master_count, 'rabbitmq_resource_master_count')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_master_count, 'redis_resource_master_count')
|
||||
- title: $Cloud - $Node - Pacemaker Resource Failures
|
||||
type: graph
|
||||
legend:
|
||||
alignAsTable: true
|
||||
avg: false
|
||||
current: true
|
||||
max: true
|
||||
min: true
|
||||
rightSide: true
|
||||
show: true
|
||||
total: false
|
||||
values: true
|
||||
nullPointMode: 'null'
|
||||
targets:
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_failures, 'cinder_resource_failures')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_failures, 'galera_resource_failures')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_failures, 'haproxy_resource_failures')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_failures, 'ovn_resource_failures')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_failures, 'rabbitmq_resource_failures')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_failures, 'redis_resource_failures')
|
||||
- title: $Cloud - $Node - Pacemaker Daemon Status
|
||||
type: graph
|
||||
legend:
|
||||
alignAsTable: true
|
||||
avg: false
|
||||
current: true
|
||||
max: true
|
||||
min: true
|
||||
rightSide: true
|
||||
show: true
|
||||
total: false
|
||||
values: true
|
||||
nullPointMode: 'null'
|
||||
targets:
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-corosync_daemon_status, 'corosync_daemon_status')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pacemaker_daemon_status, 'pacemaker_daemon_status')
|
||||
- target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pcsd_daemon_status, 'pcsd_daemon_status')
|
||||
|
||||
|
@ -26,6 +26,7 @@ ADD files/collectd_ceph_storage.py /usr/local/bin/collectd_ceph_storage.py
|
||||
ADD files/collectd_gnocchi_status.py /usr/local/bin/collectd_gnocchi_status.py
|
||||
ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monitoring.py
|
||||
ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py
|
||||
ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py
|
||||
ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh
|
||||
ADD files/ovn_monitoring.sh /usr/local/bin/ovn_monitoring.sh
|
||||
|
||||
|
@ -0,0 +1,131 @@
|
||||
import collectd
|
||||
import os
|
||||
|
||||
LOG_FILE_PATH = '/var/log/containers/stdouts/collectd_pacemaker.out'
|
||||
PIPE_FILE_PATH = '/collectd_pipe'
|
||||
INTERVAL = 15
|
||||
|
||||
def config_func(config):
    """Apply settings from the plugin's collectd configuration block.

    Only the ``Interval`` key (matched case-insensitively) is recognised: its
    first value is converted to ``int`` and stored in the module-level
    ``INTERVAL``. Every other key is ignored.

    Args:
        config: configuration node whose ``children`` each expose ``key``
            and ``values``.
    """
    global INTERVAL

    for node in config.children:
        if node.key.lower() == 'interval':
            INTERVAL = int(node.values[0])
|
||||
|
||||
# Upper bound on how much of the ever-growing log is re-read on each cycle.
# NOTE(review): assumes a single 'pcs status' report is well under 1 MiB -
# confirm for very large clusters.
_LOG_TAIL_BYTES = 1024 * 1024

# (metric prefix, substring identifying that resource's instance lines).
_RESOURCE_MARKERS = (
    ("haproxy", "haproxy-bundle-"),
    ("galera", "galera-bundle-"),
    ("rabbitmq", "rabbitmq-bundle-"),
    ("redis", "redis-bundle-"),
    ("ovn", "ovn-dbs-bundle-"),
    ("cinder", "openstack-cinder-volume-"),
)

_DAEMONS = ("corosync", "pacemaker", "pcsd")


def _request_status():
    """Write 'pcs status' to the command pipe; return True when it was sent.

    The pipe is opened non-blocking so that a missing reader on the other
    end makes this cycle fail fast instead of blocking the read callback.
    """
    try:
        pipe_fd = os.open(PIPE_FILE_PATH, os.O_WRONLY | os.O_NONBLOCK)
    except OSError as err:
        collectd.warning('pacemaker_monitoring: cannot open %s: %s'
                         % (PIPE_FILE_PATH, err))
        return False
    try:
        os.write(pipe_fd, b'pcs status\n')
    except OSError as err:
        collectd.warning('pacemaker_monitoring: cannot write to %s: %s'
                         % (PIPE_FILE_PATH, err))
        return False
    finally:
        os.close(pipe_fd)
    return True


def _read_latest_status():
    """Return the newest status report from the log, oldest line first.

    The report starts at the last line containing "Cluster name:". Returns
    None when the log cannot be read or holds no such line.
    """
    try:
        with open(LOG_FILE_PATH, 'rb') as log:
            # Only the tail is needed; the file is never rotated by this
            # plugin, so reading all of it would get slower over time.
            log.seek(0, os.SEEK_END)
            log.seek(max(0, log.tell() - _LOG_TAIL_BYTES))
            raw = log.read()
    except (IOError, OSError) as err:
        collectd.warning('pacemaker_monitoring: cannot read %s: %s'
                         % (LOG_FILE_PATH, err))
        return None

    # Decode explicitly so the substring checks compare text with text on
    # both Python 2 and Python 3.
    lines = [chunk.decode('utf-8', 'replace') for chunk in raw.splitlines()]
    for start in range(len(lines) - 1, -1, -1):
        if "Cluster name:" in lines[start]:
            return lines[start:]

    collectd.warning('pacemaker_monitoring: no pcs status output in %s'
                     % LOG_FILE_PATH)
    return None


def _int_before(line, word):
    """Return the integer token that precedes *word* in *line*, else None."""
    tokens = line.split()
    try:
        return int(tokens[tokens.index(word) - 1])
    except (ValueError, IndexError):
        return None


def _bracket_members(line):
    """Count the whitespace-separated names listed inside "[ ... ]"."""
    return len(line.split("[")[1].replace(" ]", "").strip().split())


def _count_failures(status, resource):
    """Count lines naming *resource* between "Failed" and "Daemon Status"."""
    count = 0
    in_failures = False
    for line in status:
        if "Failed" in line:
            in_failures = True
        if in_failures and resource in line:
            count += 1
        if in_failures and "Daemon Status" in line:
            in_failures = False
    return count


def _collect_metrics(status):
    """Turn one status report into an ordered list of (name, value) pairs.

    A value of None means the figure could not be found in the report.
    """
    # The first "nodes configured" line wins, as in the cluster summary.
    total_nodes = None
    for line in status:
        if "nodes configured" in line:
            total_nodes = _int_before(line, "nodes")
            break

    # For the remaining summary figures the last matching line wins. Online
    # counts default to 0 because no such line is printed when nothing of
    # that kind is online.
    online_nodes = 0
    online_guests = 0
    resource_instances = None
    for line in status:
        if "Online: [" in line and "Guest" not in line:
            online_nodes = _bracket_members(line)
        if "GuestOnline: [" in line:
            online_guests = _bracket_members(line)
        if "resource instances configured" in line:
            resource_instances = _int_before(line, "resource")

    metrics = [
        ("total_nodes", total_nodes),
        ("online_nodes", online_nodes),
        ("online_guests", online_guests),
        ("resource_instances", resource_instances),
    ]

    for resource, marker in _RESOURCE_MARKERS:
        total = sum(1 for line in status
                    if marker in line and "Guest" not in line)
        metrics.append((resource + "_resource_total_count", total))

    for resource, marker in _RESOURCE_MARKERS:
        masters = sum(1 for line in status
                      if marker in line and "Master" in line)
        metrics.append((resource + "_resource_master_count", masters))

    for daemon in _DAEMONS:
        active = any(daemon + ":" in line and "active/enabled" in line
                     for line in status)
        metrics.append((daemon + "_daemon_status", int(active)))

    for resource, _marker in _RESOURCE_MARKERS:
        metrics.append((resource + "_resource_failures",
                        _count_failures(status, resource)))

    return metrics


def read_func():
    """collectd read callback: publish gauges parsed from 'pcs status'.

    Requests a fresh 'pcs status' run through the command pipe, then parses
    the most recent report found in the log file and dispatches one gauge
    per metric under the 'pacemaker_monitoring' plugin. Metrics that cannot
    be found in the report are skipped rather than reported with a stale or
    undefined value. Nothing is dispatched when the pipe or the log is
    unavailable.

    NOTE(review): the log is read right after the request is written, so
    the parsed report may be the one from the previous cycle.
    """
    if not _request_status():
        return

    status = _read_latest_status()
    if status is None:
        return

    for name, value in _collect_metrics(status):
        if value is None:
            continue
        metric = collectd.Values()
        metric.plugin = 'pacemaker_monitoring'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = name
        metric.values = [value]
        metric.dispatch()
|
||||
|
||||
collectd.register_config(config_func)
|
||||
collectd.register_read(read_func)
|
Loading…
x
Reference in New Issue
Block a user