Pacemaker Monitoring

This patch introduces pacemaker monitoring in Browbeat.
The 'pcs status' output is scraped and processed through
a collectd python plugin, and metrics are visualized in
Grafana panels.

Change-Id: I587d186a068f721c65f85e6069e4beaa98bacde7
This commit is contained in:
Sanjay Chari 2021-10-07 17:07:54 +05:30
parent fde79f74c1
commit 8ad0bce482
9 changed files with 315 additions and 0 deletions

View File

@ -319,6 +319,13 @@ ovs_flows_monitoring: false
# before enabling this plugin.
ovn_monitoring: false
#######################
# Pacemaker Monitoring
#######################
# Monitors pcs status metrics on controller.
pacemaker_monitoring: true
pacemaker_controller_collectd_interval: 15
controller_monitored_ints:
- "tap"

View File

@ -230,6 +230,8 @@
dest: /usr/local/bin/collectd_rabbitmq_monitoring.py
- src: collectd_swift_stat.py
dest: /usr/local/bin/collectd_swift_stat.py
- src: collectd_pacemaker_monitoring.py
dest: /usr/local/bin/collectd_pacemaker_monitoring.py
when: "('Controller' in group_names and inventory_hostname == groups['Controller'][0]) or ('Undercloud' in group_names)"
- name: Copy python plugins

View File

@ -546,6 +546,19 @@ PreCacheChain "PreCache"
</Plugin>
{% endif %}
{%if pacemaker_monitoring %}
<Plugin python>
ModulePath "/usr/local/bin/"
Import "collectd_pacemaker_monitoring"
<Module collectd_pacemaker_monitoring>
Interval {{pacemaker_controller_collectd_interval}}
</Module>
</Plugin>
{% endif %}
{%if gnocchi_status_controller_collectd_plugin %}
{%if inventory_hostname == groups['Controller'][0] %}
<Plugin python>

View File

@ -83,6 +83,45 @@
dest: "{{ browbeat_containers_path }}/collectd-openstack/config/collectd.conf"
when: "config_type == 'compute' and ovn_compute_collectd_plugin"
# Create the named pipe that is later bind-mounted into the collectd
# container as /collectd_pipe. The task already runs as root through
# `become`, so no explicit sudo is needed.
- name: Create pipe to use pcs cli inside collectd container
  command: mkfifo "/home/{{ host_remote_user }}/collectd_pipe"
  args:
    # mkfifo fails when the path already exists; `creates` turns a re-run
    # into a clean skip instead of an ignored error.
    creates: "/home/{{ host_remote_user }}/collectd_pipe"
  become: true
  become_user: root
  # Kept from the original task: pipe creation stays best-effort.
  ignore_errors: true
  when: "config_type == 'controller' and pacemaker_monitoring"
# Install the helper script that loops forever, reading a command from the
# pipe and eval-ing it. A single `copy` both creates the file and sets its
# mode/owner, so no separate (always-"changed") touch task is required.
# NOTE(review): the script evals whatever is written to the pipe with root
# privileges - make sure only trusted processes can write to the pipe.
- name: Create script to run eval on pcs pipe
  copy:
    dest: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh"
    content: |
      #!/bin/sh
      while true; do eval "$(cat /home/{{ host_remote_user }}/collectd_pipe)"; done
    # Quoted so the value is always read as an octal mode string.
    mode: "0775"
    owner: root
  become: true
  become_user: root
  # Kept from the original tasks: script installation stays best-effort.
  ignore_errors: true
  when: "config_type == 'controller' and pacemaker_monitoring"
# Launch collectd_pipe_eval.sh detached (nohup + background) from the remote
# user's home directory, sending its stdout/stderr to
# /var/log/containers/stdouts/collectd_pacemaker.out.
# NOTE(review): nothing here checks for an already-running copy, so re-running
# this task presumably starts a second reader on the pipe and truncates the
# log file - confirm and add a guard if that matters.
- name: Run script to run eval on pcs pipe
  shell: |
    nohup ./collectd_pipe_eval.sh > /var/log/containers/stdouts/collectd_pacemaker.out 2>&1 &
  become: yes
  become_user: root
  ignore_errors: yes
  args:
    chdir: "/home/{{ host_remote_user }}"
  when: "config_type == 'controller' and pacemaker_monitoring"
- name: Build and Run container
block:
- name: Build collectd-openstack container
@ -104,10 +143,15 @@
{% if ovs_flows_monitoring %}
-v /etc/openvswitch/:/etc/openvswitch/ -v /var/run/openvswitch/:/var/run/openvswitch/ \
{% endif %}
{% endif %}
{% if config_type == 'controller' %}
{% if ovn_monitoring %}
-v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \
-v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \
{% endif %}
{% if pacemaker_monitoring %}
-v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \
{% endif %}
{% endif %}
collectd-openstack
become: yes

View File

@ -538,6 +538,19 @@ PreCacheChain "PreCache"
</Plugin>
{% endif %}
{%if pacemaker_monitoring %}
<Plugin python>
ModulePath "/usr/local/bin/"
Import "collectd_pacemaker_monitoring"
<Module collectd_pacemaker_monitoring>
Interval {{pacemaker_controller_collectd_interval}}
</Module>
</Plugin>
{% endif %}
{%if gnocchi_status_controller_collectd_plugin %}
{%if inventory_hostname == groups['Controller'][0] %}
<Plugin python>

View File

@ -166,6 +166,8 @@ dashboard:
{% include 'partials/osp_response_times.yaml' %}
{% include 'partials/ovn_db_tables.yaml' %}
{% include 'partials/pacemaker_monitoring.yaml' %}
{% endif %}
{% include 'partials/ovn_metrics.yaml' %}

View File

@ -0,0 +1,102 @@
# Dashboard row for the gauges published under the `pacemaker_monitoring`
# plugin name. Every alias mirrors the metric's type instance so the legend
# matches the Graphite path.
- title: Pacemaker Metrics
  collapse: true
  height: 200px
  showTitle: true
  panels:
    # Cluster-wide counts: nodes, guests and configured resource instances.
    - title: $Cloud - $Node - Pacemaker General Metrics
      type: graph
      legend:
        alignAsTable: true
        avg: false
        current: true
        max: true
        min: true
        rightSide: true
        show: true
        total: false
        values: true
      nullPointMode: 'null'
      targets:
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-total_nodes, 'total_nodes')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_nodes, 'online_nodes')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_guests, 'online_guests')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-resource_instances, 'resource_instances')
    # Number of instances seen per resource.
    - title: $Cloud - $Node - Pacemaker Resource Total Count
      type: graph
      legend:
        alignAsTable: true
        avg: false
        current: true
        max: true
        min: true
        rightSide: true
        show: true
        total: false
        values: true
      nullPointMode: 'null'
      targets:
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_total_count, 'cinder_resource_total_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_total_count, 'galera_resource_total_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_total_count, 'haproxy_resource_total_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_total_count, 'ovn_resource_total_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_total_count, 'rabbitmq_resource_total_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_total_count, 'redis_resource_total_count')
    # Number of instances per resource that are reported as Master.
    - title: $Cloud - $Node - Pacemaker Resource Master Count
      type: graph
      legend:
        alignAsTable: true
        avg: false
        current: true
        max: true
        min: true
        rightSide: true
        show: true
        total: false
        values: true
      nullPointMode: 'null'
      targets:
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_master_count, 'cinder_resource_master_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_master_count, 'galera_resource_master_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_master_count, 'haproxy_resource_master_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_master_count, 'ovn_resource_master_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_master_count, 'rabbitmq_resource_master_count')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_master_count, 'redis_resource_master_count')
    # Per-resource failure counts.
    - title: $Cloud - $Node - Pacemaker Resource Failures
      type: graph
      legend:
        alignAsTable: true
        avg: false
        current: true
        max: true
        min: true
        rightSide: true
        show: true
        total: false
        values: true
      nullPointMode: 'null'
      targets:
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_failures, 'cinder_resource_failures')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_failures, 'galera_resource_failures')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_failures, 'haproxy_resource_failures')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_failures, 'ovn_resource_failures')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_failures, 'rabbitmq_resource_failures')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_failures, 'redis_resource_failures')
    # Daemon gauges: 1 when the daemon is reported active/enabled, else 0.
    - title: $Cloud - $Node - Pacemaker Daemon Status
      type: graph
      legend:
        alignAsTable: true
        avg: false
        current: true
        max: true
        min: true
        rightSide: true
        show: true
        total: false
        values: true
      nullPointMode: 'null'
      targets:
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-corosync_daemon_status, 'corosync_daemon_status')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pacemaker_daemon_status, 'pacemaker_daemon_status')
        - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pcsd_daemon_status, 'pcsd_daemon_status')

View File

@ -26,6 +26,7 @@ ADD files/collectd_ceph_storage.py /usr/local/bin/collectd_ceph_storage.py
ADD files/collectd_gnocchi_status.py /usr/local/bin/collectd_gnocchi_status.py
ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monitoring.py
ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py
ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py
ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh
ADD files/ovn_monitoring.sh /usr/local/bin/ovn_monitoring.sh

View File

@ -0,0 +1,131 @@
import collectd
import os
LOG_FILE_PATH = '/var/log/containers/stdouts/collectd_pacemaker.out'
PIPE_FILE_PATH = '/collectd_pipe'
INTERVAL = 15
def config_func(config):
    """collectd config callback: apply options from the plugin's Module block.

    Only the ``Interval`` key (matched case-insensitively) is recognised; its
    first value is converted to ``int`` and stored in the module-level
    ``INTERVAL``. Any other key is ignored.

    Args:
        config: object whose ``children`` are nodes exposing ``key`` and
            ``values`` attributes.

    Raises:
        ValueError: if the ``Interval`` value cannot be converted to int.
    """
    global INTERVAL
    for node in config.children:
        if node.key.lower() == 'interval':
            INTERVAL = int(node.values[0])
def _names_in_brackets(line):
    """Count the whitespace-separated names listed after '[' on a line."""
    return len(line.split("[")[1].replace(" ]", "").strip().split())


def _number_before(line, word):
    """Return the integer token that immediately precedes `word` on a line."""
    tokens = line.split()
    return int(tokens[tokens.index(word) - 1])


def _resource_marker(resource):
    """Return the substring that identifies `resource` in the status output."""
    if resource == "ovn":
        return "ovn-dbs-bundle-"
    if resource == "cinder":
        return "openstack-cinder-volume-"
    return resource + "-bundle-"


def read_func():
    """collectd read callback: request `pcs status` and dispatch its metrics.

    Writes the command ``pcs status`` to PIPE_FILE_PATH, then reads
    LOG_FILE_PATH and parses the most recent block of output (everything from
    the last line containing "Cluster name:" to the end of the file). One
    gauge per entry of ``components_list`` is dispatched under the
    ``pacemaker_monitoring`` plugin name. A component whose line is not found
    in the output is reported as 0.

    NOTE(review): the log is read immediately after the request is written,
    so the block parsed may belong to an earlier request - confirm this lag
    is acceptable.
    """
    os.system('echo "pcs status" > ' + PIPE_FILE_PATH)

    # Read as bytes and decode explicitly so the substring checks below
    # compare text with text regardless of interpreter version or locale.
    with open(LOG_FILE_PATH, 'rb') as f:
        full_output = [raw.decode('utf-8', 'replace') for raw in f.readlines()]

    # Walk backwards to the start of the newest status block, then restore
    # chronological order.
    status_lines = []
    for line in reversed(full_output):
        status_lines.append(line)
        if "Cluster name:" in line:
            break
    status_lines.reverse()

    components_list = [
        "total_nodes", "online_nodes", "online_guests", "resource_instances",
        "haproxy_resource_total_count", "galera_resource_total_count",
        "rabbitmq_resource_total_count", "redis_resource_total_count",
        "ovn_resource_total_count", "cinder_resource_total_count",
        "haproxy_resource_master_count", "galera_resource_master_count",
        "rabbitmq_resource_master_count", "redis_resource_master_count",
        "ovn_resource_master_count", "cinder_resource_master_count",
        "corosync_daemon_status", "pacemaker_daemon_status",
        "pcsd_daemon_status",
        "haproxy_resource_failures", "galera_resource_failures",
        "rabbitmq_resource_failures", "redis_resource_failures",
        "ovn_resource_failures", "cinder_resource_failures",
    ]

    for component in components_list:
        # Reset for every component so a missing line can neither raise
        # UnboundLocalError nor leak the previous component's value.
        val = 0

        if component == "total_nodes":
            for line in status_lines:
                if "nodes configured" in line:
                    val = _number_before(line, "nodes")
                    break
        elif component == "online_nodes":
            for line in status_lines:
                if "Online: [" in line and "Guest" not in line:
                    val = _names_in_brackets(line)
        elif component == "online_guests":
            for line in status_lines:
                if "GuestOnline: [" in line:
                    val = _names_in_brackets(line)
        elif component == "resource_instances":
            for line in status_lines:
                if "resource instances configured" in line:
                    val = _number_before(line, "resource")
        elif "resource_total_count" in component:
            marker = _resource_marker(component.split("_")[0])
            val = sum(1 for line in status_lines
                      if marker in line and "Guest" not in line)
        elif "resource_master_count" in component:
            marker = _resource_marker(component.split("_")[0])
            val = sum(1 for line in status_lines
                      if marker in line and "Master" in line)
        elif "daemon_status" in component:
            daemon = component.split("_")[0]
            if any(daemon + ":" in line and "active/enabled" in line
                   for line in status_lines):
                val = 1
        elif "resource_failures" in component:
            resource = component.split("_")[0]
            # Count lines naming the resource between the first line
            # containing "Failed" and the "Daemon Status" line.
            in_failed_section = False
            for line in status_lines:
                if "Failed" in line:
                    in_failed_section = True
                if in_failed_section and resource in line:
                    val += 1
                if in_failed_section and "Daemon Status" in line:
                    in_failed_section = False

        metric = collectd.Values()
        metric.plugin = 'pacemaker_monitoring'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = component
        metric.values = [val]
        metric.dispatch()
collectd.register_config(config_func)
collectd.register_read(read_func)