Pacemaker Monitoring

This patch introduces pacemaker monitoring in Browbeat. The 'pcs status' output is scraped and processed through a collectd python plugin, and metrics are visualized in Grafana panels. Change-Id: I587d186a068f721c65f85e6069e4beaa98bacde7
2021-10-07 17:07:54 +05:30 · 2021-10-07 17:07:54 +05:30 · 8ad0bce482
commit 8ad0bce482
parent fde79f74c1
9 changed files with 315 additions and 0 deletions
--- a/ansible/install/group_vars/all.yml
+++ b/ansible/install/group_vars/all.yml
@ -319,6 +319,13 @@ ovs_flows_monitoring: false
 # before enabling this plugin.
 ovn_monitoring: false

+#######################
+# Pacemaker Monitoring
+#######################
+# Monitors pcs status metrics on controller.
+# NOTE(review): the neighbouring ovs_flows_monitoring / ovn_monitoring flags
+# default to false while this one defaults to true - confirm that is intended.
+pacemaker_monitoring: true
+# Rendered into the "Interval" option of the collectd_pacemaker_monitoring
+# module (presumably seconds, like other collectd intervals - TODO confirm).
+pacemaker_controller_collectd_interval: 15
+
 controller_monitored_ints:
  - "tap"

--- a/ansible/install/roles/collectd-openstack/tasks/main.yml
+++ b/ansible/install/roles/collectd-openstack/tasks/main.yml
@ -230,6 +230,8 @@
      dest: /usr/local/bin/collectd_rabbitmq_monitoring.py
    - src: collectd_swift_stat.py
      dest: /usr/local/bin/collectd_swift_stat.py
+    - src: collectd_pacemaker_monitoring.py
+      dest: /usr/local/bin/collectd_pacemaker_monitoring.py
  when: "('Controller' in group_names and inventory_hostname == groups['Controller'][0]) or ('Undercloud' in group_names)"

 - name: Copy python plugins
--- a/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2
+++ b/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2
@ -546,6 +546,19 @@ PreCacheChain "PreCache"
 </Plugin>

 {% endif %}
+
+{# collectd_pacemaker_monitoring.py is only copied to the first controller, so only import it there (same guard as the gnocchi plugin below). #}
+{%if pacemaker_monitoring %}
+{%if inventory_hostname == groups['Controller'][0] %}
+<Plugin python>
+  ModulePath "/usr/local/bin/"
+  Import "collectd_pacemaker_monitoring"
+
+  <Module collectd_pacemaker_monitoring>
+    Interval {{pacemaker_controller_collectd_interval}}
+  </Module>
+</Plugin>
+
+{% endif %}
+{% endif %}
+
 {%if gnocchi_status_controller_collectd_plugin %}
 {%if inventory_hostname == groups['Controller'][0] %}
 <Plugin python>
--- a/ansible/install/roles/collectd/tasks/main.yml
+++ b/ansible/install/roles/collectd/tasks/main.yml
@ -83,6 +83,45 @@
    dest: "{{ browbeat_containers_path }}/collectd-openstack/config/collectd.conf"
  when: "config_type == 'compute' and ovn_compute_collectd_plugin"

+# The FIFO is bind-mounted into the collectd container as /collectd_pipe so the
+# pacemaker plugin can ask the host to run 'pcs status'.
+# 'creates' skips the task when the pipe already exists, so re-runs stay clean;
+# 'become' already provides root, so no explicit sudo is needed.
+- name: Create pipe to use pcs cli inside collectd container
+  command: mkfifo "/home/{{ host_remote_user }}/collectd_pipe"
+  args:
+    creates: "/home/{{ host_remote_user }}/collectd_pipe"
+  become: yes
+  become_user: root
+  ignore_errors: yes
+  when: "config_type == 'controller' and pacemaker_monitoring"
+
+# Ensures the helper script exists, is owned by root and is executable before
+# its content is written by the following task.
+- name: Create script to run eval on pcs pipe
+  file:
+    path: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh"
+    state: touch
+    # Quoted so the value is always read as octal permission bits rather than
+    # depending on the YAML parser's handling of a leading zero.
+    mode: "0775"
+    owner: root
+  become: yes
+  become_user: root
+  ignore_errors: yes
+  when: "config_type == 'controller' and pacemaker_monitoring"
+
+# Writes the host-side loop: it blocks on the FIFO, then evaluates whatever
+# command text was written to it (the collectd plugin writes "pcs status").
+# SECURITY(review): the loop is started as root and eval's the pipe content
+# verbatim, so any process able to write to collectd_pipe (including the
+# collectd container it is mounted into) can run arbitrary root commands on the
+# controller. Consider running a fixed 'pcs status' instead of eval.
+- name: Add content to script to run eval on pcs pipe
+  copy:
+    dest: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh"
+    # Set here as well so the script is executable even if it did not exist yet.
+    owner: root
+    mode: "0775"
+    content: |
+      #!/bin/bash
+      while true; do eval "$(cat /home/{{ host_remote_user }}/collectd_pipe)"; done
+  become: yes
+  become_user: root
+  ignore_errors: yes
+  when: "config_type == 'controller' and pacemaker_monitoring"
+
+# Starts the host-side loop in the background; its stdout/stderr go to the log
+# file that the collectd pacemaker plugin parses.
+# NOTE(review): nothing here stops an already running loop, so re-running the
+# play presumably starts an additional one - confirm.
+- name: Run script to run eval on pcs pipe
+  become: yes
+  become_user: root
+  args:
+    chdir: "/home/{{ host_remote_user }}"
+  shell: |
+    nohup ./collectd_pipe_eval.sh > /var/log/containers/stdouts/collectd_pacemaker.out 2>&1 &
+  when: "config_type == 'controller' and pacemaker_monitoring"
+  ignore_errors: yes
+
 - name: Build and Run container
  block:
    - name: Build collectd-openstack container
@ -104,10 +143,15 @@
          {% if ovs_flows_monitoring %}
          -v /etc/openvswitch/:/etc/openvswitch/ -v /var/run/openvswitch/:/var/run/openvswitch/ \
          {% endif %}
+          {% endif %}
+          {% if config_type == 'controller' %}
          {% if ovn_monitoring %}
          -v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \
          -v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \
          {% endif %}
+          {% if pacemaker_monitoring %}
+          -v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \
+          {% endif %}
          {% endif %}
          collectd-openstack
      become: yes
--- a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2
+++ b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2
@ -538,6 +538,19 @@ PreCacheChain "PreCache"
 </Plugin>

 {% endif %}
+
+{# Load the pcs-status scraper through collectd's python plugin when enabled. #}
+{% if pacemaker_monitoring %}
+<Plugin python>
+  ModulePath "/usr/local/bin/"
+  Import "collectd_pacemaker_monitoring"
+
+  <Module collectd_pacemaker_monitoring>
+    Interval {{ pacemaker_controller_collectd_interval }}
+  </Module>
+</Plugin>
+
+{% endif %}
+
 {%if gnocchi_status_controller_collectd_plugin %}
 {%if inventory_hostname == groups['Controller'][0] %}
 <Plugin python>
--- a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2
+++ b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2
@ -166,6 +166,8 @@ dashboard:
  {% include 'partials/osp_response_times.yaml' %}

  {% include 'partials/ovn_db_tables.yaml' %}
+
+  {% include 'partials/pacemaker_monitoring.yaml' %}
 {% endif %}

 {% include 'partials/ovn_metrics.yaml' %}
--- a/ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml
+++ b/ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml
@ -0,0 +1,102 @@
+    # Dashboard row for the gauges dispatched by the collectd
+    # 'pacemaker_monitoring' plugin (one series per type_instance).
+    - title: Pacemaker Metrics
+      collapse: true
+      height: 200px
+      showTitle: true
+      panels:
+        # Cluster-wide counters parsed from the 'pcs status' summary lines.
+        # Each alias mirrors the metric name it labels.
+        - title: $Cloud - $Node - Pacemaker General Metrics
+          type: graph
+          legend:
+            alignAsTable: true
+            avg: false
+            current: true
+            max: true
+            min: true
+            rightSide: true
+            show: true
+            total: false
+            values: true
+          nullPointMode: 'null'
+          targets:
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-total_nodes, 'total_nodes')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_nodes, 'online_nodes')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_guests, 'online_guests')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-resource_instances, 'resource_instances')
+        # One series per resource: the number of 'pcs status' lines the plugin
+        # matched for that resource's instances (lines mentioning "Guest" are
+        # not counted).
+        - title: $Cloud - $Node - Pacemaker Resource Total Count
+          type: graph
+          legend:
+            alignAsTable: true
+            avg: false
+            current: true
+            max: true
+            min: true
+            rightSide: true
+            show: true
+            total: false
+            values: true
+          nullPointMode: 'null'
+          targets:
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_total_count, 'cinder_resource_total_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_total_count, 'galera_resource_total_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_total_count, 'haproxy_resource_total_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_total_count, 'ovn_resource_total_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_total_count, 'rabbitmq_resource_total_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_total_count, 'redis_resource_total_count')
+        # One series per resource: the number of that resource's instance lines
+        # in 'pcs status' that also contain "Master".
+        - title: $Cloud - $Node - Pacemaker Resource Master Count
+          type: graph
+          legend:
+            alignAsTable: true
+            avg: false
+            current: true
+            max: true
+            min: true
+            rightSide: true
+            show: true
+            total: false
+            values: true
+          nullPointMode: 'null'
+          targets:
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_master_count, 'cinder_resource_master_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_master_count, 'galera_resource_master_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_master_count, 'haproxy_resource_master_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_master_count, 'ovn_resource_master_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_master_count, 'rabbitmq_resource_master_count')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_master_count, 'redis_resource_master_count')
+        # One series per resource: the number of lines naming the resource
+        # between the first line containing "Failed" and the "Daemon Status"
+        # line of the 'pcs status' output.
+        - title: $Cloud - $Node - Pacemaker Resource Failures
+          type: graph
+          legend:
+            alignAsTable: true
+            avg: false
+            current: true
+            max: true
+            min: true
+            rightSide: true
+            show: true
+            total: false
+            values: true
+          nullPointMode: 'null'
+          targets:
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_failures, 'cinder_resource_failures')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_failures, 'galera_resource_failures')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_failures, 'haproxy_resource_failures')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_failures, 'ovn_resource_failures')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_failures, 'rabbitmq_resource_failures')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_failures, 'redis_resource_failures')
+        # One series per daemon: 1 when its 'pcs status' line reports
+        # "active/enabled", otherwise 0.
+        - title: $Cloud - $Node - Pacemaker Daemon Status
+          type: graph
+          legend:
+            alignAsTable: true
+            avg: false
+            current: true
+            max: true
+            min: true
+            rightSide: true
+            show: true
+            total: false
+            values: true
+          nullPointMode: 'null'
+          targets:
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-corosync_daemon_status, 'corosync_daemon_status')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pacemaker_daemon_status, 'pacemaker_daemon_status')
+            - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pcsd_daemon_status, 'pcsd_daemon_status')
+
+
--- a/browbeat-containers/collectd-openstack/Dockerfile
+++ b/browbeat-containers/collectd-openstack/Dockerfile
@ -26,6 +26,7 @@ ADD files/collectd_ceph_storage.py /usr/local/bin/collectd_ceph_storage.py
 ADD files/collectd_gnocchi_status.py /usr/local/bin/collectd_gnocchi_status.py
 ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monitoring.py
 ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py
+ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py
 ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh
 ADD files/ovn_monitoring.sh /usr/local/bin/ovn_monitoring.sh

--- a/browbeat-containers/collectd-openstack/files/collectd_pacemaker_monitoring.py
+++ b/browbeat-containers/collectd-openstack/files/collectd_pacemaker_monitoring.py
@ -0,0 +1,131 @@
+import collectd
+import os
+
+# File the host-side collectd_pipe_eval.sh loop redirects its output to;
+# read_func parses the most recent 'pcs status' report from it.
+LOG_FILE_PATH = '/var/log/containers/stdouts/collectd_pacemaker.out'
+# FIFO read_func writes "pcs status" to (the host's collectd_pipe is
+# bind-mounted at this path inside the collectd container).
+PIPE_FILE_PATH = '/collectd_pipe'
+# Default interval stamped on dispatched metrics; config_func overrides it
+# from the "Interval" option in collectd.conf.
+INTERVAL = 15
+
+def config_func(config):
+    """Apply the <Module collectd_pacemaker_monitoring> options from collectd.conf.
+
+    Only the ``Interval`` key (matched case-insensitively) is recognised: its
+    first value is converted to int and stored in the module-level INTERVAL.
+    Any other key is ignored.
+    """
+    global INTERVAL
+
+    for node in config.children:
+        if node.key.lower() == 'interval':
+            INTERVAL = int(node.values[0])
+
+# Substring identifying each resource's instance lines in 'pcs status'.
+_RESOURCE_MARKERS = {
+    "haproxy": "haproxy-bundle-",
+    "galera": "galera-bundle-",
+    "rabbitmq": "rabbitmq-bundle-",
+    "redis": "redis-bundle-",
+    "ovn": "ovn-dbs-bundle-",
+    "cinder": "openstack-cinder-volume-",
+}
+
+# Gauges dispatched on every read, in dispatch order.
+_COMPONENTS = (
+    "total_nodes", "online_nodes", "online_guests", "resource_instances",
+    "haproxy_resource_total_count", "galera_resource_total_count",
+    "rabbitmq_resource_total_count", "redis_resource_total_count",
+    "ovn_resource_total_count", "cinder_resource_total_count",
+    "haproxy_resource_master_count", "galera_resource_master_count",
+    "rabbitmq_resource_master_count", "redis_resource_master_count",
+    "ovn_resource_master_count", "cinder_resource_master_count",
+    "corosync_daemon_status", "pacemaker_daemon_status", "pcsd_daemon_status",
+    "haproxy_resource_failures", "galera_resource_failures",
+    "rabbitmq_resource_failures", "redis_resource_failures",
+    "ovn_resource_failures", "cinder_resource_failures",
+)
+
+
+def _latest_report(raw_lines):
+    """Return the lines of the newest 'pcs status' report, oldest line first.
+
+    Walks backwards from the end of the log until the "Cluster name:" line that
+    starts a report. If no such line exists, every line is returned. Lines are
+    decoded to text so the substring checks work on Python 2 and Python 3.
+    """
+    report = []
+    for raw in reversed(raw_lines):
+        line = raw.decode('utf-8', 'replace')
+        report.append(line)
+        if "Cluster name:" in line:
+            break
+    report.reverse()
+    return report
+
+
+def _last_line_with(report, needle, exclude=None):
+    """Return the last report line containing `needle` (and not `exclude`), or None."""
+    for line in reversed(report):
+        if needle in line and (exclude is None or exclude not in line):
+            return line
+    return None
+
+
+def _int_before(line, keyword):
+    """Return the integer token preceding `keyword` in `line`, or None if absent."""
+    if line is None:
+        return None
+    tokens = line.split()
+    try:
+        return int(tokens[tokens.index(keyword) - 1])
+    except (ValueError, IndexError):
+        return None
+
+
+def _bracket_len(line):
+    """Return how many names are listed after the "[" in `line`, or None for no line."""
+    if line is None:
+        return None
+    return len(line.split("[")[1].replace(" ]", "").strip().split())
+
+
+def _count_failures(resource, report):
+    """Count lines naming `resource` from the first "Failed" line up to "Daemon Status"."""
+    count = 0
+    in_failures = False
+    for line in report:
+        if "Failed" in line:
+            in_failures = True
+        if in_failures and resource in line:
+            count += 1
+        if in_failures and "Daemon Status" in line:
+            in_failures = False
+    return count
+
+
+def _component_value(component, report):
+    """Return the gauge value for `component`, or None when the report lacks it."""
+    if component == "total_nodes":
+        return _int_before(_last_line_with(report, "nodes configured"), "nodes")
+    if component == "online_nodes":
+        return _bracket_len(_last_line_with(report, "Online: [", exclude="Guest"))
+    if component == "online_guests":
+        return _bracket_len(_last_line_with(report, "GuestOnline: ["))
+    if component == "resource_instances":
+        return _int_before(
+            _last_line_with(report, "resource instances configured"), "resource")
+
+    # Remaining components are named "<resource-or-daemon>_<metric>".
+    name = component.split("_")[0]
+    if "resource_total_count" in component:
+        marker = _RESOURCE_MARKERS[name]
+        return sum(1 for line in report
+                   if marker in line and "Guest" not in line)
+    if "resource_master_count" in component:
+        marker = _RESOURCE_MARKERS[name]
+        return sum(1 for line in report
+                   if marker in line and "Master" in line)
+    if "daemon_status" in component:
+        needle = name + ":"
+        return int(any(needle in line and "active/enabled" in line
+                       for line in report))
+    if "resource_failures" in component:
+        return _count_failures(name, report)
+    return None
+
+
+def read_func():
+    """Request a 'pcs status' run and dispatch gauges parsed from its output.
+
+    Writes the command to PIPE_FILE_PATH, then parses the newest report found
+    in LOG_FILE_PATH and dispatches one 'gauge' value per entry of _COMPONENTS
+    under the 'pacemaker_monitoring' plugin.
+
+    The command is executed by a separate loop on the host, so the newest
+    report in the log may belong to an earlier request or may still be
+    incomplete. A component whose line is missing is therefore skipped for
+    this cycle instead of raising NameError or re-dispatching the value of
+    the previous component.
+    """
+    os.system('''echo "pcs status" > '''+PIPE_FILE_PATH)
+
+    try:
+        with open(LOG_FILE_PATH, 'rb') as f:
+            raw_lines = f.readlines()
+    except (IOError, OSError) as err:
+        # The log only exists once the host-side loop has been started.
+        collectd.warning("pacemaker_monitoring: cannot read %s: %s"
+                         % (LOG_FILE_PATH, err))
+        return
+
+    report = _latest_report(raw_lines)
+    if not report:
+        # Nothing has been written yet; do not report daemons as down.
+        return
+
+    for component in _COMPONENTS:
+        val = _component_value(component, report)
+        if val is None:
+            continue
+
+        metric = collectd.Values()
+        metric.plugin = 'pacemaker_monitoring'
+        metric.interval = INTERVAL
+        metric.type = 'gauge'
+        metric.type_instance = component
+        metric.values = [val]
+        metric.dispatch()
+
+def init_func():
+    """Register the read callback with the configured polling interval.
+
+    Runs as a collectd init callback, i.e. after config_func has had the
+    chance to override INTERVAL, so the plugin is actually polled every
+    INTERVAL seconds instead of at collectd's global interval.
+    """
+    collectd.register_read(read_func, INTERVAL)
+
+
+collectd.register_config(config_func)
+collectd.register_init(init_func)