From 8ad0bce48238bbe0bb6bca3ac7bb9c85d4861ed1 Mon Sep 17 00:00:00 2001 From: Sanjay Chari Date: Thu, 7 Oct 2021 17:07:54 +0530 Subject: [PATCH] Pacemaker Monitoring This patch introduces pacemaker monitoring in Browbeat. The 'pcs status' output is scraped and processed through a collectd python plugin, and metrics are visualized in Grafana panels. Change-Id: I587d186a068f721c65f85e6069e4beaa98bacde7 --- ansible/install/group_vars/all.yml | 7 + .../roles/collectd-openstack/tasks/main.yml | 2 + .../templates/controller.collectd.conf.j2 | 13 ++ ansible/install/roles/collectd/tasks/main.yml | 44 ++++++ .../templates/controller.collectd.conf.j2 | 13 ++ ...enstack_general_system_performance.yaml.j2 | 2 + .../partials/pacemaker_monitoring.yaml | 102 ++++++++++++++ .../collectd-openstack/Dockerfile | 1 + .../files/collectd_pacemaker_monitoring.py | 131 ++++++++++++++++++ 9 files changed, 315 insertions(+) create mode 100644 ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml create mode 100644 browbeat-containers/collectd-openstack/files/collectd_pacemaker_monitoring.py diff --git a/ansible/install/group_vars/all.yml b/ansible/install/group_vars/all.yml index 5851d06ce..c2a0cc590 100644 --- a/ansible/install/group_vars/all.yml +++ b/ansible/install/group_vars/all.yml @@ -319,6 +319,13 @@ ovs_flows_monitoring: false # before enabling this plugin. ovn_monitoring: false +####################### +# Pacemaker Monitoring +####################### +# Monitors pcs status metrics on controller. 
+pacemaker_monitoring: true +pacemaker_controller_collectd_interval: 15 + controller_monitored_ints: - "tap" diff --git a/ansible/install/roles/collectd-openstack/tasks/main.yml b/ansible/install/roles/collectd-openstack/tasks/main.yml index 5e4825a4c..f4b3d96dc 100644 --- a/ansible/install/roles/collectd-openstack/tasks/main.yml +++ b/ansible/install/roles/collectd-openstack/tasks/main.yml @@ -230,6 +230,8 @@ dest: /usr/local/bin/collectd_rabbitmq_monitoring.py - src: collectd_swift_stat.py dest: /usr/local/bin/collectd_swift_stat.py + - src: collectd_pacemaker_monitoring.py + dest: /usr/local/bin/collectd_pacemaker_monitoring.py when: "('Controller' in group_names and inventory_hostname == groups['Controller'][0]) or ('Undercloud' in group_names)" - name: Copy python plugins diff --git a/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2 b/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2 index beee1fc75..b37ab3a20 100644 --- a/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2 +++ b/ansible/install/roles/collectd-openstack/templates/controller.collectd.conf.j2 @@ -546,6 +546,19 @@ PreCacheChain "PreCache" {% endif %} + +{%if pacemaker_monitoring %} + + ModulePath "/usr/local/bin/" + Import "collectd_pacemaker_monitoring" + + + Interval {{pacemaker_controller_collectd_interval}} + + + +{% endif %} + {%if gnocchi_status_controller_collectd_plugin %} {%if inventory_hostname == groups['Controller'][0] %} diff --git a/ansible/install/roles/collectd/tasks/main.yml b/ansible/install/roles/collectd/tasks/main.yml index b30f14db9..66ccc2d4f 100644 --- a/ansible/install/roles/collectd/tasks/main.yml +++ b/ansible/install/roles/collectd/tasks/main.yml @@ -83,6 +83,45 @@ dest: "{{ browbeat_containers_path }}/collectd-openstack/config/collectd.conf" when: "config_type == 'compute' and ovn_compute_collectd_plugin" +- name: Create pipe for to use pcs cli inside collectd container + shell: | + 
sudo mkfifo /home/"{{ host_remote_user }}"/collectd_pipe + become: yes + become_user: root + ignore_errors: yes + when: "config_type == 'controller' and pacemaker_monitoring" + +- name: Create script to run eval on pcs pipe + file: + path: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh" + state: touch + mode: 0775 + owner: root + become: yes + become_user: root + ignore_errors: yes + when: "config_type == 'controller' and pacemaker_monitoring" + +- name: Add content to script to run eval on pcs pipe + copy: + dest: "/home/{{ host_remote_user }}/collectd_pipe_eval.sh" + content: | + while true; do eval "$(cat /home/{{ host_remote_user }}/collectd_pipe)"; done + become: yes + become_user: root + ignore_errors: yes + when: "config_type == 'controller' and pacemaker_monitoring" + +- name: Run script to run eval on pcs pipe + shell: | + nohup ./collectd_pipe_eval.sh > /var/log/containers/stdouts/collectd_pacemaker.out 2>&1 & + become: yes + become_user: root + ignore_errors: yes + args: + chdir: "/home/{{ host_remote_user }}" + when: "config_type == 'controller' and pacemaker_monitoring" + - name: Build and Run container block: - name: Build collectd-openstack container @@ -104,10 +143,15 @@ {% if ovs_flows_monitoring %} -v /etc/openvswitch/:/etc/openvswitch/ -v /var/run/openvswitch/:/var/run/openvswitch/ \ {% endif %} + {% endif %} + {% if config_type == 'controller' %} {% if ovn_monitoring %} -v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \ -v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \ {% endif %} + {% if pacemaker_monitoring %} + -v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \ + {% endif %} {% endif %} collectd-openstack become: yes diff --git a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 index 0942e21f7..2d401d5a3 100644 --- 
a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 +++ b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 @@ -538,6 +538,19 @@ PreCacheChain "PreCache" {% endif %} + +{%if pacemaker_monitoring %} + + ModulePath "/usr/local/bin/" + Import "collectd_pacemaker_monitoring" + + + Interval {{pacemaker_controller_collectd_interval}} + + + +{% endif %} + {%if gnocchi_status_controller_collectd_plugin %} {%if inventory_hostname == groups['Controller'][0] %} diff --git a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 index 61b3c758a..52d246d28 100644 --- a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 +++ b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 @@ -166,6 +166,8 @@ dashboard: {% include 'partials/osp_response_times.yaml' %} {% include 'partials/ovn_db_tables.yaml' %} + + {% include 'partials/pacemaker_monitoring.yaml' %} {% endif %} {% include 'partials/ovn_metrics.yaml' %} diff --git a/ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml b/ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml new file mode 100644 index 000000000..78379bdc6 --- /dev/null +++ b/ansible/install/roles/grafana-dashboards/templates/partials/pacemaker_monitoring.yaml @@ -0,0 +1,102 @@ + - title: Pacemaker Metrics + collapse: true + height: 200px + showTitle: true + panels: + - title: $Cloud - $Node - Pacemaker General Metrics + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-total_nodes, 'total_nodes') + - target: 
alias($Cloud.$Node.pacemaker_monitoring.gauge-online_nodes, 'online_hosts') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-online_guests, 'online_guests') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-resource_instances, 'resource_instances') + - title: $Cloud - $Node - Pacemaker Resource Total Count + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_total_count, 'cinder_resource_total_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_total_count, 'galera_resource_total_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_total_count, 'haproxy_resource_total_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_total_count, 'ovn_resource_total_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_total_count, 'rabbitmq_resource_total_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_total_count, 'redis_resource_total_count') + - title: $Cloud - $Node - Pacemaker Resource Master Count + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_master_count, 'cinder_resource_master_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_master_count, 'galera_resource_master_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_master_count, 'haproxy_resource_master_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_master_count, 'ovn_resource_master_count') + - target: 
alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_master_count, 'rabbitmq_resource_master_count') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_master_count, 'redis_resource_master_count') + - title: $Cloud - $Node - Pacemaker Resource Failures + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-cinder_resource_failures, 'cinder_resource_failures') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-galera_resource_failures, 'galera_resource_failures') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-haproxy_resource_failures, 'haproxy_resource_failures') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-ovn_resource_failures, 'ovn_resource_failures') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-rabbitmq_resource_failures, 'rabbitmq_resource_failures') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-redis_resource_failures, 'redis_resource_failures') + - title: $Cloud - $Node - Pacemaker Daemon Status + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-corosync_daemon_status, 'corosync_daemon_status') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pacemaker_daemon_status, 'pacemaker_daemon_status') + - target: alias($Cloud.$Node.pacemaker_monitoring.gauge-pcsd_daemon_status, 'pcsd_daemon_status') + + diff --git a/browbeat-containers/collectd-openstack/Dockerfile b/browbeat-containers/collectd-openstack/Dockerfile index 3f7e06c44..648d58f3e 100644 --- a/browbeat-containers/collectd-openstack/Dockerfile +++ b/browbeat-containers/collectd-openstack/Dockerfile @@ -26,6 +26,7 @@ 
"""collectd Python plugin that reports Pacemaker metrics from ``pcs status``.

The pcs CLI is reached from inside the collectd container through a named
pipe: every read writes the command ``pcs status`` to PIPE_FILE_PATH, and the
output of that command is expected to be appended to LOG_FILE_PATH, which is
then parsed into gauge values.
"""
import collectd
import io
import os

LOG_FILE_PATH = '/var/log/containers/stdouts/collectd_pacemaker.out'
PIPE_FILE_PATH = '/collectd_pipe'
INTERVAL = 15

# (metric prefix, substring that identifies the resource in `pcs status`).
# The order is the order in which the per-resource metrics are dispatched.
RESOURCE_PATTERNS = (
    ("haproxy", "haproxy-bundle-"),
    ("galera", "galera-bundle-"),
    ("rabbitmq", "rabbitmq-bundle-"),
    ("redis", "redis-bundle-"),
    ("ovn", "ovn-dbs-bundle-"),
    ("cinder", "openstack-cinder-volume-"),
)
DAEMONS = ("corosync", "pacemaker", "pcsd")


def config_func(config):
    """Read the plugin configuration block.

    Only the ``Interval`` key (case-insensitive) is recognised; its first
    value replaces the module-level INTERVAL that is stamped on every
    dispatched metric.
    """
    global INTERVAL

    for node in config.children:
        if node.key.lower() == 'interval':
            INTERVAL = int(node.values[0])


def _request_status():
    """Write ``pcs status`` to the command pipe; return True on success."""
    pipe_fd = None
    try:
        # No O_CREAT: a missing FIFO must be reported, not silently replaced
        # by a regular file that nothing will ever read.
        pipe_fd = os.open(PIPE_FILE_PATH, os.O_WRONLY)
        os.write(pipe_fd, b"pcs status\n")
    except OSError as err:
        collectd.warning('pacemaker_monitoring: cannot write to {}: {}'
                         .format(PIPE_FILE_PATH, err))
        return False
    finally:
        if pipe_fd is not None:
            os.close(pipe_fd)
    return True


def _latest_status_lines():
    """Return the lines of the most recent ``pcs status`` output in the log.

    The log is streamed and the buffer restarts at every "Cluster name:"
    line, so only the last status report is held in memory. If no such line
    exists the whole file is returned.
    """
    block = []
    try:
        # Text mode is required: the parser matches against str literals.
        # io.open gives the same encoding/errors handling on Python 2 and 3.
        with io.open(LOG_FILE_PATH, 'r', encoding='utf-8',
                     errors='replace') as log:
            for line in log:
                if "Cluster name:" in line:
                    block = []
                block.append(line)
    except (IOError, OSError) as err:
        collectd.warning('pacemaker_monitoring: cannot read {}: {}'
                         .format(LOG_FILE_PATH, err))
        return []
    return block


def _int_before(line, token):
    """Return the integer word that precedes *token* in *line*, or None."""
    words = line.split()
    try:
        position = words.index(token)
        if position == 0:
            return None
        return int(words[position - 1])
    except ValueError:
        return None


def _bracket_members(line):
    """Return the names listed between '[' and ']' in an "Online: [" line."""
    return line.split("[", 1)[1].split("]", 1)[0].split()


def _parse_status(lines):
    """Turn one ``pcs status`` report into an ordered list of metrics.

    *lines* is the report in chronological order. Returns a list of
    ``(type_instance, value)`` pairs. Every metric defaults to 0 when the
    report has no matching line, so a missing section can never inherit the
    value computed for another metric.
    """
    total_nodes = None
    online_nodes = 0
    online_guests = 0
    resource_instances = 0
    in_failures = False
    failure_lines = []

    for line in lines:
        if total_nodes is None and "nodes configured" in line:
            total_nodes = _int_before(line, "nodes")
        if "Online: [" in line and "Guest" not in line:
            online_nodes = len(_bracket_members(line))
        if "GuestOnline: [" in line:
            online_guests = len(_bracket_members(line))
        if "resource instances configured" in line:
            count = _int_before(line, "resource")
            if count is not None:
                resource_instances = count

        # The failure section runs from the first line containing "Failed"
        # up to and including the "Daemon Status" line.
        if "Failed" in line:
            in_failures = True
        if in_failures:
            failure_lines.append(line)
            if "Daemon Status" in line:
                in_failures = False

    metrics = [
        ("total_nodes", total_nodes or 0),
        ("online_nodes", online_nodes),
        ("online_guests", online_guests),
        ("resource_instances", resource_instances),
    ]

    for name, pattern in RESOURCE_PATTERNS:
        total = sum(1 for line in lines
                    if pattern in line and "Guest" not in line)
        metrics.append((name + "_resource_total_count", total))

    for name, pattern in RESOURCE_PATTERNS:
        masters = sum(1 for line in lines
                      if pattern in line and "Master" in line)
        metrics.append((name + "_resource_master_count", masters))

    for daemon in DAEMONS:
        active = any(daemon + ":" in line and "active/enabled" in line
                     for line in lines)
        metrics.append((daemon + "_daemon_status", 1 if active else 0))

    for name, _ in RESOURCE_PATTERNS:
        failures = sum(1 for line in failure_lines if name in line)
        metrics.append((name + "_resource_failures", failures))

    return metrics


def read_func():
    """collectd read callback: request, parse and dispatch pcs metrics.

    Nothing is dispatched when the command pipe cannot be written or when
    the log is unreadable or empty. The log is read immediately after the
    request is sent, so it may still hold the output of an earlier request.
    """
    if not _request_status():
        return

    status_lines = _latest_status_lines()
    if not status_lines:
        return

    for component, val in _parse_status(status_lines):
        metric = collectd.Values()
        metric.plugin = 'pacemaker_monitoring'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = component
        metric.values = [val]
        metric.dispatch()


collectd.register_config(config_func)
collectd.register_read(read_func)