Converge the logstash pipelines and enhance memory backed queues

The multi-logstash pipeline setup, while amazingly fast, was crashing
and causing index errors when under high load for a long period of time.
Because of the crashing behavior and the fact that the folks from
Elastic describe multi-pipeline queues to be "beta" at this time the
logstash pipelines have been converted back into a single pipeline.

The memory backed queue options are now limited by a ram disk (tmpfs)
which will ensure that a burst within the queue does not cause OOM
issues, ensuring a highly performant deployment while limiting memory
usage at the same time. Memory backed queues will be enabled when the
underlying system is using "rotational" media as detected by ansible
facts. This will ensure a fast and consistent experience across all
deployment types.

Pipeline/ml/template/dashboard setup has been added to the beat
configurations which will ensure beats are properly configured even
when running in an isolated deployment and outside of normal operations
where beats are generally configured on the first data node.

Change-Id: Ie3c775f98b14f71bcbed05db9cb1c5aa46d9c436
Signed-off-by: Kevin Carter <kevin.carter@rackspace.com>
This commit is contained in:
Kevin Carter 2018-09-13 16:30:11 -05:00
parent be70a2078c
commit 0d4a4a92c7
No known key found for this signature in database
GPG Key ID: 9443251A787B9FB3
26 changed files with 595 additions and 697 deletions

View File

@ -3,6 +3,10 @@
scm: git
src: https://git.openstack.org/openstack/ansible-role-systemd_service
version: master
- name: systemd_mount
scm: git
src: https://git.openstack.org/openstack/ansible-role-systemd_mount
version: master
- name: config_template
scm: git
src: https://git.openstack.org/openstack/ansible-config_template

View File

@ -68,6 +68,14 @@ if [[ ! -d "${ANSIBLE_EMBED_HOME}/repositories/roles/systemd_service" ]]; then
popd
fi
if [[ ! -d "${ANSIBLE_EMBED_HOME}/repositories/roles/systemd_mount" ]]; then
mkdir -p "${ANSIBLE_EMBED_HOME}/repositories"
git clone https://git.openstack.org/openstack/ansible-role-systemd_mount "${ANSIBLE_EMBED_HOME}/repositories/roles/systemd_mount"
pushd "${ANSIBLE_EMBED_HOME}/repositories/roles/systemd_mount"
git checkout 0cca0b06e20a4e3d2b6b4ca19172717b6b37b68a # HEAD of master from 20-06-18
popd
fi
if [[ -f "/etc/openstack_deploy/openstack_inventory.json" ]]; then
if [[ ! -f "${ANSIBLE_EMBED_HOME}/inventory/openstack_inventory.sh" ]]; then
mkdir -p "${ANSIBLE_EMBED_HOME}/inventory"

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'apm-server.host=localhost:8200'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
environment:

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'output.logstash.enabled=false'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
environment:

View File

@ -54,10 +54,10 @@
elastic_heap_size_default: "{{ _elastic_heap_size_default }}"
elastic_log_rotate_path: "/var/log/{{ service_name }}"
- name: Configure systcl vm.max_map_count=262144 on elastic hosts
- name: Configure sysctl vm.max_map_count=524288 on elastic hosts
sysctl:
name: "vm.max_map_count"
value: "262144"
value: "524288"
state: "present"
reload: "yes"
delegate_to: "{{ physical_host }}"

View File

@ -11,5 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Set memory fact to half
_elastic_heap_size_default: "{{ ((h_mem | int) > 30720) | ternary(30720, h_mem) }}"
# The heap size is set using half of the total memory available with
# a cap of 32GiB. If the total available memory is less than 32GiB a buffer of
# 10% will be used to ensure the underlying system is not starved of memory.
_elastic_heap_size_default: "{{ ((h_mem | int) > 30720) | ternary(30720, ((h_mem | int) - ((h_mem | int) * 0.1))) }}"

View File

@ -11,5 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Set quarter memory fact
_elastic_heap_size_default: "{{ ((q_mem | int) > 30720) | ternary(30720, q_mem) }}"
# The heap size is set using a quarter of the total memory available with
# a cap of 32GiB. If the total available memory is less than 32GiB a buffer of
# 10% will be used to ensure the underlying system is not starved of memory.
_elastic_heap_size_default: "{{ ((q_mem | int) > 30720) | ternary(30720, ((q_mem | int) - ((q_mem | int) * 0.1))) }}"

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'output.logstash.enabled=false'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
environment:

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'output.logstash.enabled=false'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
until: templates is success

View File

@ -27,9 +27,13 @@
# {{ item }}
# -E 'output.logstash.enabled=false'
# -E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
# -E 'setup.template.enabled=true'
# -E 'setup.template.overwrite=true'
# -e -v
# with_items:
# - "--template"
# - "--pipelines"
# - "--machine-learning"
# - "--dashboards"
# register: templates
# until: templates is success

View File

@ -26,11 +26,15 @@ elastic_log_rotate_path: "/var/log/logstash"
# Enable direct syslog input into logstash. When this is enabled syslog messages
# can be sent directly to logstash via TCP or UDP.
logstash_syslog_input_enabled: false
# The typical syslog port is 514 however that is not available to logstash
# because it's a "privledged" port. For this reason 1514 is used as the default.
# because it's a "privileged" port. For this reason 5140 is used as the default.
# Changing this port to 514 will require overrides to the service files making
# logstash run as root (not recommended).
logstash_syslog_input_port: 1514
logstash_syslog_input_port: 5140
# Protocol used when the syslog input is enabled. Modes are "tcp" or "udp".
logstash_syslog_input_mode: udp
logstash_beat_input_port: 5044
logstash_deploy_filters: true
@ -75,3 +79,10 @@ logstash_arcsight_smart_connectors: []
# - host: 127.0.0.1
# port: 5000
logstash_arcsight_event_brokers: []
## The logstash queue type can be set to "memory" or "persisted". If the queue
## type is set to memory a ramdisk will be created limiting the in memory queue
## to 50% of the JVM heap size. When this option is undefined the playbook will
## detect the media type where the queue will exist. If the media type is
## "rotational" in memory queues will be used.
# logstash_queue_type:

View File

@ -62,6 +62,69 @@
notify:
- Enable and restart logstash
# Auto-detect the logstash queue type when the deployer has not set
# `logstash_queue_type` explicitly. Rotational media gets an in-memory
# queue (persisted queues on spinning disks are too slow); non-rotational
# media keeps the persisted queue.
- name: Check queue type
block:
# Resolve the device backing /var/lib/logstash (e.g. /dev/sda1).
- name: Get block device for logstash
command: findmnt -no SOURCE --target=/var/lib/logstash
changed_when: false
register: _logstash_block_device
# Strip the path and a single trailing partition digit to get the parent
# disk name, then read its "rotational" flag from ansible_devices.
# NOTE(review): regex_replace('[0-9]$','') removes only ONE trailing
# digit; multi-digit partitions (sda12), nvme names (nvme0n1p1) and
# LVM/mapper sources will not resolve and fall into the rescue below —
# TODO confirm this fallback is acceptable for those layouts.
- name: Set persisted queue fact
set_fact:
logstash_queue_type: "{{ ((ansible_devices[_logstash_block_device.stdout.split('/')[-1] | regex_replace('[0-9]$','')]['rotational'] | int) != 1) | ternary('persisted', 'memory') }}"
rescue:
# Detection failed (unknown device name); default to the memory queue.
- name: Set persisted queue fact (fallback)
set_fact:
logstash_queue_type: memory
when:
# Only auto-detect when the deployer has not overridden the queue type.
- logstash_queue_type is undefined
# When the memory queue type is in use, back /var/lib/logstash/queue with a
# tmpfs ramdisk sized to half of q_mem (quarter of total system memory, in
# MiB) so queue bursts are bounded and cannot OOM the host.
- name: Systemd memory backed queue block
block:
# The tmpfs must be owned by the logstash user/group; look up the
# numeric IDs for the mount options below (systemd hosts only).
- name: Get logstash UID
command: id -u logstash
register: logstash_uid
changed_when: false
when:
- ansible_service_mgr == 'systemd'
- name: Get logstash GID
command: id -g logstash
register: logstash_gid
changed_when: false
when:
- ansible_service_mgr == 'systemd'
# On systemd hosts, manage the tmpfs via the systemd_mount role so unit
# ordering (Before: logstash.service) guarantees the mount exists before
# logstash starts.
- name: Run the systemd mount role
include_role:
name: systemd_mount
private: true
vars:
systemd_mounts:
- what: "tmpfs"
where: "/var/lib/logstash/queue"
type: "tmpfs"
options: "size={{ (q_mem | int) // 2 }}m,uid={{ logstash_uid.stdout }},gid={{ logstash_gid.stdout }},nodev,nodiratime,noatime"
unit:
Before:
- logstash.service
state: 'started'
enabled: true
when:
- ansible_service_mgr == 'systemd'
# Non-systemd hosts fall back to a plain fstab-managed tmpfs mount.
# NOTE(review): this branch sets no uid/gid options — confirm the
# logstash user can write to the mount on such hosts.
- name: Apply fstab options for memory queues
mount:
path: /var/lib/logstash/queue
src: tmpfs
fstype: tmpfs
opts: size={{ (q_mem | int) // 2 }}m
state: mounted
when:
- ansible_service_mgr != 'systemd'
when:
# The whole block only applies to in-memory queues.
- logstash_queue_type == 'memory'
- name: Create patterns directory
file:
name: "/opt/logstash/patterns"

View File

@ -1 +0,0 @@
../../../templates/systemd.general-overrides.conf.j2

View File

@ -1 +0,0 @@
../../../templates/logstash-pipelines.yml.j2

View File

@ -39,9 +39,10 @@ path.data: /var/lib/logstash
# This defaults to the number of the host's CPU cores.
#
{% set _h_processors = ((ansible_processor_count | int) // 2) %}
{% set _processors = ((_h_processors | int) > 0) | ternary(_h_processors, 1) %}
{% set processors = ((_processors | int) > 8) | ternary(8, _processors) %}
{% set _d_processors = ((ansible_processor_count | int) * 3) %}
{% set _processors = ((_d_processors | int) > 0) | ternary(_d_processors, 2) %}
{% set _t_processors = (_processors | int) + (ansible_processor_count | int) %}
{% set processors = ((_t_processors | int) > 64) | ternary(64, _t_processors) %}
pipeline.workers: {{ processors | int }}
#
# How many events to retrieve from inputs before sending to filters+workers
@ -51,7 +52,7 @@ pipeline.batch.size: 256
# How long to wait in milliseconds while polling for the next event
# before dispatching an undersized batch to filters+outputs
#
pipeline.batch.delay: 20
pipeline.batch.delay: 64
#
# Force Logstash to exit during shutdown even if there are still inflight
# events in memory. By default, logstash will refuse to quit until all
@ -155,7 +156,15 @@ queue.type: persisted
# whichever criteria is reached first
# Default is 1024mb or 1gb
#
{% if logstash_queue_type == 'memory' %}
# An in memory queue is being used. The actual size of the queue is 90% of the
# total memory limit, which is set using 50% of the heap size.
{% set _memory_queue_size = ((q_mem | int) // 2) %}
{% set _memory_queue_size_buffer = (((_memory_queue_size | int) * 0.1) | int) %}
queue.max_bytes: {{ (_memory_queue_size | int) - (_memory_queue_size_buffer | int) }}mb
{% else %}
queue.max_bytes: {{ logstash_queue_size }}mb
{% endif %}
#
# If using queue.type: persisted, the maximum number of acked events before forcing a checkpoint
# Default is 1024, 0 for unlimited
@ -231,7 +240,7 @@ xpack.monitoring.enabled: true
#xpack.monitoring.elasticsearch.ssl.verification_mode: certificate
#xpack.monitoring.elasticsearch.sniffing: false
xpack.monitoring.collection.interval: 30s
#xpack.monitoring.collection.pipeline.details.enabled: true
xpack.monitoring.collection.pipeline.details.enabled: true
#
# ------------ X-Pack Settings (not applicable for OSS build)--------------
# X-Pack Management

View File

@ -0,0 +1,2 @@
[Unit]
Requires = logstash-mem-queue.service

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'output.logstash.enabled=false'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
environment:

View File

@ -19,9 +19,13 @@
{{ item }}
-E 'output.logstash.enabled=false'
-E 'output.elasticsearch.hosts={{ coordination_nodes | to_json }}'
-E 'setup.template.enabled=true'
-E 'setup.template.overwrite=true'
-e -v
with_items:
- "--template"
- "--pipelines"
- "--machine-learning"
- "--dashboards"
register: templates
environment:

View File

@ -18,3 +18,13 @@ elastic_log_rotate_path: "/var/log/elasticsearch"
temp_dir: /var/lib/elasticsearch/tmp
nfs_query: "[?fstype=='nfs' || fstype=='nfs4']"
# Enable or Disable memory locking.
elastic_memory_lock: true
# Elasticsearch plugin list. These plugins will be re-installed whenever the
# playbooks are executed, which ensures the plugins are always upgraded.
elastic_plugins:
- ingest-attachment
- ingest-geoip
- ingest-user-agent

View File

@ -0,0 +1,26 @@
---
# Copyright 2018, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Remove each configured plugin before re-installing it. This
# remove-then-install cycle runs on every playbook execution so plugins are
# always upgraded alongside elasticsearch (see the elastic_plugins comment
# in defaults). failed_when: false makes removal best-effort — the plugin
# may not be installed yet — while changed_when reports a change only when
# the remove command actually succeeded (rc == 0).
- name: Remove plugins
command: "/usr/share/elasticsearch/bin/elasticsearch-plugin remove --verbose {{ item }}"
failed_when: false
changed_when:
- remove_plugin.rc == 0
register: remove_plugin
with_items: "{{ elastic_plugins }}"
# Re-install each plugin; --batch suppresses interactive permission
# prompts. NOTE(review): this task always reports "changed" — acceptable
# for an intentional reinstall, but worth confirming.
- name: Install plugins
command: "/usr/share/elasticsearch/bin/elasticsearch-plugin install --batch --verbose {{ item }}"
with_items: "{{ elastic_plugins }}"

View File

@ -107,3 +107,5 @@
when:
- elastic_shared_fs_repos is defined
- (elastic_shared_fs_repos | json_query(nfs_query)) | length > 0
- include_tasks: "elasticsearch_plugins.yml"

View File

@ -3,6 +3,8 @@ cluster.name: {{ cluster_name }}
# ------------------------------------ Node ------------------------------------
node.name: {{ ansible_nodename }}
# node.rack: r1
# Set to true to enable machine learning on the node.
node.ml: false
# ----------------------------------- Paths ------------------------------------
# Path to directory where to store the data (separate multiple locations by comma):
#
@ -36,7 +38,7 @@ index.store.type: niofs
#
# Lock the memory on startup:
#
bootstrap.memory_lock: false
bootstrap.memory_lock: {{ elastic_memory_lock }}
#
# Make sure that the `ES_HEAP_SIZE` environment variable is set to about half the memory
# available on the system and that the owner of the process is allowed to use this limit.
@ -102,7 +104,7 @@ gateway.recover_after_nodes: {{ ((master_node_count | int) // 2) + 1 }}
#
# Require explicit names when deleting indices:
#
# action.destructive_requires_name: true
action.destructive_requires_name: true
{% set processors = ((elastic_thread_pool_size | int) > 0) | ternary(elastic_thread_pool_size, 1) %}
{% if not (elastic_coordination_node | default(false)) | bool %}
@ -140,3 +142,5 @@ indices.recovery.max_bytes_per_sec: {{ elasticserch_interface_speed }}mb
# https://www.elastic.co/guide/en/elasticsearch/reference/6.3/monitoring-settings.html
xpack.monitoring.collection.enabled: true
xpack.monitoring.collection.interval: 30s
# Set to true to enable machine learning on the node.
xpack.ml.enabled: false

View File

@ -4,3 +4,11 @@
ExecStart=
# This runs our ExecStart as an override.
ExecStart=/usr/share/elasticsearch/bin/elasticsearch -p ${PID_DIR}/elasticsearch.pid
{% if elastic_memory_lock | bool %}
# Limit memory usage
LimitMEMLOCK=infinity
{% endif %}
# Number of File Descriptors
LimitNOFILE=131070

View File

@ -128,7 +128,7 @@ output.logstash:
# The maximum number of events to bulk in a single Logstash request. The
# default is the number of cores multiplied by the number of threads,
# the resultant is then multiplied again by 256 which results in a the defined
# the resultant is then multiplied again by 128 which results in the defined
# bulk max size. If the Beat sends single events, the events are collected
# into batches. If the Beat publishes a large batch of events (larger than
# the value specified by bulk_max_size), the batch is split. Specifying a
@ -139,7 +139,7 @@ output.logstash:
# less than or equal to 0 disables the splitting of batches. When splitting
# is disabled, the queue decides on the number of events to be contained in a
# batch.
bulk_max_size: {{ (processors | int) * 256 }}
bulk_max_size: {{ (processors | int) * 128 }}
{% if named_index is defined %}
# Optional index name. The default index name is set to {{ named_index }}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
---
# Option to define third memory
q_mem: "{{ (ansible_memtotal_mb | int) // 3 }}"
# Option to define quarter memory
q_mem: "{{ (ansible_memtotal_mb | int) // 4 }}"
# Option to define half memory
h_mem: "{{ (ansible_memtotal_mb | int) // 2 }}"
@ -12,7 +12,8 @@ apm_port: 8200
elastic_port: 9200
elastic_hap_port: 9201
logstash_beat_input_port: 5044
logstash_syslog_input_port: 1514
logstash_syslog_input_port: 5140
logstash_syslog_input_mode: udp
kibana_port: 5601
kibana_nginx_port: 81