From 09d0409ed4a69f514355925754e832e752f817ca Mon Sep 17 00:00:00 2001 From: Michal Arbet Date: Fri, 26 Feb 2021 17:50:31 +0100 Subject: [PATCH] Allow user to set sysctl_net_ipv4_tcp_retries2 This patch adds a configuration option to set the kernel option sysctl_net_ipv4_tcp_retries2. More information about this kernel option is available in [1][2], along with the Red Hat suggestion [3] to set it for DBs and HA. [1]: https://pracucci.com/linux-tcp-rto-min-max-and-tcp-retries2.html [2]: https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ [3]: https://access.redhat.com/solutions/726753 Closes-Bug: #1917068 Change-Id: Ia0decbbfa4e33b1889b635f8bb1c9094567a2ce6 --- ansible/roles/haproxy/defaults/main.yml | 4 ++ ansible/roles/haproxy/tasks/config-host.yml | 7 +-- .../high-availability/haproxy-guide.rst | 47 +++++++++++++++++++ .../reference/high-availability/index.rst | 10 ++++ doc/source/reference/index.rst | 1 + ...die-after-VIP-switch-5f9e811783c36041.yaml | 13 +++++ 6 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 doc/source/reference/high-availability/haproxy-guide.rst create mode 100644 doc/source/reference/high-availability/index.rst create mode 100644 releasenotes/notes/fix-TCP-connections-refusing-to-die-after-VIP-switch-5f9e811783c36041.yaml diff --git a/ansible/roles/haproxy/defaults/main.yml b/ansible/roles/haproxy/defaults/main.yml index ca5a3be975..c596e03152 100644 --- a/ansible/roles/haproxy/defaults/main.yml +++ b/ansible/roles/haproxy/defaults/main.yml @@ -90,4 +90,8 @@ haproxy_check_timeout: "10s" # Check http://www.haproxy.org/download/1.5/doc/configuration.txt for available options haproxy_defaults_balance: "roundrobin" +# Avoid TCP connections refusing to die after VIP switch +# https://bugs.launchpad.net/kolla-ansible/+bug/1917068 +haproxy_host_ipv4_tcp_retries2: "KOLLA_UNSET" + kolla_externally_managed_cert: False diff --git a/ansible/roles/haproxy/tasks/config-host.yml b/ansible/roles/haproxy/tasks/config-host.yml index 
cad68d2c16..46b262c7a4 100644 --- a/ansible/roles/haproxy/tasks/config-host.yml +++ b/ansible/roles/haproxy/tasks/config-host.yml @@ -10,9 +10,10 @@ sysctl_file: "{{ kolla_sysctl_conf_path }}" become: true with_items: - - { name: "net.ipv4.ip_nonlocal_bind", value: 1} - - { name: "net.ipv6.ip_nonlocal_bind", value: 1} - - { name: "net.unix.max_dgram_qlen", value: 128} + - { name: "net.ipv4.ip_nonlocal_bind", value: 1 } + - { name: "net.ipv6.ip_nonlocal_bind", value: 1 } + - { name: "net.ipv4.tcp_retries2", value: "{{ haproxy_host_ipv4_tcp_retries2 }}" } + - { name: "net.unix.max_dgram_qlen", value: 128 } when: - set_sysctl | bool - item.value != 'KOLLA_SKIP' diff --git a/doc/source/reference/high-availability/haproxy-guide.rst b/doc/source/reference/high-availability/haproxy-guide.rst new file mode 100644 index 0000000000..ae7d90171e --- /dev/null +++ b/doc/source/reference/high-availability/haproxy-guide.rst @@ -0,0 +1,47 @@ +.. _haproxy-guide: + +============= +HAProxy Guide +============= + +Kolla Ansible supports a Highly Available (HA) deployment of +OpenStack and other services. High-availability in Kolla +is implemented via Keepalived and HAProxy. Keepalived manages virtual IP +addresses, while HAProxy load-balances traffic to service backends. +These two components must be installed on the same hosts +and they are deployed to hosts in the ``haproxy`` group. + +Preparation and deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +HAProxy and Keepalived are enabled by default. They may be disabled by +setting the following in ``/etc/kolla/globals.yml``: + +.. code-block:: yaml + + enable_haproxy: "no" + enable_keepalived: "no" + +Configuration +~~~~~~~~~~~~~ + +Failover tuning +--------------- + +When a VIP fails over from one host to another, hosts may take some +time to detect that the connection has been dropped. This can lead +to service downtime. 
+ +To reduce the time taken by the kernel to close dead connections to the VIP +address, modify the ``net.ipv4.tcp_retries2`` kernel option by setting +the following in ``/etc/kolla/globals.yml``: + +.. code-block:: yaml + + haproxy_host_ipv4_tcp_retries2: 6 + +This is especially helpful for connections to MariaDB. See +`here <https://pracucci.com/linux-tcp-rto-min-max-and-tcp-retries2.html>`__, +`here <https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/>`__ and +`here <https://access.redhat.com/solutions/726753>`__ for +further information about this kernel option. diff --git a/doc/source/reference/high-availability/index.rst b/doc/source/reference/high-availability/index.rst new file mode 100644 index 0000000000..176b859572 --- /dev/null +++ b/doc/source/reference/high-availability/index.rst @@ -0,0 +1,10 @@ +================= +High-availability +================= + +This section describes high-availability configuration of services. + +.. toctree:: + :maxdepth: 1 + + haproxy-guide diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 358aef91e6..c6631cfd08 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -17,3 +17,4 @@ Projects Deployment Configuration Reference message-queues/index deployment-config/index deployment-and-bootstrapping/index + high-availability/index diff --git a/releasenotes/notes/fix-TCP-connections-refusing-to-die-after-VIP-switch-5f9e811783c36041.yaml b/releasenotes/notes/fix-TCP-connections-refusing-to-die-after-VIP-switch-5f9e811783c36041.yaml new file mode 100644 index 0000000000..185ba8eb83 --- /dev/null +++ b/releasenotes/notes/fix-TCP-connections-refusing-to-die-after-VIP-switch-5f9e811783c36041.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Added a new haproxy configuration variable, + ``haproxy_host_ipv4_tcp_retries2``, + which allows users to modify this kernel option. + This option sets the maximum number of times a TCP packet is retransmitted + in established state before giving up. The default kernel value is 15, + which corresponds to a duration of approximately 13 to 30 + minutes, depending on the retransmission timeout. 
This variable can be used + to mitigate an issue with stuck connections in case of VIP failover, + see `bug 1917068 <https://bugs.launchpad.net/kolla-ansible/+bug/1917068>`__ + for details.