From 5e9a7e9087727922d1d6b06439bc72e7767c218d Mon Sep 17 00:00:00 2001
From: Billy Olsen <billy.olsen@gmail.com>
Date: Wed, 24 Mar 2021 15:17:25 -0700
Subject: [PATCH] Disable vrrp healthchecks by default

VRRP healthchecks were enabled by default starting in the 19.07 charm
release for network deployments which utilize l3ha or dvr+snat. The VRRP
healthchecks have specific expectations that may not be satisfied in
various data centers. This leads to problems with networks as failed
healthchecks lead to router failovers.

This change sets the default value of the keepalived-healthcheck-interval
config option to 0, which disables the VRRP healthchecks and requires
users to opt in to using them. The description of the option has been
updated to indicate that enabling the healthchecks may lead to routers
failing over if ICMP pings are missed.

Closes-Bug: #192101
Change-Id: Ie0ebb8072fa802dc8c2478a0b3ca38202d49c85f
---
 config.yaml                         | 16 +++++++++++-----
 unit_tests/test_neutron_contexts.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/config.yaml b/config.yaml
index e454c984..dd63df93 100644
--- a/config.yaml
+++ b/config.yaml
@@ -376,12 +376,18 @@ options:
       is used with neutron. This option is ignored for Train+ OpenStack.
   keepalived-healthcheck-interval:
     type: int
-    default: 30
+    default: 0
     description: |
-      By default all HA routers will check their external network gateway
-      by sending a ping and if that fails they trigger a vrrp transition. This
-      option defines how frequently this check is performed. Setting this value
-      to 0 will disable the healthchecks.
+      Specifies the frequency (in seconds) at which HA routers will check
+      their external network gateway by performing an ICMP ping between the
+      virtual routers. When the ping check fails, this will trigger the HA
+      routers to fail over to another node. A value of 0 will disable this
+      check. This setting only applies when using l3ha and dvr_snat.
+      .
+      WARNING: Enabling the health checks should be done with caution as it
+      may lead to rapid failovers of HA routers. ICMP pings are low priority
+      and may be dropped or take longer than the 1 second afforded by neutron,
+      which leads to routers failing over to other nodes.
   enable-auto-restarts:
     type: boolean
     default: True
diff --git a/unit_tests/test_neutron_contexts.py b/unit_tests/test_neutron_contexts.py
index 8119ab83..0ee15c22 100644
--- a/unit_tests/test_neutron_contexts.py
+++ b/unit_tests/test_neutron_contexts.py
@@ -302,7 +302,7 @@ class TestNeutronGatewayContext(CharmTestCase):
             'nfg_log_output_base': None,
             'nfg_log_rate_limit': None,
             'ovsdb_timeout': 60,
-            'keepalived_healthcheck_interval': 30,
+            'keepalived_healthcheck_interval': 0,
         })
 
     @patch('os.environ.get')