Add tests for RabbitMQ memory and disk alarms

Change-Id: Ia1e178cbeace91e978444cd44a5e0dcf0c08d7fd
2016-08-30 10:55:47 +03:00 · 2016-08-30 10:55:47 +03:00 · ddbdfec778
commit ddbdfec778
parent 70fe115643
2 changed files with 141 additions and 19 deletions
--- a/stacklight_tests/toolchain/api.py
+++ b/stacklight_tests/toolchain/api.py
@ -324,20 +324,45 @@ class ToolchainApi(object):
        self.helpers.check_notifications(notification_list,
                                         cinder_event_types)

-    def check_alarms(self, alarm_type, source, hostname, value,
+    def check_alarms(self, alarm_type, filter_value, source, hostname, value,
                     time_interval="now() - 5m"):
+        """Wait until InfluxDB returns a matching alarm status point.
+
+        The query selects the last value from the "<alarm_type>_status"
+        measurement, restricted to points not older than time_interval
+        that carry the given source, hostname and value. When alarm_type
+        is "service" the points are also filtered by the "service" tag,
+        otherwise by the "node_role" tag; the tag must equal filter_value.
+
+        The query is retried through devops_helpers.wait every 10 seconds
+        for up to 5 minutes; msg is passed as the timeout message.
+        """
+        filter_by = "node_role"
+        if alarm_type == "service":
+            filter_by = "service"
        query = (
-            "select last(value) from {} where time >= {} and source = '{}' "
-            "and hostname = '{}' and value = {}".format(
-                "{}_status".format(alarm_type), time_interval, source,
-                hostname, value))
+            "select last(value) from {select_from} where time >= {time}"
+            " and source = '{source}' and {filter} and hostname = '{hostname}'"
+            " and value = {value}".format(
+                select_from="{}_status".format(alarm_type), time=time_interval,
+                source=source, hostname=hostname, value=value,
+                filter="{} = '{}'".format(filter_by, filter_value)))

        def check_result():
            result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                query=query).json()["results"][0]
+            # The length of the first "results" entry is used as the wait
+            # condition. NOTE(review): presumably that entry is empty when
+            # no point matched - confirm against the InfluxDB version in use.
            return len(result)

-        msg = "Alarm with source {} and value {} was not triggered".format(
-            source, value)
+        msg = ("Alarm with source {} and {} {} and value {} was"
+               " not triggered".format(source, filter_by, filter_value, value))
        devops_helpers.wait(check_result, timeout=60 * 5,
                            interval=10, timeout_msg=msg)
+
+    def get_rabbitmq_memory_usage(self, interval="now() - 5m"):
+        """Return the last rabbitmq_used_memory value stored in InfluxDB.
+
+        interval is inserted verbatim as the lower time bound of the query.
+        The returned value is the second column of the first row of the
+        first series in the first result.
+        """
+        influx_query = (
+            "select last(value) from rabbitmq_used_memory "
+            "where time >= {interval}"
+        ).format(interval=interval)
+        response = self.INFLUXDB_GRAFANA.do_influxdb_query(query=influx_query)
+        payload = response.json()
+        first_series = payload["results"][0]["series"][0]
+        first_row = first_series["values"][0]
+        return first_row[1]
+
+    def set_rabbitmq_memory_watermark(self, controller, limit, timeout=5 * 60):
+        """Set the RabbitMQ vm_memory_high_watermark on a controller.
+
+        Runs "rabbitmqctl set_vm_memory_high_watermark <limit>" over SSH on
+        the given nailgun node and retries every 10 seconds until the
+        command exits with code 0. devops_helpers.wait is given timeout
+        (in seconds) and a message naming the limit that could not be set.
+        """
+        def check_result():
+            # A new SSH session is opened for every attempt.
+            with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
+                exec_res = remote.execute(
+                    "rabbitmqctl set_vm_memory_high_watermark {}".format(
+                        limit))
+                return exec_res['exit_code'] == 0
+        msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
+        devops_helpers.wait(check_result, timeout=timeout,
+                            interval=10, timeout_msg=msg)
--- a/stacklight_tests/toolchain/test_alarms.py
+++ b/stacklight_tests/toolchain/test_alarms.py
@ -23,6 +23,10 @@ WARNING_STATUS = 1
 CRITICAL_STATUS = 3
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
+# Percentages used to derive the RabbitMQ disk_free_limit: the limit is set to
+# (used + available) * percent / 100 - used, taken from df and converted
+# from 1K blocks to bytes.
+RABBITMQ_DISK_WARNING_PERCENT = 99.99
+RABBITMQ_DISK_CRITICAL_PERCENT = 100
+# Multipliers applied to the currently used RabbitMQ memory to compute the
+# absolute vm_memory_high_watermark.
+RABBITMQ_MEMORY_WARNING_VALUE = 1.01
+RABBITMQ_MEMORY_CRITICAL_VALUE = 1.0001


@test(groups=["plugins"])
@ -31,26 +35,60 @@ class TestToolchainAlarms(api.ToolchainApi):
    """

    def _check_filesystem_alarms(self, nailgun_node, filesystem, source,
-                                 filename, alarm_type="node"):
-        self.check_alarms(
-            alarm_type, source, nailgun_node["hostname"], OKAY_STATUS)
+                                 filename, node_role, alarm_type="node"):
+        """Walk a filesystem alarm through okay/warning/okay/critical/okay.
+
+        After an initial OKAY_STATUS check, remote_ops.fill_up_filesystem is
+        called with WARNING_PERCENT and then with CRITICAL_PERCENT, each
+        followed by remote_ops.clean_filesystem. Every state is verified
+        with check_alarms for the node's hostname, passing node_role as the
+        filter value.
+        """
+        self.check_alarms(alarm_type, node_role, source,
+                          nailgun_node["hostname"], OKAY_STATUS)
        with self.fuel_web.get_ssh_for_nailgun_node(nailgun_node) as remote:
            self.remote_ops.fill_up_filesystem(
                remote, filesystem, WARNING_PERCENT, filename)
            logger.info("Checking {}-warning alarm".format(source))
-            self.check_alarms(
-                alarm_type, source, nailgun_node["hostname"], WARNING_STATUS)
+            self.check_alarms(alarm_type, node_role, source,
+                              nailgun_node["hostname"], WARNING_STATUS)
            self.remote_ops.clean_filesystem(remote, filename)
-            self.check_alarms(
-                alarm_type, source, nailgun_node["hostname"], OKAY_STATUS)
+            self.check_alarms(alarm_type, node_role,
+                              source, nailgun_node["hostname"], OKAY_STATUS)
            self.remote_ops.fill_up_filesystem(
                remote, filesystem, CRITICAL_PERCENT, filename)
            logger.info("Checking {}-critical alarm".format(source))
-            self.check_alarms(
-                alarm_type, source, nailgun_node["hostname"], CRITICAL_STATUS)
+            self.check_alarms(alarm_type, node_role, source,
+                              nailgun_node["hostname"], CRITICAL_STATUS)
            self.remote_ops.clean_filesystem(remote, filename)
-            self.check_alarms(
-                alarm_type, source, nailgun_node["hostname"], OKAY_STATUS)
+            self.check_alarms(alarm_type, node_role, source,
+                              nailgun_node["hostname"], OKAY_STATUS)
+
+    def _check_rabbit_mq_disk_alarms(self, controller, status, percent):
+        """Trigger the RabbitMQ disk alarm and verify that it clears again.
+
+        The alarm must be okay first. The current disk_free_limit is then
+        read from "rabbitmqctl environment", the limit is raised to a value
+        computed from df and percent, and the alarm is expected to reach
+        status. Finally the saved limit is restored and the alarm must go
+        back to okay.
+        """
+        def expect(expected_status):
+            self.check_alarms("service", "rabbitmq", "disk",
+                              controller["hostname"], expected_status)
+
+        raise_limit_template = (
+            "rabbitmqctl set_disk_free_limit $(df | grep /dev/dm-4 | "
+            "awk '{{ printf(\"%.0f\\n\", 1024 * ((($3 + $4) * "
+            "{percent} / 100) - $3))}}')")
+        read_limit_cmd = (
+            "rabbitmqctl environment | grep disk_free_limit | "
+            "sed -r 's/}.+//' | sed 's|.*,||'")
+        restore_template = "rabbitmqctl set_disk_free_limit {}"
+
+        expect(OKAY_STATUS)
+        with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
+            first_line = remote.check_call(read_limit_cmd)['stdout'][0]
+            saved_limit = first_line.rstrip()
+            remote.check_call(raise_limit_template.format(percent=percent))
+            expect(status)
+            remote.check_call(restore_template.format(saved_limit))
+            expect(OKAY_STATUS)
+
+    def _check_rabbit_mq_memory_alarms(self, controller, status, value):
+        """Trigger the RabbitMQ memory alarm and verify that it clears again.
+
+        The alarm must be okay first. The configured
+        vm_memory_high_watermark is then saved, the watermark is set to an
+        absolute amount equal to the memory usage reported by InfluxDB
+        multiplied by value, and the alarm is expected to reach status.
+        Finally the saved watermark is restored and the alarm must go back
+        to okay.
+        """
+        cmd = "rabbitmqctl set_vm_memory_high_watermark absolute \"{memory}\""
+        self.check_alarms("service", "rabbitmq", "memory",
+                          controller["hostname"], OKAY_STATUS)
+        with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
+            # Save the memory watermark itself: the value is handed back to
+            # set_vm_memory_high_watermark below, so reading disk_free_limit
+            # here would restore the wrong setting. The trailing comma in
+            # the grep pattern skips vm_memory_high_watermark_paging_ratio.
+            # NOTE(review): assumes the watermark is configured as a plain
+            # fraction, not as an {absolute, N} tuple - confirm.
+            default_value = remote.check_call(
+                "rabbitmqctl environment | "
+                "grep 'vm_memory_high_watermark,' | "
+                "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
+            mem_usage = self.get_rabbitmq_memory_usage()
+            remote.check_call(cmd.format(memory=int(mem_usage * value)))
+            self.check_alarms("service", "rabbitmq", "memory",
+                              controller["hostname"], status)
+            self.set_rabbitmq_memory_watermark(controller, default_value)
+            self.check_alarms("service", "rabbitmq", "memory",
+                              controller["hostname"], OKAY_STATUS)

    @test(depends_on_groups=["deploy_toolchain"],
          groups=["check_mysql_fs_alarms", "toolchain", "alarms"])
@ -74,4 +112,63 @@ class TestToolchainAlarms(api.ToolchainApi):
            self.helpers.cluster_id, ["controller"])[0]
        self._check_filesystem_alarms(
            controller, "/dev/mapper/mysql-root", "mysql-fs",
-            "/var/lib/mysql/test/bigfile")
+            "/var/lib/mysql/test/bigfile", "mysql-nodes")
+
+    @test(depends_on_groups=["deploy_toolchain"],
+          groups=["check_rabbitmq_disk_alarm", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_disk_alarm(self):
+        """Check that rabbitmq-disk-limit-warning and
+        rabbitmq-disk-limit-critical alarms work as expected.
+
+        Scenario:
+            1. Check the last value of the okay alarm in InfluxDB.
+            2. Set RabbitMQ disk limit to 99.99 percent of available space.
+            3. Check the last value of the warning alarm in InfluxDB.
+            4. Set RabbitMQ disk limit to the default value.
+            5. Check the last value of the okay alarm in InfluxDB.
+            6. Set RabbitMQ disk limit to 100 percent of available space.
+            7. Check the last value of the critical alarm in InfluxDB.
+            8. Set RabbitMQ disk limit to the default value.
+            9. Check the last value of the okay alarm in InfluxDB.
+
+        Duration 10m
+        """
+        self.env.revert_snapshot("deploy_toolchain")
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+        target = controllers[0]
+        # Warning first, then critical; each helper call restores the limit.
+        stages = (
+            (WARNING_STATUS, RABBITMQ_DISK_WARNING_PERCENT),
+            (CRITICAL_STATUS, RABBITMQ_DISK_CRITICAL_PERCENT),
+        )
+        for expected_status, percent in stages:
+            self._check_rabbit_mq_disk_alarms(
+                target, expected_status, percent)
+
+    @test(depends_on_groups=["deploy_toolchain"],
+          groups=["check_rabbitmq_memory_alarm", "toolchain",
+                  "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_memory_alarm(self):
+        """Check that rabbitmq-memory-limit-warning and
+        rabbitmq-memory-limit-critical alarms work as expected.
+
+        Scenario:
+            1. Check the last value of the okay alarm in InfluxDB.
+            2. Set RabbitMQ memory limit to 101 percent of currently
+            used memory.
+            3. Check the last value of the warning alarm in InfluxDB.
+            4. Set RabbitMQ memory limit to the default value.
+            5. Check the last value of the okay alarm in InfluxDB.
+            6. Set RabbitMQ memory limit to 100.01 percent of currently
+            used memory.
+            7. Check the last value of the critical alarm in InfluxDB.
+            8. Set RabbitMQ memory limit to the default value.
+            9. Check the last value of the okay alarm in InfluxDB.
+
+        Duration 10m
+        """
+        self.env.revert_snapshot("deploy_toolchain")
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+        target = controllers[0]
+        # Warning first, then critical; each helper call restores the limit.
+        stages = (
+            (WARNING_STATUS, RABBITMQ_MEMORY_WARNING_VALUE),
+            (CRITICAL_STATUS, RABBITMQ_MEMORY_CRITICAL_VALUE),
+        )
+        for expected_status, multiplier in stages:
+            self._check_rabbit_mq_memory_alarms(
+                target, expected_status, multiplier)