From ddbdfec778ac3d71986889c40d6709a7c247e169 Mon Sep 17 00:00:00 2001 From: Vladimir Ushakov Date: Tue, 30 Aug 2016 10:55:47 +0300 Subject: [PATCH] Add tests for RabbitMQ memory and disk alarms Change-Id: Ia1e178cbeace91e978444cd44a5e0dcf0c08d7fd --- stacklight_tests/toolchain/api.py | 39 +++++-- stacklight_tests/toolchain/test_alarms.py | 121 +++++++++++++++++++--- 2 files changed, 141 insertions(+), 19 deletions(-) diff --git a/stacklight_tests/toolchain/api.py b/stacklight_tests/toolchain/api.py index f4f67a9..1dc7f5e 100644 --- a/stacklight_tests/toolchain/api.py +++ b/stacklight_tests/toolchain/api.py @@ -324,20 +324,45 @@ class ToolchainApi(object): self.helpers.check_notifications(notification_list, cinder_event_types) - def check_alarms(self, alarm_type, source, hostname, value, + def check_alarms(self, alarm_type, filter_value, source, hostname, value, time_interval="now() - 5m"): + filter_by = "node_role" + if alarm_type == "service": + filter_by = "service" query = ( - "select last(value) from {} where time >= {} and source = '{}' " - "and hostname = '{}' and value = {}".format( - "{}_status".format(alarm_type), time_interval, source, - hostname, value)) + "select last(value) from {select_from} where time >= {time}" + " and source = '{source}' and {filter} and hostname = '{hostname}'" + " and value = {value}".format( + select_from="{}_status".format(alarm_type), time=time_interval, + source=source, hostname=hostname, value=value, + filter="{} = '{}'".format(filter_by, filter_value))) def check_result(): result = self.INFLUXDB_GRAFANA.do_influxdb_query( query=query).json()["results"][0] return len(result) - msg = "Alarm with source {} and value {} was not triggered".format( - source, value) + msg = ("Alarm with source {} and {} {} and value {} was" + " not triggered".format(source, filter_by, filter_value, value)) devops_helpers.wait(check_result, timeout=60 * 5, interval=10, timeout_msg=msg) + + def get_rabbitmq_memory_usage(self, interval="now() - 
5m"): + query = ("select last(value) from rabbitmq_used_memory " + "where time >= {interval}".format(interval=interval)) + result = self.INFLUXDB_GRAFANA.do_influxdb_query(query=query).json() + return result["results"][0]["series"][0]["values"][0][1] + + def set_rabbitmq_memory_watermark(self, controller, limit, timeout=5 * 60): + def check_result(): + with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote: + exec_res = remote.execute( + "rabbitmqctl set_vm_memory_high_watermark {}".format( + limit)) + if exec_res['exit_code'] == 0: + return True + else: + return False + msg = "Failed to set vm_memory_high_watermark to {}".format(limit) + devops_helpers.wait(check_result, timeout=timeout, + interval=10, timeout_msg=msg) diff --git a/stacklight_tests/toolchain/test_alarms.py b/stacklight_tests/toolchain/test_alarms.py index 8e3899b..c6e4600 100644 --- a/stacklight_tests/toolchain/test_alarms.py +++ b/stacklight_tests/toolchain/test_alarms.py @@ -23,6 +23,10 @@ WARNING_STATUS = 1 CRITICAL_STATUS = 3 WARNING_PERCENT = 91 CRITICAL_PERCENT = 96 +RABBITMQ_DISK_WARNING_PERCENT = 99.99 +RABBITMQ_DISK_CRITICAL_PERCENT = 100 +RABBITMQ_MEMORY_WARNING_VALUE = 1.01 +RABBITMQ_MEMORY_CRITICAL_VALUE = 1.0001 @test(groups=["plugins"]) @@ -31,26 +35,60 @@ class TestToolchainAlarms(api.ToolchainApi): """ def _check_filesystem_alarms(self, nailgun_node, filesystem, source, - filename, alarm_type="node"): - self.check_alarms( - alarm_type, source, nailgun_node["hostname"], OKAY_STATUS) + filename, node_role, alarm_type="node"): + self.check_alarms(alarm_type, node_role, source, + nailgun_node["hostname"], OKAY_STATUS) with self.fuel_web.get_ssh_for_nailgun_node(nailgun_node) as remote: self.remote_ops.fill_up_filesystem( remote, filesystem, WARNING_PERCENT, filename) logger.info("Checking {}-warning alarm".format(source)) - self.check_alarms( - alarm_type, source, nailgun_node["hostname"], WARNING_STATUS) + self.check_alarms(alarm_type, node_role, source, + 
nailgun_node["hostname"], WARNING_STATUS) self.remote_ops.clean_filesystem(remote, filename) - self.check_alarms( - alarm_type, source, nailgun_node["hostname"], OKAY_STATUS) + self.check_alarms(alarm_type, node_role, + source, nailgun_node["hostname"], OKAY_STATUS) self.remote_ops.fill_up_filesystem( remote, filesystem, CRITICAL_PERCENT, filename) logger.info("Checking {}-critical alarm".format(source)) - self.check_alarms( - alarm_type, source, nailgun_node["hostname"], CRITICAL_STATUS) + self.check_alarms(alarm_type, node_role, source, + nailgun_node["hostname"], CRITICAL_STATUS) self.remote_ops.clean_filesystem(remote, filename) - self.check_alarms( - alarm_type, source, nailgun_node["hostname"], OKAY_STATUS) + self.check_alarms(alarm_type, node_role, source, + nailgun_node["hostname"], OKAY_STATUS) + + def _check_rabbit_mq_disk_alarms(self, controller, status, percent): + cmd = ("rabbitmqctl set_disk_free_limit $(df | grep /dev/dm-4 | " + "awk '{{ printf(\"%.0f\\n\", 1024 * ((($3 + $4) * " + "{percent} / 100) - $3))}}')") + self.check_alarms("service", "rabbitmq", "disk", + controller["hostname"], OKAY_STATUS) + with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote: + default_value = remote.check_call( + "rabbitmqctl environment | grep disk_free_limit | " + "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip() + remote.check_call(cmd.format(percent=percent)) + self.check_alarms("service", "rabbitmq", "disk", + controller["hostname"], status) + remote.check_call("rabbitmqctl set_disk_free_limit {}".format( + default_value)) + self.check_alarms("service", "rabbitmq", "disk", + controller["hostname"], OKAY_STATUS) + + def _check_rabbit_mq_memory_alarms(self, controller, status, value): + cmd = "rabbitmqctl set_vm_memory_high_watermark absolute \"{memory}\"" + self.check_alarms("service", "rabbitmq", "memory", + controller["hostname"], OKAY_STATUS) + with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote: + default_value = 
remote.check_call(
+                "rabbitmqctl environment | grep vm_memory_high_watermark | "
+                "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
+            mem_usage = self.get_rabbitmq_memory_usage()
+            remote.check_call(cmd.format(memory=int(mem_usage * value)))
+            self.check_alarms("service", "rabbitmq", "memory",
+                              controller["hostname"], status)
+            self.set_rabbitmq_memory_watermark(controller, default_value)
+            self.check_alarms("service", "rabbitmq", "memory",
+                              controller["hostname"], OKAY_STATUS)

 @test(depends_on_groups=["deploy_toolchain"],
       groups=["check_mysql_fs_alarms", "toolchain", "alarms"])
 @log_snapshot_after_test
@@ -74,4 +112,63 @@ class TestToolchainAlarms(api.ToolchainApi):
             self.helpers.cluster_id, ["controller"])[0]
         self._check_filesystem_alarms(
             controller, "/dev/mapper/mysql-root", "mysql-fs",
-            "/var/lib/mysql/test/bigfile")
+            "/var/lib/mysql/test/bigfile", "mysql-nodes")
+
+    @test(depends_on_groups=["deploy_toolchain"],
+          groups=["check_rabbitmq_disk_alarm", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_disk_alarm(self):
+        """Check that rabbitmq-disk-limit-warning and
+        rabbitmq-disk-limit-critical alarms work as expected.
+
+        Scenario:
+            1. Check the last value of the okay alarm in InfluxDB.
+            2. Set RabbitMQ disk limit to 99.99 percent of available space.
+            3. Check the last value of the warning alarm in InfluxDB.
+            4. Set RabbitMQ disk limit to the default value.
+            5. Check the last value of the okay alarm in InfluxDB.
+            6. Set RabbitMQ disk limit to 100 percent of available space.
+            7. Check the last value of the critical alarm in InfluxDB.
+            8. Set RabbitMQ disk limit to the default value.
+            9. Check the last value of the okay alarm in InfluxDB. 
+ + Duration 10m + """ + self.env.revert_snapshot("deploy_toolchain") + controller = self.fuel_web.get_nailgun_cluster_nodes_by_roles( + self.helpers.cluster_id, ["controller"])[0] + self._check_rabbit_mq_disk_alarms(controller, WARNING_STATUS, + RABBITMQ_DISK_WARNING_PERCENT) + self._check_rabbit_mq_disk_alarms(controller, CRITICAL_STATUS, + RABBITMQ_DISK_CRITICAL_PERCENT) + + @test(depends_on_groups=["deploy_toolchain"], + groups=["check_rabbitmq_memory_alarm", "toolchain", + "alarms"]) + @log_snapshot_after_test + def check_rabbitmq_memory_alarm(self): + """Check that rabbitmq-memory-limit-warning and + rabbitmq-memory-limit-critical alarms work as expected. + + Scenario: + 1. Check the last value of the okay alarm in InfluxDB. + 2. Set RabbitMQ memory limit to 101 percent of currently + used memory. + 3. Check the last value of the warning alarm in InfluxDB. + 4. Set RabbitMQ memory limit to the default value. + 5. Check the last value of the okay alarm in InfluxDB. + 6. Set RabbitMQ memory limit to 100.01 percent of currently + used memory. + 7. Check the last value of the critical alarm in InfluxDB. + 8. Set RabbitMQ memory limit to the default value. + 9. Check the last value of the okay alarm in InfluxDB. + + Duration 10m + """ + self.env.revert_snapshot("deploy_toolchain") + controller = self.fuel_web.get_nailgun_cluster_nodes_by_roles( + self.helpers.cluster_id, ["controller"])[0] + self._check_rabbit_mq_memory_alarms(controller, WARNING_STATUS, + RABBITMQ_MEMORY_WARNING_VALUE) + self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS, + RABBITMQ_MEMORY_CRITICAL_VALUE)