Add tests for RabbitMQ memory and disk alarms

Change-Id: Ia1e178cbeace91e978444cd44a5e0dcf0c08d7fd
This commit is contained in:
Vladimir Ushakov 2016-08-30 10:55:47 +03:00 committed by Simon Pasquier
parent 70fe115643
commit ddbdfec778
2 changed files with 141 additions and 19 deletions

View File

@ -324,20 +324,45 @@ class ToolchainApi(object):
self.helpers.check_notifications(notification_list,
cinder_event_types)
def check_alarms(self, alarm_type, filter_value, source, hostname, value,
                 time_interval="now() - 5m"):
    """Wait until InfluxDB holds the expected alarm status.

    Polls the ``<alarm_type>_status`` measurement until a point matching
    the given source, hostname, value and filter tag is returned.

    :param alarm_type: prefix of the queried measurement. ``"service"``
        makes the query filter on the ``service`` tag, any other value
        filters on ``node_role``.
    :param filter_value: value the filter tag must be equal to.
    :param source: value of the ``source`` tag.
    :param hostname: value of the ``hostname`` tag.
    :param value: status value the alarm is expected to have.
    :param time_interval: lower time bound inserted verbatim into the
        ``time >=`` clause.
    """
    filter_by = "service" if alarm_type == "service" else "node_role"
    query = (
        "select last(value) from {select_from} where time >= {time}"
        " and source = '{source}' and {filter} and hostname = '{hostname}'"
        " and value = {value}".format(
            select_from="{}_status".format(alarm_type), time=time_interval,
            source=source, hostname=hostname, value=value,
            filter="{} = '{}'".format(filter_by, filter_value)))

    def check_result():
        result = self.INFLUXDB_GRAFANA.do_influxdb_query(
            query=query).json()["results"][0]
        # Only a non-empty "series" entry means a matching point exists.
        # Counting the keys of the result would also accept an entry that
        # carries nothing but an "error" (or other metadata) key.
        return bool(result.get("series"))

    msg = ("Alarm with source {} and {} {} and value {} was"
           " not triggered".format(source, filter_by, filter_value, value))
    devops_helpers.wait(check_result, timeout=60 * 5,
                        interval=10, timeout_msg=msg)
def get_rabbitmq_memory_usage(self, interval="now() - 5m"):
    """Return the most recent ``rabbitmq_used_memory`` value from InfluxDB.

    :param interval: lower time bound inserted verbatim into the
        ``time >=`` clause of the query.
    :returns: the second column of the first row of the first series in
        the first result of the response.
    """
    template = ("select last(value) from rabbitmq_used_memory "
                "where time >= {interval}")
    response = self.INFLUXDB_GRAFANA.do_influxdb_query(
        query=template.format(interval=interval))
    first_series = response.json()["results"][0]["series"][0]
    # Each row is indexed as [<first column>, <value>]; only the value
    # column is of interest here.
    return first_series["values"][0][1]
def set_rabbitmq_memory_watermark(self, controller, limit, timeout=5 * 60):
    """Apply ``vm_memory_high_watermark`` on a controller, with retries.

    The ``rabbitmqctl`` call is handed to ``devops_helpers.wait`` as the
    predicate, so it is repeated (10 second interval) until it exits
    with code 0 or ``timeout`` expires.

    :param controller: nailgun node the command is executed on.
    :param limit: value passed verbatim to
        ``rabbitmqctl set_vm_memory_high_watermark``.
    :param timeout: overall time budget in seconds.
    """
    command = "rabbitmqctl set_vm_memory_high_watermark {}".format(limit)

    def watermark_applied():
        # A fresh SSH session is opened for every attempt.
        with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
            return remote.execute(command)['exit_code'] == 0

    failure_msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
    devops_helpers.wait(watermark_applied, timeout=timeout,
                        interval=10, timeout_msg=failure_msg)

View File

@ -23,6 +23,10 @@ WARNING_STATUS = 1
# Status value the alarm checks expect for a critical alarm.
CRITICAL_STATUS = 3
# Fill levels passed to fill_up_filesystem() to provoke the warning and
# critical filesystem alarms.
WARNING_PERCENT = 91
CRITICAL_PERCENT = 96
# Percentages substituted into the "rabbitmqctl set_disk_free_limit"
# command used by the RabbitMQ disk alarm checks.
RABBITMQ_DISK_WARNING_PERCENT = 99.99
RABBITMQ_DISK_CRITICAL_PERCENT = 100
# Multipliers applied to the currently used RabbitMQ memory to compute the
# absolute vm_memory_high_watermark set by the memory alarm checks.
RABBITMQ_MEMORY_WARNING_VALUE = 1.01
RABBITMQ_MEMORY_CRITICAL_VALUE = 1.0001
@test(groups=["plugins"])
@ -31,26 +35,60 @@ class TestToolchainAlarms(api.ToolchainApi):
"""
def _check_filesystem_alarms(self, nailgun_node, filesystem, source,
                             filename, node_role, alarm_type="node"):
    """Drive a filesystem alarm through okay/warning/okay/critical/okay.

    :param nailgun_node: node whose filesystem is filled up; its
        ``hostname`` entry is used in the alarm queries.
    :param filesystem: filesystem handed to ``fill_up_filesystem``.
    :param source: alarm source to look for.
    :param filename: file created to fill the filesystem and removed
        again by ``clean_filesystem``.
    :param node_role: filter value forwarded to ``check_alarms``.
    :param alarm_type: alarm type forwarded to ``check_alarms``.
    """
    hostname = nailgun_node["hostname"]

    def expect(status):
        self.check_alarms(alarm_type, node_role, source, hostname, status)

    # (fill level, log message template, status expected once filled)
    stages = (
        (WARNING_PERCENT, "Checking {}-warning alarm", WARNING_STATUS),
        (CRITICAL_PERCENT, "Checking {}-critical alarm", CRITICAL_STATUS),
    )

    expect(OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(nailgun_node) as remote:
        for percent, message, status in stages:
            self.remote_ops.fill_up_filesystem(
                remote, filesystem, percent, filename)
            logger.info(message.format(source))
            expect(status)
            # Free the space again and make sure the alarm clears before
            # moving on to the next level.
            self.remote_ops.clean_filesystem(remote, filename)
            expect(OKAY_STATUS)
def _check_rabbit_mq_disk_alarms(self, controller, status, percent):
    """Trigger the RabbitMQ disk alarm and verify that it clears again.

    :param controller: nailgun node running RabbitMQ; its ``hostname``
        entry is used in the alarm queries.
    :param status: alarm status expected after the limit is changed.
    :param percent: percentage substituted into the awk expression that
        computes the new ``disk_free_limit`` from the ``df`` output.
    """
    hostname = controller["hostname"]
    # The limit is derived from the used ($3) and available ($4) columns
    # that df prints for /dev/dm-4.
    set_limit_cmd = ("rabbitmqctl set_disk_free_limit $(df | grep /dev/dm-4 | "
                     "awk '{{ printf(\"%.0f\\n\", 1024 * ((($3 + $4) * "
                     "{percent} / 100) - $3))}}')")
    read_limit_cmd = ("rabbitmqctl environment | grep disk_free_limit | "
                      "sed -r 's/}.+//' | sed 's|.*,||'")

    def expect(expected_status):
        self.check_alarms("service", "rabbitmq", "disk",
                          hostname, expected_status)

    expect(OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
        # Remember the configured limit so it can be put back afterwards.
        output = remote.check_call(read_limit_cmd)
        default_value = output['stdout'][0].rstrip()
        remote.check_call(set_limit_cmd.format(percent=percent))
        expect(status)
        remote.check_call("rabbitmqctl set_disk_free_limit {}".format(
            default_value))
        expect(OKAY_STATUS)
def _check_rabbit_mq_memory_alarms(self, controller, status, value):
    """Trigger the RabbitMQ memory alarm and verify that it clears again.

    :param controller: nailgun node running RabbitMQ; its ``hostname``
        entry is used in the alarm queries.
    :param status: alarm status expected after the watermark is lowered.
    :param value: multiplier applied to the currently used memory to get
        the absolute watermark that is set.
    """
    cmd = "rabbitmqctl set_vm_memory_high_watermark absolute \"{memory}\""
    self.check_alarms("service", "rabbitmq", "memory",
                      controller["hostname"], OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
        # Save the configured memory watermark (not the disk limit) so
        # that the very same setting is restored afterwards. The trailing
        # comma in the pattern keeps the
        # vm_memory_high_watermark_paging_ratio entry from matching.
        default_value = remote.check_call(
            "rabbitmqctl environment | grep 'vm_memory_high_watermark,' | "
            "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
        mem_usage = self.get_rabbitmq_memory_usage()
        remote.check_call(cmd.format(memory=int(mem_usage * value)))
        self.check_alarms("service", "rabbitmq", "memory",
                          controller["hostname"], status)
        self.set_rabbitmq_memory_watermark(controller, default_value)
        self.check_alarms("service", "rabbitmq", "memory",
                          controller["hostname"], OKAY_STATUS)
@test(depends_on_groups=["deploy_toolchain"],
groups=["check_mysql_fs_alarms", "toolchain", "alarms"])
@ -74,4 +112,63 @@ class TestToolchainAlarms(api.ToolchainApi):
self.helpers.cluster_id, ["controller"])[0]
self._check_filesystem_alarms(
controller, "/dev/mapper/mysql-root", "mysql-fs",
"/var/lib/mysql/test/bigfile")
"/var/lib/mysql/test/bigfile", "mysql-nodes")
@test(depends_on_groups=["deploy_toolchain"],
      groups=["check_rabbitmq_disk_alarm", "toolchain", "alarms"])
@log_snapshot_after_test
def check_rabbitmq_disk_alarm(self):
    """Check that rabbitmq-disk-limit-warning and
    rabbitmq-disk-limit-critical alarms work as expected.

    Scenario:
        1. Check the last value of the okay alarm in InfluxDB.
        2. Set RabbitMQ disk limit to 99.99 percent of available space.
        3. Check the last value of the warning alarm in InfluxDB.
        4. Set RabbitMQ disk limit to the default value.
        5. Check the last value of the okay alarm in InfluxDB.
        6. Set RabbitMQ disk limit to 100 percent of available space.
        7. Check the last value of the critical alarm in InfluxDB.
        8. Set RabbitMQ disk limit to the default value.
        9. Check the last value of the okay alarm in InfluxDB.

    Duration 10m
    """
    self.env.revert_snapshot("deploy_toolchain")
    controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        self.helpers.cluster_id, ["controller"])
    controller = controllers[0]
    # Warning first, then critical; each helper call covers a full
    # okay -> alarm -> okay cycle.
    for status, percent in (
            (WARNING_STATUS, RABBITMQ_DISK_WARNING_PERCENT),
            (CRITICAL_STATUS, RABBITMQ_DISK_CRITICAL_PERCENT)):
        self._check_rabbit_mq_disk_alarms(controller, status, percent)
@test(depends_on_groups=["deploy_toolchain"],
      groups=["check_rabbitmq_memory_alarm", "toolchain",
              "alarms"])
@log_snapshot_after_test
def check_rabbitmq_memory_alarm(self):
    """Check that rabbitmq-memory-limit-warning and
    rabbitmq-memory-limit-critical alarms work as expected.

    Scenario:
        1. Check the last value of the okay alarm in InfluxDB.
        2. Set RabbitMQ memory limit to 101 percent of currently
           used memory.
        3. Check the last value of the warning alarm in InfluxDB.
        4. Set RabbitMQ memory limit to the default value.
        5. Check the last value of the okay alarm in InfluxDB.
        6. Set RabbitMQ memory limit to 100.01 percent of currently
           used memory.
        7. Check the last value of the critical alarm in InfluxDB.
        8. Set RabbitMQ memory limit to the default value.
        9. Check the last value of the okay alarm in InfluxDB.

    Duration 10m
    """
    self.env.revert_snapshot("deploy_toolchain")
    controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        self.helpers.cluster_id, ["controller"])
    controller = controllers[0]
    # Warning first, then critical; each helper call covers a full
    # okay -> alarm -> okay cycle.
    for status, multiplier in (
            (WARNING_STATUS, RABBITMQ_MEMORY_WARNING_VALUE),
            (CRITICAL_STATUS, RABBITMQ_MEMORY_CRITICAL_VALUE)):
        self._check_rabbit_mq_memory_alarms(controller, status, multiplier)