Add tests for RabbitMQ memory and disk alarms
Change-Id: Ia1e178cbeace91e978444cd44a5e0dcf0c08d7fd
This commit is contained in:
parent
70fe115643
commit
ddbdfec778
@ -324,20 +324,45 @@ class ToolchainApi(object):
|
||||
self.helpers.check_notifications(notification_list,
|
||||
cinder_event_types)
|
||||
|
||||
def check_alarms(self, alarm_type, filter_value, source, hostname, value,
                 time_interval="now() - 5m"):
    """Wait until InfluxDB holds an alarm status point with the given value.

    The query targets the ``<alarm_type>_status`` measurement and is
    re-issued through ``devops_helpers.wait`` every 10 seconds for up to
    5 minutes.

    :param alarm_type: measurement prefix; ``"service"`` makes the query
        filter on the ``service`` tag, any other value on ``node_role``.
    :param filter_value: value the ``service`` / ``node_role`` tag must
        have.
    :param source: value of the ``source`` tag of the alarm.
    :param hostname: value of the ``hostname`` tag of the alarm.
    :param value: status value the alarm is expected to report.
    :param time_interval: InfluxQL expression used as the lower time
        bound of the query.
    """
    filter_by = "service" if alarm_type == "service" else "node_role"
    query = (
        "select last(value) from {select_from} where time >= {time}"
        " and source = '{source}' and {filter} and hostname = '{hostname}'"
        " and value = {value}".format(
            select_from="{}_status".format(alarm_type), time=time_interval,
            source=source, hostname=hostname, value=value,
            filter="{} = '{}'".format(filter_by, filter_value)))

    def check_result():
        result = self.INFLUXDB_GRAFANA.do_influxdb_query(
            query=query).json()["results"][0]
        # Only a non-empty "series" means that a matching point exists.
        # Counting the keys of the result dict is not reliable because a
        # result may carry other keys (e.g. "statement_id" or "error")
        # while containing no data at all.
        return bool(result.get("series"))

    msg = ("Alarm with source {} and {} {} and value {} was"
           " not triggered".format(source, filter_by, filter_value, value))
    # NOTE(review): devops_helpers.wait presumably raises with
    # timeout_msg once the timeout expires - confirm against devops.
    devops_helpers.wait(check_result, timeout=60 * 5,
                        interval=10, timeout_msg=msg)
|
||||
|
||||
def get_rabbitmq_memory_usage(self, interval="now() - 5m"):
    """Return the latest ``rabbitmq_used_memory`` value stored in InfluxDB.

    :param interval: InfluxQL expression used as the lower time bound of
        the query.
    :returns: the value column of the first row of the first series.
    :raises KeyError, IndexError: when the response holds no such point;
        the exception message includes the query and the raw response.
    """
    query = ("select last(value) from rabbitmq_used_memory "
             "where time >= {interval}".format(interval=interval))
    result = self.INFLUXDB_GRAFANA.do_influxdb_query(query=query).json()
    try:
        # Each row is [time, value]; index 1 selects the value.
        return result["results"][0]["series"][0]["values"][0][1]
    except (KeyError, IndexError) as exc:
        # Keep the original exception type, but say what was missing
        # instead of surfacing a bare "'series'" or "list index" error.
        raise type(exc)(
            "No rabbitmq_used_memory point returned for query {!r}: "
            "{!r}".format(query, result))
|
||||
|
||||
def set_rabbitmq_memory_watermark(self, controller, limit, timeout=5 * 60):
    """Set RabbitMQ ``vm_memory_high_watermark`` on a controller, retrying.

    The ``rabbitmqctl`` command is re-run over a fresh SSH connection
    every 10 seconds until it exits with code 0 or ``timeout`` expires.

    :param controller: nailgun node to run the command on.
    :param limit: value passed to
        ``rabbitmqctl set_vm_memory_high_watermark``.
    :param timeout: overall time budget in seconds handed to
        ``devops_helpers.wait``.
    """
    command = "rabbitmqctl set_vm_memory_high_watermark {}".format(limit)

    def check_result():
        with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
            # A zero exit code is the only success criterion.
            return remote.execute(command)['exit_code'] == 0

    msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
    devops_helpers.wait(check_result, timeout=timeout,
                        interval=10, timeout_msg=msg)
|
||||
|
@ -23,6 +23,10 @@ WARNING_STATUS = 1
|
||||
CRITICAL_STATUS = 3
|
||||
WARNING_PERCENT = 91
|
||||
CRITICAL_PERCENT = 96
|
||||
RABBITMQ_DISK_WARNING_PERCENT = 99.99
|
||||
RABBITMQ_DISK_CRITICAL_PERCENT = 100
|
||||
RABBITMQ_MEMORY_WARNING_VALUE = 1.01
|
||||
RABBITMQ_MEMORY_CRITICAL_VALUE = 1.0001
|
||||
|
||||
|
||||
@test(groups=["plugins"])
|
||||
@ -31,26 +35,60 @@ class TestToolchainAlarms(api.ToolchainApi):
|
||||
"""
|
||||
|
||||
def _check_filesystem_alarms(self, nailgun_node, filesystem, source,
                             filename, node_role, alarm_type="node"):
    """Drive a filesystem alarm through okay/warning/okay/critical/okay.

    The filesystem is filled up to WARNING_PERCENT and then to
    CRITICAL_PERCENT; after each fill the matching alarm status is
    awaited, the filler file is removed and the okay status is awaited
    again.

    :param nailgun_node: node whose filesystem is filled; its
        ``hostname`` is used to look the alarm up.
    :param filesystem: filesystem handed to
        ``remote_ops.fill_up_filesystem``.
    :param source: alarm source name, also used in the log messages.
    :param filename: path of the filler file to create and remove.
    :param node_role: filter value forwarded to ``check_alarms``.
    :param alarm_type: alarm type forwarded to ``check_alarms``.
    """
    hostname = nailgun_node["hostname"]

    def expect(status):
        self.check_alarms(alarm_type, node_role, source, hostname, status)

    stages = (
        (WARNING_PERCENT, "Checking {}-warning alarm", WARNING_STATUS),
        (CRITICAL_PERCENT, "Checking {}-critical alarm", CRITICAL_STATUS),
    )

    expect(OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(nailgun_node) as remote:
        for percent, log_template, status in stages:
            self.remote_ops.fill_up_filesystem(
                remote, filesystem, percent, filename)
            logger.info(log_template.format(source))
            expect(status)
            # Free the space again and wait for the alarm to clear
            # before moving on to the next threshold.
            self.remote_ops.clean_filesystem(remote, filename)
            expect(OKAY_STATUS)
|
||||
|
||||
def _check_rabbit_mq_disk_alarms(self, controller, status, percent,
                                 device="/dev/dm-4"):
    """Trigger the RabbitMQ disk alarm and check that it clears again.

    Sequence: wait for the okay status, remember the current
    ``disk_free_limit``, raise the limit, wait for ``status``, restore
    the remembered limit and wait for the okay status again.

    :param controller: nailgun node running RabbitMQ.
    :param status: alarm status expected after the limit is raised.
    :param percent: percentage inserted into the limit formula below.
    :param device: pattern used to pick the ``df`` line the limit is
        computed from; defaults to the previously hard-coded device.
    """
    # awk computes 1024 * ((used + available) * percent / 100 - used)
    # from columns 3 and 4 of the selected df line.
    # NOTE(review): the factor 1024 assumes df reports 1K blocks -
    # confirm on the target image.
    cmd = ("rabbitmqctl set_disk_free_limit $(df | grep {device} | "
           "awk '{{ printf(\"%.0f\\n\", 1024 * ((($3 + $4) * "
           "{percent} / 100) - $3))}}')")
    hostname = controller["hostname"]
    self.check_alarms("service", "rabbitmq", "disk", hostname, OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
        # The two sed calls cut the value out of the matching
        # "{disk_free_limit,<value>}" line of the environment dump.
        default_value = remote.check_call(
            "rabbitmqctl environment | grep disk_free_limit | "
            "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
        remote.check_call(cmd.format(device=device, percent=percent))
        self.check_alarms("service", "rabbitmq", "disk", hostname, status)
        remote.check_call("rabbitmqctl set_disk_free_limit {}".format(
            default_value))
        self.check_alarms("service", "rabbitmq", "disk",
                          hostname, OKAY_STATUS)
|
||||
|
||||
def _check_rabbit_mq_memory_alarms(self, controller, status, value):
    """Trigger the RabbitMQ memory alarm and check that it clears again.

    Sequence: wait for the okay status, remember the current
    ``vm_memory_high_watermark``, set an absolute watermark of
    ``int(<current memory usage> * value)``, wait for ``status``,
    restore the remembered watermark and wait for the okay status again.

    :param controller: nailgun node running RabbitMQ.
    :param status: alarm status expected after the watermark is lowered.
    :param value: multiplier applied to the memory usage reported by
        ``get_rabbitmq_memory_usage``.
    """
    cmd = "rabbitmqctl set_vm_memory_high_watermark absolute \"{memory}\""
    hostname = controller["hostname"]
    self.check_alarms("service", "rabbitmq", "memory",
                      hostname, OKAY_STATUS)
    with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
        # Read the memory watermark itself: the value is restored below
        # through set_vm_memory_high_watermark, so taking it from
        # disk_free_limit would write a disk limit into the memory
        # setting. The trailing comma in the pattern keeps
        # vm_memory_high_watermark_paging_ratio out of the match.
        # NOTE(review): assumes the watermark is configured as a plain
        # number, not as an {absolute, ...} tuple - confirm.
        default_value = remote.check_call(
            "rabbitmqctl environment | grep 'vm_memory_high_watermark,' | "
            "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
        mem_usage = self.get_rabbitmq_memory_usage()
        remote.check_call(cmd.format(memory=int(mem_usage * value)))
        self.check_alarms("service", "rabbitmq", "memory",
                          hostname, status)
        self.set_rabbitmq_memory_watermark(controller, default_value)
        self.check_alarms("service", "rabbitmq", "memory",
                          hostname, OKAY_STATUS)
|
||||
|
||||
@test(depends_on_groups=["deploy_toolchain"],
|
||||
groups=["check_mysql_fs_alarms", "toolchain", "alarms"])
|
||||
@ -74,4 +112,63 @@ class TestToolchainAlarms(api.ToolchainApi):
|
||||
self.helpers.cluster_id, ["controller"])[0]
|
||||
self._check_filesystem_alarms(
|
||||
controller, "/dev/mapper/mysql-root", "mysql-fs",
|
||||
"/var/lib/mysql/test/bigfile")
|
||||
"/var/lib/mysql/test/bigfile", "mysql-nodes")
|
||||
|
||||
@test(depends_on_groups=["deploy_toolchain"],
      groups=["check_rabbitmq_disk_alarm", "toolchain", "alarms"])
@log_snapshot_after_test
def check_rabbitmq_disk_alarm(self):
    """Check that rabbitmq-disk-limit-warning and
    rabbitmq-disk-limit-critical alarms work as expected.

    Scenario:
        1. Check the last value of the okay alarm in InfluxDB.
        2. Set RabbitMQ disk limit to 99.99 percent of available space.
        3. Check the last value of the warning alarm in InfluxDB.
        4. Set RabbitMQ disk limit to the default value.
        5. Check the last value of the okay alarm in InfluxDB.
        6. Set RabbitMQ disk limit to 100 percent of available space.
        7. Check the last value of the critical alarm in InfluxDB.
        8. Set RabbitMQ disk limit to the default value.
        9. Check the last value of the okay alarm in InfluxDB.

    Duration 10m
    """
    self.env.revert_snapshot("deploy_toolchain")
    controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        self.helpers.cluster_id, ["controller"])
    controller = controllers[0]
    # Warning threshold first, then critical; each helper call restores
    # the default limit and waits for the okay status before returning.
    thresholds = (
        (WARNING_STATUS, RABBITMQ_DISK_WARNING_PERCENT),
        (CRITICAL_STATUS, RABBITMQ_DISK_CRITICAL_PERCENT),
    )
    for expected_status, percent in thresholds:
        self._check_rabbit_mq_disk_alarms(
            controller, expected_status, percent)
|
||||
|
||||
@test(depends_on_groups=["deploy_toolchain"],
      groups=["check_rabbitmq_memory_alarm", "toolchain",
              "alarms"])
@log_snapshot_after_test
def check_rabbitmq_memory_alarm(self):
    """Check that rabbitmq-memory-limit-warning and
    rabbitmq-memory-limit-critical alarms work as expected.

    Scenario:
        1. Check the last value of the okay alarm in InfluxDB.
        2. Set RabbitMQ memory limit to 101 percent of currently
        used memory.
        3. Check the last value of the warning alarm in InfluxDB.
        4. Set RabbitMQ memory limit to the default value.
        5. Check the last value of the okay alarm in InfluxDB.
        6. Set RabbitMQ memory limit to 100.01 percent of currently
        used memory.
        7. Check the last value of the critical alarm in InfluxDB.
        8. Set RabbitMQ memory limit to the default value.
        9. Check the last value of the okay alarm in InfluxDB.

    Duration 10m
    """
    self.env.revert_snapshot("deploy_toolchain")
    controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        self.helpers.cluster_id, ["controller"])
    controller = controllers[0]
    # Warning threshold first, then critical; each helper call restores
    # the default watermark and waits for the okay status before
    # returning.
    thresholds = (
        (WARNING_STATUS, RABBITMQ_MEMORY_WARNING_VALUE),
        (CRITICAL_STATUS, RABBITMQ_MEMORY_CRITICAL_VALUE),
    )
    for expected_status, multiplier in thresholds:
        self._check_rabbit_mq_memory_alarms(
            controller, expected_status, multiplier)
|
||||
|
Loading…
Reference in New Issue
Block a user