Fix tests that check services' and nodes' alerts

Fix the name of the method that returns the VIP resource name:
method "PluginHelper.full_vip_name" was renamed to "PluginHelper.get_vip_resource_name".
Fix naming of services on Nagios: the "global-" prefix was added
for plugins with version >= 1.0.
Fix checking of HAProxy backends state:
removed checks for services
that don't have an HAProxy backend.
Fixed disk usage percentage to be consistent
with the alerts' criteria.
Temporarily disabled checking of openvswitch-agent,
because it is managed by pacemaker.
Removed code duplication in tests.

Change-Id: If3b77c595fbb5e4348f5f3ebd6e82b445b01062c
Closes-Bug: #1627671
This commit is contained in:
Rodion Promyshlennikov 2016-10-04 11:46:06 +03:00
parent bd151c4956
commit 86966e30bd
4 changed files with 155 additions and 165 deletions

View File

@ -139,13 +139,10 @@ def manage_service(remote, name, operation="restart"):
:param operation: type of operation, usually start, stop or restart.
:type operation: str
"""
if remote.execute("service {} status".format(name))['exit_code'] == 0:
service_cmd = 'service {service} {operation}'
elif remote.execute("initctl status {}".format(name))['exit_code'] == 0:
if remote.execute("ls /etc/init/{}.conf".format(name))["exit_code"] == 0:
service_cmd = 'initctl {operation} {service}'
else:
raise Exception('no service handler!')
service_cmd = 'service {service} {operation}'
remote.check_call(service_cmd.format(service=name, operation=operation))

View File

@ -11,16 +11,15 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import six.moves as sm
from devops.helpers import helpers
from fuelweb_test import logger
from proboscis import asserts
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common import exceptions
from selenium.webdriver.common import by
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support import ui
from stacklight_tests import base_test
from stacklight_tests.lma_infrastructure_alerting import (
@ -100,8 +99,8 @@ class InfraAlertingPluginApi(base_test.PluginApi):
link.click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element_by_name("main"))
WebDriverWait(driver, 120).until(
EC.presence_of_element_located((By.XPATH, anchor)))
ui.WebDriverWait(driver, 120).until(
ec.presence_of_element_located((by.By.XPATH, anchor)))
return driver
def check_node_in_nagios(self, changed_node, state):
@ -117,7 +116,7 @@ class InfraAlertingPluginApi(base_test.PluginApi):
def node_is_present(self, driver, name):
table = self.ui_tester.get_table(driver,
"/html/body/div[2]/table/tbody")
for ind in sm.xrange(2, self.ui_tester.get_table_size(table) + 1):
for ind in range(2, self.ui_tester.get_table_size(table) + 1):
node_name = self.ui_tester.get_table_cell(
table, ind, 1).text.rstrip()
if name == node_name:
@ -133,9 +132,12 @@ class InfraAlertingPluginApi(base_test.PluginApi):
return self.helpers.check_plugin_cannot_be_uninstalled(
self.settings.name, self.settings.version)
def get_services_for_node(self, table, node_name, driver,
def get_services_for_node(self, node_name, driver,
table_xpath="/html/body/table[3]/tbody"):
services = {}
limit_xpath = "//select[@name='limit']/option[@value='0']"
driver.find_element_by_xpath(limit_xpath).click()
table = self.ui_tester.get_table(driver, table_xpath)
found_node = False
ind = 2
while ind < self.ui_tester.get_table_size(table) + 1:
@ -144,6 +146,7 @@ class InfraAlertingPluginApi(base_test.PluginApi):
if found_node:
break
else:
ind += 1
continue
if self.ui_tester.get_table_cell(
table, ind, 1).text == node_name:
@ -152,7 +155,8 @@ class InfraAlertingPluginApi(base_test.PluginApi):
services[self.ui_tester.get_table_cell(
table, ind, 2).text] = (
self.ui_tester.get_table_cell(table, ind, 3).text)
except StaleElementReferenceException:
except exceptions.StaleElementReferenceException:
driver.find_element_by_xpath(limit_xpath).click()
table = self.ui_tester.get_table(driver, table_xpath)
ind -= 1
ind += 1
@ -162,12 +166,12 @@ class InfraAlertingPluginApi(base_test.PluginApi):
def check_service_state_on_nagios(self, driver, service_state=None,
node_names=None):
self.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
driver, "Services", "//table[@class='headertable']")
table = self.ui_tester.get_table(driver, "/html/body/table[3]/tbody")
if not node_names:
node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
for node in node_names:
node_services = self.get_services_for_node(table, node, driver)
node_services = self.get_services_for_node(node, driver)
if service_state:
for service in service_state:
if service_state[service] != node_services[service]:
@ -182,20 +186,10 @@ class InfraAlertingPluginApi(base_test.PluginApi):
node_names=None):
msg = ("Fail to get expected service states for services: {0} "
"on nodes: {1}")
if not service_state or not node_names:
self.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
table = self.ui_tester.get_table(driver,
"/html/body/table[3]/tbody")
if not node_names:
node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
if not service_state:
service_state = dict((key, 'OK') for key in
self.get_services_for_node(
table, node_names[0], driver))
msg = msg.format([key for key in service_state], node_names)
msg = msg.format(
[key for key in service_state]
if service_state is not None else "all",
node_names if node_names is not None else "global-cluster")
helpers.wait(lambda: self.check_service_state_on_nagios(
driver, service_state, node_names), timeout=60 * 5,

View File

@ -380,16 +380,18 @@ class ToolchainApi(object):
and Nagios UI.
:param service_name: name of the service to change state of.
Format [service name, service name
on dashboard] e.g. ['nova-api', 'nova']
Format:
[service name, service name in influx,
service name on nagios, haproxy backend for the service (or None)]
e.g. ['nova-api', 'nova', 'global-nova', 'nova-api']
:type service_name: list.
:param action: action to perform (e.g. stop, start).
:type action: str
:param new_state: new state of the service.
:type new_state: str
:param service_state_in_influx: new state of the service in influx.
:type new_state: int
:param down_backends_in_haproxy: amout of backends in 'down' state.
:type service_state_in_influx: int
:param down_backends_in_haproxy: amount of backends in 'down' state.
:type down_backends_in_haproxy: int
:param toolchain_node: toolchain node with
infrastructure_alerting_ui vip.
@ -411,15 +413,16 @@ class ToolchainApi(object):
node) as remote:
self.remote_ops.manage_service(remote, service_name[0], action)
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {service_name[1]: new_state})
nagios_driver, {service_name[2]: new_state})
self.INFLUXDB_GRAFANA.check_cluster_status(
service_name[1], service_state_in_influx)
if service_name[3]:
self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends(
service_name[0], expected_count=down_backends_in_haproxy)
service_name[3], expected_count=down_backends_in_haproxy)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
self.checkers.check_local_mail(
remote, toolchain_node["name"], service_name[1], new_state)
remote, toolchain_node["name"], service_name[2], new_state)
def change_verify_node_service_state(self, services, state, influx_state,
percent, toolchain_node,
@ -458,11 +461,11 @@ class ToolchainApi(object):
"/var/lib/mysql/test/bigfile")
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: 'OK'})
nagios_driver, {services[0]: "OK"})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: state},
[controller_nodes[0]['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
[controller_nodes[0]["hostname"]])
self.INFLUXDB_GRAFANA.check_cluster_status(services[2],
self.settings.OKAY)
with self.fuel_web.get_ssh_for_nailgun_node(
@ -475,8 +478,8 @@ class ToolchainApi(object):
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: state})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: state}, [node['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0], influx_state)
nagios_driver, {services[1]: state}, [node["hostname"]])
self.INFLUXDB_GRAFANA.check_cluster_status(services[2], influx_state)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
@ -490,13 +493,13 @@ class ToolchainApi(object):
for node in controller_nodes:
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: 'OK'})
nagios_driver, {services[0]: "OK"})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: 'OK'}, [node['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
nagios_driver, {services[1]: "OK"}, [node["hostname"]])
self.INFLUXDB_GRAFANA.check_cluster_status(services[2],
self.settings.OKAY)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
self.checkers.check_local_mail(
remote, toolchain_node["name"], services[0], 'OK')
remote, toolchain_node["name"], services[0], "OK")

View File

@ -236,6 +236,78 @@ class TestFunctionalToolchain(api.ToolchainApi):
self.check_cinder_notifications()
def _check_services_alerts(self, controllers_count, nagios_status,
                           influx_status, down_haproxy_count):
    """Stop and restart each listed service and verify its alerts.

    For every (service, haproxy backend) pair of every component,
    ``change_verify_service_state`` is called twice: first with the
    "stop" action and the expected degraded statuses, then with the
    "start" action expecting "OK", ``self.settings.OKAY`` and no
    backends down.

    :param controllers_count: how many controller nodes (taken from the
        head of the controllers list) are passed on as controller_nodes.
    :type controllers_count: int
    :param nagios_status: state expected on Nagios after the stop.
    :type nagios_status: str
    :param influx_status: state expected in influx after the stop.
    :type influx_status: int
    :param down_haproxy_count: amount of haproxy backends expected to be
        in 'down' state after the stop.
    :type down_haproxy_count: int
    """
    # component -> [(system service, haproxy backend or None), ...]
    service_map = {
        "nova": [("nova-api", "nova-api"), ("nova-scheduler", None)],
        "cinder": [("cinder-api", "cinder-api"),
                   ("cinder-scheduler", None)],
        "neutron": [
            ("neutron-server", "neutron-api"),
            # TODO(rpromyshlennikov): temporary fix,
            # because openvswitch-agent is managed by pacemaker
            # ("neutron-openvswitch-agent", None)
        ],
        "glance": [("glance-api", "glance-api")],
        "heat": [("heat-api", "heat-api")],
        "keystone": [("apache2", "keystone-public-api")]
    }

    nagios = self.LMA_INFRASTRUCTURE_ALERTING
    # Plugins whose version starts with "0." use the bare component
    # name on Nagios; any other version gets the "global-" prefix.
    nagios_names = {
        name: (name if nagios.settings.version.startswith("0.")
               else "global-{}".format(name))
        for name in service_map
    }

    vip_node = self.helpers.get_node_with_vip(
        self.settings.stacklight_roles,
        self.helpers.get_vip_resource_name(nagios.settings.failover_vip))
    toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
        vip_node)

    nagios_url = nagios.get_authenticated_nagios_url()
    with self.ui_tester.ui_driver(nagios_url, "Nagios Core",
                                  "//frame[2]") as driver:
        nagios.open_nagios_page(
            driver, "Services", "//table[@class='headertable']")
        all_controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
            self.helpers.cluster_id, ["controller"])
        controller_nodes = all_controllers[:controllers_count]

        for component, services in service_map.items():
            for service, haproxy_backend in services:
                logger.info("Checking service {0}".format(service))
                # (action, nagios state, influx state, down backends)
                phases = (
                    ("stop", nagios_status, influx_status,
                     down_haproxy_count),
                    ("start", "OK", self.settings.OKAY, 0),
                )
                for action, state, influx_state, down_count in phases:
                    self.change_verify_service_state(
                        service_name=[
                            service, component,
                            nagios_names[component],
                            haproxy_backend],
                        action=action,
                        new_state=state,
                        service_state_in_influx=influx_state,
                        down_backends_in_haproxy=down_count,
                        toolchain_node=toolchain_node,
                        controller_nodes=controller_nodes,
                        nagios_driver=driver)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_warning_alert_service", "service_restart",
"toolchain", "functional"])
@ -282,54 +354,17 @@ class TestFunctionalToolchain(api.ToolchainApi):
Duration 45m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
params = {"controllers_count": 1,
"nagios_status": "WARNING",
"influx_status": self.settings.WARN,
"down_haproxy_count": 1}
services = {
'nova': ['nova-api', 'nova-scheduler'],
'cinder': ['cinder-api', 'cinder-scheduler'],
'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
'glance': ['glance-api'],
'heat': ['heat-api'],
'keystone': ['apache2']
}
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name(
self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "Nagios Core",
"//frame[2]") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
controller_node = (
self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])[0])
for key in services:
for service in services[key]:
self.change_verify_service_state(
service_name=[service, key], action='stop',
new_state='WARNING',
service_state_in_influx=self.settings.WARN,
down_backends_in_haproxy=1,
toolchain_node=toolchain_node,
controller_nodes=[controller_node],
nagios_driver=driver)
self.change_verify_service_state(
service_name=[service, key], action='start',
new_state='OK',
service_state_in_influx=self.settings.OKAY,
down_backends_in_haproxy=0,
toolchain_node=toolchain_node,
controller_nodes=[controller_node],
nagios_driver=driver)
self._check_services_alerts(**params)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_critical_alert_service", "service_restart",
"toolchain", "functional"])
# @log_snapshot_after_test
@log_snapshot_after_test
def toolchain_critical_alert_service(self):
"""Verify that the critical alerts for services show up in
the Grafana and Nagios UI.
@ -367,52 +402,39 @@ class TestFunctionalToolchain(api.ToolchainApi):
Duration 45m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
params = {"controllers_count": 2,
"nagios_status": "CRITICAL",
"influx_status": self.settings.CRIT,
"down_haproxy_count": 2}
services = {
'nova': ['nova-api', 'nova-scheduler'],
'cinder': ['cinder-api', 'cinder-scheduler'],
'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
'glance': ['glance-api'],
'heat': ['heat-api'],
'keystone': ['apache2']
}
self._check_services_alerts(**params)
def _check_mysql_alerts_node(
self, nagios_status, influx_status, disk_usage_percent):
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name(
self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
self.helpers.get_vip_resource_name(
"infrastructure_alerting_mgmt_vip"))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ["controller"])
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
alerting_plugin = self.LMA_INFRASTRUCTURE_ALERTING
url = alerting_plugin.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "Nagios Core",
"//frame[2]") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
controller_nodes = (
self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller']))
for key in services:
for service in services[key]:
logger.info("Checking service {0}".format(service))
self.change_verify_service_state(
service_name=[service, key], action='stop',
new_state='CRITICAL',
service_state_in_influx=self.settings.CRIT,
down_backends_in_haproxy=2,
toolchain_node=toolchain_node,
controller_nodes=[controller_nodes[0],
controller_nodes[1]],
nagios_driver=driver)
self.change_verify_service_state(
service_name=[service, key], action='start',
new_state='OK',
service_state_in_influx=self.settings.OKAY,
down_backends_in_haproxy=0,
toolchain_node=toolchain_node,
controller_nodes=[controller_nodes[0],
controller_nodes[1]],
nagios_driver=driver)
alerting_plugin.open_nagios_page(
driver, "Services", "//table[@class='headertable']")
nagios_service_name = (
"mysql"
if alerting_plugin.settings.version.startswith("0.")
else "global-mysql")
self.change_verify_node_service_state(
[nagios_service_name, "mysql-nodes.mysql-fs", "mysql"],
nagios_status,
influx_status, disk_usage_percent, toolchain_node,
nailgun_nodes[:2], driver)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_warning_alert_node", "node_alert_warning",
@ -473,24 +495,11 @@ class TestFunctionalToolchain(api.ToolchainApi):
Duration 15m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "Nagios Core",
"//frame[2]") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
self.change_verify_node_service_state(
['mysql', 'mysql-nodes.mysql-fs'], 'WARNING',
self.settings.WARN, '96', toolchain_node,
[nailgun_nodes[0], nailgun_nodes[1]], driver)
params = {
"nagios_status": "WARNING",
"influx_status": self.settings.WARN,
"disk_usage_percent": 91}
self._check_mysql_alerts_node(**params)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_critical_alert_node", "node_alert_critical",
@ -550,21 +559,8 @@ class TestFunctionalToolchain(api.ToolchainApi):
Duration 15m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "Nagios Core",
"//frame[2]") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
self.change_verify_node_service_state(
['mysql', 'mysql-nodes.mysql-fs'], 'CRITICAL',
self.settings.UNKW, '98', toolchain_node,
[nailgun_nodes[0], nailgun_nodes[1]], driver)
params = {
"nagios_status": "CRITICAL",
"influx_status": self.settings.CRIT,
"disk_usage_percent": 96}
self._check_mysql_alerts_node(**params)