diff --git a/stacklight_tests/helpers/remote_ops.py b/stacklight_tests/helpers/remote_ops.py index 90edadd..f170604 100644 --- a/stacklight_tests/helpers/remote_ops.py +++ b/stacklight_tests/helpers/remote_ops.py @@ -139,13 +139,10 @@ def manage_service(remote, name, operation="restart"): :param operation: type of operation, usually start, stop or restart. :type operation: str """ - - if remote.execute("service {} status".format(name))['exit_code'] == 0: - service_cmd = 'service {service} {operation}' - elif remote.execute("initctl status {}".format(name))['exit_code'] == 0: + if remote.execute("ls /etc/init/{}.conf".format(name))["exit_code"] == 0: service_cmd = 'initctl {operation} {service}' else: - raise Exception('no service handler!') + service_cmd = 'service {service} {operation}' remote.check_call(service_cmd.format(service=name, operation=operation)) diff --git a/stacklight_tests/lma_infrastructure_alerting/api.py b/stacklight_tests/lma_infrastructure_alerting/api.py index c8a9559..d78bc4f 100644 --- a/stacklight_tests/lma_infrastructure_alerting/api.py +++ b/stacklight_tests/lma_infrastructure_alerting/api.py @@ -11,19 +11,18 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. -import six.moves as sm from devops.helpers import helpers from fuelweb_test import logger from proboscis import asserts -from selenium.common.exceptions import StaleElementReferenceException -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait +from selenium.common import exceptions +from selenium.webdriver.common import by +from selenium.webdriver.support import expected_conditions as ec +from selenium.webdriver.support import ui from stacklight_tests import base_test -from stacklight_tests.lma_infrastructure_alerting import( +from stacklight_tests.lma_infrastructure_alerting import ( plugin_settings as infra_alerting_plugin_settings) @@ -100,8 +99,8 @@ class InfraAlertingPluginApi(base_test.PluginApi): link.click() driver.switch_to.default_content() driver.switch_to.frame(driver.find_element_by_name("main")) - WebDriverWait(driver, 120).until( - EC.presence_of_element_located((By.XPATH, anchor))) + ui.WebDriverWait(driver, 120).until( + ec.presence_of_element_located((by.By.XPATH, anchor))) return driver def check_node_in_nagios(self, changed_node, state): @@ -117,7 +116,7 @@ class InfraAlertingPluginApi(base_test.PluginApi): def node_is_present(self, driver, name): table = self.ui_tester.get_table(driver, "/html/body/div[2]/table/tbody") - for ind in sm.xrange(2, self.ui_tester.get_table_size(table) + 1): + for ind in range(2, self.ui_tester.get_table_size(table) + 1): node_name = self.ui_tester.get_table_cell( table, ind, 1).text.rstrip() if name == node_name: @@ -133,9 +132,12 @@ class InfraAlertingPluginApi(base_test.PluginApi): return self.helpers.check_plugin_cannot_be_uninstalled( self.settings.name, self.settings.version) - def get_services_for_node(self, table, node_name, driver, + def get_services_for_node(self, node_name, driver, table_xpath="/html/body/table[3]/tbody"): services = {} + limit_xpath = "//select[@name='limit']/option[@value='0']" + driver.find_element_by_xpath(limit_xpath).click() + table = self.ui_tester.get_table(driver, table_xpath) found_node = False ind = 2 while ind < self.ui_tester.get_table_size(table) + 1: @@ -144,6 +146,7 @@ class InfraAlertingPluginApi(base_test.PluginApi): if found_node: break else: + ind += 1 continue if self.ui_tester.get_table_cell( table, ind, 1).text == node_name: @@ -152,7 +155,8 @@ class InfraAlertingPluginApi(base_test.PluginApi): services[self.ui_tester.get_table_cell( table, ind, 2).text] = ( self.ui_tester.get_table_cell(table, ind, 3).text) - except StaleElementReferenceException: + except exceptions.StaleElementReferenceException: + driver.find_element_by_xpath(limit_xpath).click() table = self.ui_tester.get_table(driver, table_xpath) ind -= 1 ind += 1 @@ -162,12 +166,12 @@ class InfraAlertingPluginApi(base_test.PluginApi): def check_service_state_on_nagios(self, driver, service_state=None, node_names=None): self.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") + driver, "Services", "//table[@class='headertable']") table = self.ui_tester.get_table(driver, "/html/body/table[3]/tbody") if not node_names: node_names = [self.ui_tester.get_table_cell(table, 2, 1).text] for node in node_names: - node_services = self.get_services_for_node(table, node, driver) + node_services = self.get_services_for_node(node, driver) if service_state: for service in service_state: if service_state[service] != node_services[service]: @@ -182,20 +186,10 @@ class InfraAlertingPluginApi(base_test.PluginApi): node_names=None): msg = ("Fail to get expected service states for services: {0} " "on nodes: {1}") - - if not service_state or not node_names: - self.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") - table = self.ui_tester.get_table(driver, - "/html/body/table[3]/tbody") - if not node_names: - node_names = [self.ui_tester.get_table_cell(table, 2, 1).text] - if not service_state: - service_state = dict((key, 'OK') for key in - self.get_services_for_node( - table, node_names[0], driver)) - - msg = msg.format([key for key in service_state], node_names) + msg = msg.format( + [key for key in service_state] + if service_state is not None else "all", + node_names if node_names is not None else "global-cluster") helpers.wait(lambda: self.check_service_state_on_nagios( driver, service_state, node_names), timeout=60 * 5, diff --git a/stacklight_tests/toolchain/api.py b/stacklight_tests/toolchain/api.py index cc83550..7af7c33 100644 --- a/stacklight_tests/toolchain/api.py +++ b/stacklight_tests/toolchain/api.py @@ -384,16 +384,18 @@ class ToolchainApi(object): and Nagios UI. :param service_name: name of the service to change state of. - Format [service name, service name - on dashboard] e.g. ['nova-api', 'nova'] + Format: + [service name, service name in influx, + service name on nagios, haproxy-backend for service if exist] + e.g. ['nova-api', 'nova', 'nova-global', 'nova-api'] :type service_name: list. :param action: action to perform (e.g. stop, start). :type action: str :param new_state: new state of the service. :type new_state: str :param service_state_in_influx: new state of the service in influx. - :type new_state: int - :param down_backends_in_haproxy: amout of backends in 'down' state. + :type service_state_in_influx: int + :param down_backends_in_haproxy: amount of backends in 'down' state. :type down_backends_in_haproxy: int :param toolchain_node: toolchain node with infrastructure_alerting_ui vip. @@ -415,15 +417,16 @@ class ToolchainApi(object): node) as remote: self.remote_ops.manage_service(remote, service_name[0], action) self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( - nagios_driver, {service_name[1]: new_state}) + nagios_driver, {service_name[2]: new_state}) self.INFLUXDB_GRAFANA.check_cluster_status( service_name[1], service_state_in_influx) - self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends( - service_name[0], expected_count=down_backends_in_haproxy) + if service_name[3]: + self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends( + service_name[3], expected_count=down_backends_in_haproxy) with self.helpers.fuel_web.get_ssh_for_nailgun_node( toolchain_node) as remote: self.checkers.check_local_mail( - remote, toolchain_node["name"], service_name[1], new_state) + remote, toolchain_node["name"], service_name[2], new_state) def change_verify_node_service_state(self, services, state, influx_state, percent, toolchain_node, @@ -462,11 +465,11 @@ class ToolchainApi(object): "/var/lib/mysql/test/bigfile") self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( - nagios_driver, {services[0]: 'OK'}) + nagios_driver, {services[0]: "OK"}) self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( nagios_driver, {services[1]: state}, - [controller_nodes[0]['hostname']]) - self.INFLUXDB_GRAFANA.check_cluster_status(services[0], + [controller_nodes[0]["hostname"]]) + self.INFLUXDB_GRAFANA.check_cluster_status(services[2], self.settings.OKAY) with self.fuel_web.get_ssh_for_nailgun_node( @@ -479,8 +482,8 @@ class ToolchainApi(object): self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( nagios_driver, {services[0]: state}) self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( - nagios_driver, {services[1]: state}, [node['hostname']]) - self.INFLUXDB_GRAFANA.check_cluster_status(services[0], influx_state) + nagios_driver, {services[1]: state}, [node["hostname"]]) + self.INFLUXDB_GRAFANA.check_cluster_status(services[2], influx_state) with self.helpers.fuel_web.get_ssh_for_nailgun_node( toolchain_node) as remote: @@ -494,13 +497,13 @@ class ToolchainApi(object): for node in controller_nodes: self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( - nagios_driver, {services[0]: 'OK'}) + nagios_driver, {services[0]: "OK"}) self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( - nagios_driver, {services[1]: 'OK'}, [node['hostname']]) - self.INFLUXDB_GRAFANA.check_cluster_status(services[0], + nagios_driver, {services[1]: "OK"}, [node["hostname"]]) + self.INFLUXDB_GRAFANA.check_cluster_status(services[2], self.settings.OKAY) with self.helpers.fuel_web.get_ssh_for_nailgun_node( toolchain_node) as remote: self.checkers.check_local_mail( - remote, toolchain_node["name"], services[0], 'OK') + remote, toolchain_node["name"], services[0], "OK") diff --git a/stacklight_tests/toolchain/test_functional.py b/stacklight_tests/toolchain/test_functional.py index 83c2432..3204818 100644 --- a/stacklight_tests/toolchain/test_functional.py +++ b/stacklight_tests/toolchain/test_functional.py @@ -236,6 +236,78 @@ class TestFunctionalToolchain(api.ToolchainApi): self.check_cinder_notifications() + def _check_services_alerts(self, controllers_count, nagios_status, + influx_status, down_haproxy_count): + components = { + "nova": [("nova-api", "nova-api"), ("nova-scheduler", None)], + "cinder": [("cinder-api", "cinder-api"), + ("cinder-scheduler", None)], + "neutron": [ + ("neutron-server", "neutron-api"), + # TODO(rpromyshlennikov): temporary fix, + # because openvswitch-agent is managed by pacemaker + # ("neutron-openvswitch-agent", None) + ], + "glance": [("glance-api", "glance-api")], + "heat": [("heat-api", "heat-api")], + "keystone": [("apache2", "keystone-public-api")] + } + + alerting_plugin = self.LMA_INFRASTRUCTURE_ALERTING + services_names_in_nagios = {} + for service in components: + nagios_service_name = ( + service + if alerting_plugin.settings.version.startswith("0.") + else "global-{}".format(service) + ) + services_names_in_nagios[service] = nagios_service_name + + lma_devops_node = self.helpers.get_node_with_vip( + self.settings.stacklight_roles, + self.helpers.get_vip_resource_name( + alerting_plugin.settings.failover_vip)) + toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node( + lma_devops_node) + + url = alerting_plugin.get_authenticated_nagios_url() + with self.ui_tester.ui_driver(url, "Nagios Core", + "//frame[2]") as driver: + alerting_plugin.open_nagios_page( + driver, "Services", "//table[@class='headertable']") + controller_nodes = ( + self.fuel_web.get_nailgun_cluster_nodes_by_roles( + self.helpers.cluster_id, + ["controller"])[:controllers_count] + ) + for component in components: + for (service, haproxy_backend) in components[component]: + logger.info("Checking service {0}".format(service)) + self.change_verify_service_state( + service_name=[ + service, component, + services_names_in_nagios[component], + haproxy_backend], + action="stop", + new_state=nagios_status, + service_state_in_influx=influx_status, + down_backends_in_haproxy=down_haproxy_count, + toolchain_node=toolchain_node, + controller_nodes=controller_nodes, + nagios_driver=driver) + self.change_verify_service_state( + service_name=[ + service, component, + services_names_in_nagios[component], + haproxy_backend], + action="start", + new_state="OK", + service_state_in_influx=self.settings.OKAY, + down_backends_in_haproxy=0, + toolchain_node=toolchain_node, + controller_nodes=controller_nodes, + nagios_driver=driver) + @test(depends_on_groups=["deploy_ha_toolchain"], groups=["toolchain_warning_alert_service", "service_restart", "toolchain", "functional"]) @@ -282,54 +354,17 @@ class TestFunctionalToolchain(api.ToolchainApi): Duration 45m """ self.env.revert_snapshot("deploy_ha_toolchain") + params = {"controllers_count": 1, + "nagios_status": "WARNING", + "influx_status": self.settings.WARN, + "down_haproxy_count": 1} - services = { - 'nova': ['nova-api', 'nova-scheduler'], - 'cinder': ['cinder-api', 'cinder-scheduler'], - 'neutron': ['neutron-server', 'neutron-openvswitch-agent'], - 'glance': ['glance-api'], - 'heat': ['heat-api'], - 'keystone': ['apache2'] - } - - lma_devops_node = self.helpers.get_node_with_vip( - self.settings.stacklight_roles, - self.helpers.full_vip_name( - self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip)) - toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node( - lma_devops_node) - - url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url() - with self.ui_tester.ui_driver(url, "Nagios Core", - "//frame[2]") as driver: - self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") - controller_node = ( - self.fuel_web.get_nailgun_cluster_nodes_by_roles( - self.helpers.cluster_id, ['controller'])[0]) - for key in services: - for service in services[key]: - self.change_verify_service_state( - service_name=[service, key], action='stop', - new_state='WARNING', - service_state_in_influx=self.settings.WARN, - down_backends_in_haproxy=1, - toolchain_node=toolchain_node, - controller_nodes=[controller_node], - nagios_driver=driver) - self.change_verify_service_state( - service_name=[service, key], action='start', - new_state='OK', - service_state_in_influx=self.settings.OKAY, - down_backends_in_haproxy=0, - toolchain_node=toolchain_node, - controller_nodes=[controller_node], - nagios_driver=driver) + self._check_services_alerts(**params) @test(depends_on_groups=["deploy_ha_toolchain"], groups=["toolchain_critical_alert_service", "service_restart", "toolchain", "functional"]) - # @log_snapshot_after_test + @log_snapshot_after_test def toolchain_critical_alert_service(self): """Verify that the critical alerts for services show up in the Grafana and Nagios UI. @@ -367,52 +402,39 @@ class TestFunctionalToolchain(api.ToolchainApi): Duration 45m """ self.env.revert_snapshot("deploy_ha_toolchain") + params = {"controllers_count": 2, + "nagios_status": "CRITICAL", + "influx_status": self.settings.CRIT, + "down_haproxy_count": 2} - services = { - 'nova': ['nova-api', 'nova-scheduler'], - 'cinder': ['cinder-api', 'cinder-scheduler'], - 'neutron': ['neutron-server', 'neutron-openvswitch-agent'], - 'glance': ['glance-api'], - 'heat': ['heat-api'], - 'keystone': ['apache2'] - } + self._check_services_alerts(**params) + def _check_mysql_alerts_node( + self, nagios_status, influx_status, disk_usage_percent): lma_devops_node = self.helpers.get_node_with_vip( self.settings.stacklight_roles, - self.helpers.full_vip_name( - self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip)) + self.helpers.get_vip_resource_name( + "infrastructure_alerting_mgmt_vip")) toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node( lma_devops_node) + nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles( + self.helpers.cluster_id, ["controller"]) - url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url() + alerting_plugin = self.LMA_INFRASTRUCTURE_ALERTING + url = alerting_plugin.get_authenticated_nagios_url() with self.ui_tester.ui_driver(url, "Nagios Core", "//frame[2]") as driver: - self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") - controller_nodes = ( - self.fuel_web.get_nailgun_cluster_nodes_by_roles( - self.helpers.cluster_id, ['controller'])) - for key in services: - for service in services[key]: - logger.info("Checking service {0}".format(service)) - self.change_verify_service_state( - service_name=[service, key], action='stop', - new_state='CRITICAL', - service_state_in_influx=self.settings.CRIT, - down_backends_in_haproxy=2, - toolchain_node=toolchain_node, - controller_nodes=[controller_nodes[0], - controller_nodes[1]], - nagios_driver=driver) - self.change_verify_service_state( - service_name=[service, key], action='start', - new_state='OK', - service_state_in_influx=self.settings.OKAY, - down_backends_in_haproxy=0, - toolchain_node=toolchain_node, - controller_nodes=[controller_nodes[0], - controller_nodes[1]], - nagios_driver=driver) + alerting_plugin.open_nagios_page( + driver, "Services", "//table[@class='headertable']") + nagios_service_name = ( + "mysql" + if alerting_plugin.settings.version.startswith("0.") + else "global-mysql") + self.change_verify_node_service_state( + [nagios_service_name, "mysql-nodes.mysql-fs", "mysql"], + nagios_status, + influx_status, disk_usage_percent, toolchain_node, + nailgun_nodes[:2], driver) @test(depends_on_groups=["deploy_ha_toolchain"], groups=["toolchain_warning_alert_node", "node_alert_warning", @@ -473,24 +495,11 @@ class TestFunctionalToolchain(api.ToolchainApi): Duration 15m """ self.env.revert_snapshot("deploy_ha_toolchain") - - lma_devops_node = self.helpers.get_node_with_vip( - self.settings.stacklight_roles, - self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip")) - toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node( - lma_devops_node) - nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles( - self.helpers.cluster_id, ['controller']) - - url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url() - with self.ui_tester.ui_driver(url, "Nagios Core", - "//frame[2]") as driver: - self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") - self.change_verify_node_service_state( - ['mysql', 'mysql-nodes.mysql-fs'], 'WARNING', - self.settings.WARN, '96', toolchain_node, - [nailgun_nodes[0], nailgun_nodes[1]], driver) + params = { + "nagios_status": "WARNING", + "influx_status": self.settings.WARN, + "disk_usage_percent": 91} + self._check_mysql_alerts_node(**params) @test(depends_on_groups=["deploy_ha_toolchain"], groups=["toolchain_critical_alert_node", "node_alert_critical", @@ -550,21 +559,8 @@ class TestFunctionalToolchain(api.ToolchainApi): Duration 15m """ self.env.revert_snapshot("deploy_ha_toolchain") - - lma_devops_node = self.helpers.get_node_with_vip( - self.settings.stacklight_roles, - self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip")) - toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node( - lma_devops_node) - nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles( - self.helpers.cluster_id, ['controller']) - - url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url() - with self.ui_tester.ui_driver(url, "Nagios Core", - "//frame[2]") as driver: - self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page( - driver, 'Services', "//table[@class='headertable']") - self.change_verify_node_service_state( - ['mysql', 'mysql-nodes.mysql-fs'], 'CRITICAL', - self.settings.UNKW, '98', toolchain_node, - [nailgun_nodes[0], nailgun_nodes[1]], driver) + params = { + "nagios_status": "CRITICAL", + "influx_status": self.settings.CRIT, + "disk_usage_percent": 96} + self._check_mysql_alerts_node(**params)