From 92816644f7ec23098c12808737d6ee5315e0a712 Mon Sep 17 00:00:00 2001 From: Alexandr Nevenchannyy Date: Tue, 7 Jun 2016 18:49:14 +0300 Subject: [PATCH] OpenStack reliability test plan This document describes an abstract methodology for analysing the reliability of a high-availability OpenStack cluster and its components. Co-Authored-By: Bogdan Dobrelia Change-Id: I5a08c1a39bab96d90c6f7a873fdc771516ffba48 --- doc/source/test_plans/index.rst | 2 +- doc/source/test_plans/reliability/plan.rst | 413 ++++++++++++++++++ .../test_plans/reliability/template.rst | 111 +++++ tests/test_titles.py | 16 +- 4 files changed, 538 insertions(+), 4 deletions(-) create mode 100644 doc/source/test_plans/reliability/plan.rst create mode 100644 doc/source/test_plans/reliability/template.rst diff --git a/doc/source/test_plans/index.rst b/doc/source/test_plans/index.rst index 6d9c3a9..9b31196 100644 --- a/doc/source/test_plans/index.rst +++ b/doc/source/test_plans/index.rst @@ -20,4 +20,4 @@ Test Plans neutron_features/l3_ha/test_plan hardware_features/index 1000_nodes/plan - + reliability/plan diff --git a/doc/source/test_plans/reliability/plan.rst b/doc/source/test_plans/reliability/plan.rst new file mode 100644 index 0000000..8c4bd02 --- /dev/null +++ b/doc/source/test_plans/reliability/plan.rst @@ -0,0 +1,413 @@ +.. _reliability_testing: + +============================= +OpenStack reliability testing +============================= + +:status: draft +:version: 0 + +:Abstract: + This document describes an abstract methodology for OpenStack cluster + high-availability testing and analysis. OpenStack data plane testing + at this moment is out of scope, but will be described in future. + +:Conventions: + +- **OpenStack cluster:** consists of server nodes with deployed and fully + operational OpenStack environment in high-availability configuration.
+ +- **Fault-injection operation:** represents common types of failures which can + occur in production environment: service-hang, service-crash, + network-partition, network-flapping, and node-crash. + +- **Service-hang:** faults are injected into specified OpenStack service by + sending -SIGSTOP and -SIGCONT POSIX signals. + +- **Service-crash:** faults are injected by sending -SIGKILL signal into + specified OpenStack service. + +- **Node-crash:** faults are injected to an OpenStack cluster by rebooting + or shutting down a server node. + +- **Network-partition:** faults are injected by inserting iptables rules to + OpenStack cluster nodes to a corresponding service that should be + network-partitioned. + +- **Network-flapping:** faults are injected into OpenStack cluster nodes by + inserting/deleting iptables rules on the fly which will affect + corresponding service that should be tested. + +- **Factor:** consists of a set of atomic fault-injection operations. For + example: reboot-random-controller, reboot-random-rabbitmq. + +- **Test plan:** contains two elements: test scenario + execution graph and fault-injection factors. + +- **SLA**: Service-level agreement + +- **Testing-cycles**: number of test cycles of each factor + +- **Inf**: assumes infinite time for auto-healing of the cluster + after fault-factor injection. + + +Test Plan +========= + +Test Environment +---------------- + +This section should contain all information about deployed OpenStack +environment including archive with all information in the ``/etc`` folder from +all nodes. + +Preparation +^^^^^^^^^^^ + +This section should contain all steps to reproduce OpenStack environment +deployment and client node. For example: if testing environment is deployed +with DevStack, this section should contain all DevStack configuration files, +DevStack version and all deployment steps.
+ +Environment description +^^^^^^^^^^^^^^^^^^^^^^^ + +This section should contain all cluster hardware information, including +processor model and its frequency, memory size, storage type and its capacity, +network interfaces, and others. +A separate client node must be used to drive the tests. + +Hardware +~~~~~~~~ + +This section should contain a full hardware nodes specification. + +.. table:: Description of server hardware + + +--------+----------------+-------+-------+ + |SERVER |name | | | + | +----------------+-------+-------+ + | |role | | | + | +----------------+-------+-------+ + | |vendor,model | | | + | +----------------+-------+-------+ + | |operating_system| | | + +--------+----------------+-------+-------+ + |CPU |vendor,model | | | + | +----------------+-------+-------+ + | |processor_count | | | + | +----------------+-------+-------+ + | |core_count | | | + | +----------------+-------+-------+ + | |frequency_MHz | | | + +--------+----------------+-------+-------+ + |RAM |vendor,model | | | + | +----------------+-------+-------+ + | |amount_MB | | | + +--------+----------------+-------+-------+ + |NETWORK |interface_name | | | + | +----------------+-------+-------+ + | |vendor,model | | | + | +----------------+-------+-------+ + | |bandwidth | | | + +--------+----------------+-------+-------+ + |STORAGE |dev_name | | | + | +----------------+-------+-------+ + | |vendor,model | | | + | +----------------+-------+-------+ + | |SSD/HDD | | | + | +----------------+-------+-------+ + | |size | | | + +--------+----------------+-------+-------+ + +Networking +~~~~~~~~~~ + +This section should contain a full description of network equipment used in +OpenStack cluster. Network topology diagram and network hardware +configuration files should be included in this section. + +Factors description +------------------- + + Please describe here the factors used during test runs.
+ Examples are: + + - **reboot-random-controller:** consists of node-crash fault injection on a random + OpenStack controller node. + + - **reboot-random-rabbitmq:** consists of node-crash fault injection on the master + RabbitMQ messaging node. + + - **sigstop-random-nova-api:** consists of service-hang fault injection on a random + nova-api service. + + - **sigkill-random-mysql:** consists of service-crash fault injection on a + random MySQL node. + + - **network-partition-random-mysql:** consists of network-partition fault injection on a + random MySQL node. + + +Test Case 1: NovaServers.boot_and_delete_server +----------------------------------------------- + +Description +^^^^^^^^^^^ + +This Rally scenario boots and deletes virtual instances with injected fault +factors through OpenStack Nova API. + +Service-level agreement +^^^^^^^^^^^^^^^^^^^^^^^ + +In this section, specify SLA values. For example: + +=================== ======== +Parameter Value +=================== ======== +MTTR (sec) <=240 +Failure rate (%) <=95 +Auto-healing Yes +=================== ======== + +Parameters +^^^^^^^^^^ + +In this section, specify load parameters during the test. For example: + +=================== ======== +Parameter Value +=================== ======== +Runner constant +Concurrency X +Times Y +Injection-iteration Z +Testing-cycles N +=================== ======== + +List of reliability metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +======== ============== ================= ================================================= +Priority Value Measurement Units Description +======== ============== ================= ================================================= +1 SLA Boolean Service-level agreement result +2 Auto-healing Boolean Is cluster auto-healed after fault-injection +3 Failure rate Percents Test iteration failure ratio +4 MTTR (auto) Seconds Automatic mean time to repair +5 MTTR (manual) Seconds Manual mean time to repair, if Auto MTTR is Inf.
+======== ============== ================= ================================================= + +Results +^^^^^^^ + +reboot-random-controller +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. table:: **Full description of cyclic execution results** + + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | Cycles | MTTR(sec) | Failure rate(%) | Auto-healing | Performance degradation | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 1 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 2 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 3 | X | Y | No | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 4 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 5 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + +Place here link to rally report file with results of testing this factor. + +.. table:: **Testing results summary** + + +--------------------+------------+------------------+ + | Value | MTTR | Failure rate | + +--------------------+------------+------------------+ + | Min | X | Y | + +--------------------+------------+------------------+ + | Max | X | Y | + +--------------------+------------+------------------+ + | SLA | X | Y | + +--------------------+------------+------------------+ + +Detailed results description +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section, specify detailed description of test results, +including factor impact. + +reboot-random-rabbitmq +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
table:: **Full description of cyclic execution results** + + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | Cycles | MTTR(sec) | Failure rate(%) | Auto-healing | Performance degradation | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 1 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 2 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 3 | X | Y | No | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 4 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 5 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + +Place here link to rally report file with results of testing this factor. + +.. table:: **Testing results summary** + + +--------------------+------------+------------------+ + | Value | MTTR | Failure rate | + +--------------------+------------+------------------+ + | Min | X | Y | + +--------------------+------------+------------------+ + | Max | X | Y | + +--------------------+------------+------------------+ + | SLA | X | Y | + +--------------------+------------+------------------+ + +Detailed results description +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section, specify detailed description of test results, +including factor impact. + + +Test Case 2: GlanceImages.create_and_delete_image +------------------------------------------------- + +Description +^^^^^^^^^^^ + +This Rally scenario creates and deletes images with injected fault +factors through OpenStack Glance API. 
+ +Service-level agreement +^^^^^^^^^^^^^^^^^^^^^^^ + +In this section, specify SLA values. For example: + +=================== ======== +Parameter Value +=================== ======== +MTTR (sec) <=120 +Failure rate (%) <=95 +Auto-healing Yes +=================== ======== + +Parameters +^^^^^^^^^^ +In this section, specify load parameters during the test. For example: + +=================== ======== +Parameter Value +=================== ======== +Runner constant +Concurrency X +Times Y +Injection-iteration Z +Testing-cycles N +=================== ======== + +List of reliability metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +======== ============== ================= ================================================= +Priority Value Measurement Units Description +======== ============== ================= ================================================= +1 SLA Boolean Service-level agreement result +2 Auto-healing Boolean Is cluster auto-healed after fault-injection +3 Failure rate Percents Test iteration failure ratio +4 MTTR (auto) Seconds Automatic mean time to repair +5 MTTR (manual) Seconds Manual mean time to repair, if Auto MTTR is Inf. +======== ============== ================= ================================================= + +Results +^^^^^^^ + +reboot-random-controller +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
table:: **Full description of cyclic execution results** + + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | Cycles | MTTR(sec) | Failure rate(%) | Auto-healing | Performance degradation | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 1 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 2 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 3 | X | Y | No | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 4 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 5 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + +Place here link to rally report file with results of testing this factor. + +.. table:: **Testing results summary** + + +--------------------+------------+------------------+ + | Value | MTTR | Failure rate | + +--------------------+------------+------------------+ + | Min | X | Y | + +--------------------+------------+------------------+ + | Max | X | Y | + +--------------------+------------+------------------+ + | SLA | X | Y | + +--------------------+------------+------------------+ + +Detailed results description +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section, specify detailed description of test results, +including factor impact. + +reboot-random-rabbitmq +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
table:: **Full description of cyclic execution results** + + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | Cycles | MTTR(sec) | Failure rate(%) | Auto-healing | Performance degradation | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 1 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 2 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 3 | X | Y | No | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 4 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + | 5 | X | Y | Yes | Yes | + +--------------------+----------------+---------------------+------------------+-----------------------------+ + +Place here link to rally report file with results of testing this factor. + +.. table:: **Testing results summary** + + +--------------------+------------+------------------+ + | Value | MTTR | Failure rate | + +--------------------+------------+------------------+ + | Min | X | Y | + +--------------------+------------+------------------+ + | Max | X | Y | + +--------------------+------------+------------------+ + | SLA | X | Y | + +--------------------+------------+------------------+ + +Detailed results description +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section, specify detailed description of test results, +including factor impact. 
diff --git a/doc/source/test_plans/reliability/template.rst b/doc/source/test_plans/reliability/template.rst new file mode 100644 index 0000000..99ef487 --- /dev/null +++ b/doc/source/test_plans/reliability/template.rst @@ -0,0 +1,111 @@ +====================================================== +Example Reliability Test Plan - The title of your plan +====================================================== + +:status: test plan status - either **draft** or **ready** +:version: test plan version + +:Abstract: + + Small description of what will be covered later in the test plan + +If needed, please define list of terms that will be used later in the test +plan: + +:Conventions: + + - **Some specific term #1:** its explanation + + - **Some specific term #2:** its explanation + + - ... + + - **Some specific term #n:** its explanation + +Test Plan +========= + +Define the test plan. Test plan can contain several test cases description +using sections, similar to those written below. + +Test Environment +---------------- + +Preparation +^^^^^^^^^^^ + +Please specify here what needs to be done with the environment to run +this test plan. This can include specific tools installation, +specific OpenStack deployment, etc. + +Environment description +^^^^^^^^^^^^^^^^^^^^^^^ + +Please describe here the environment used. You can use the scheme below for this +purpose or modify it according to your needs: + +* Hardware used (servers, switches, storage, etc.) +* Network scheme +* Software (operating systems, kernel parameters, network interfaces + configuration, disk partitioning configuration). If distributed provisioning + systems are to be tested then the parts that are distributed need to be + described here + +Factors description +------------------- + +Please describe here the factors used in test runs. + +Test Case 1: Something very interesting #1 +------------------------------------------ + +Description +^^^^^^^^^^^ + +Define test case #1.
Every test case can contain at least the sections, defined +below. + +Parameters +^^^^^^^^^^ + +Optional section. Can be used if there are multiple test cases differing in +some input parameters - if so, these parameters need to be listed here. + +List of reliability metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Mandatory section. Defines what measurements are in fact done during the test. +To be a good citizen in case of multiple metrics collection, it will be nice to +list the metrics ordered starting with the most important one. + +=========================== =============== ================= ============= +Priority Value Measurement Units Description +=========================== =============== ================= ============= +1 - most important What's measured +2 - less important What's measured +3 - not that much important What's measured +=========================== =============== ================= ============= + +Some additional section +^^^^^^^^^^^^^^^^^^^^^^^ + +Depending on the test case nature, something else may need to be defined. +If so, additional sections with free form titles should be added. + +Test Case n: Something very interesting #n +------------------------------------------ + +Define test case #n using the approach above. + +Some additional section +----------------------- + +If there are common details for all test cases, that need to be covered +separately, they can be encapsulated in additional free form sections. + +Upper level additional section +============================== + +If there are additional notes, small pieces of code and configurations, etc., +they can be defined in additional paragraphs. Huge pieces and large chunks of +configs should be stored in separated files. 
diff --git a/tests/test_titles.py b/tests/test_titles.py index 92483b9..61300e7 100644 --- a/tests/test_titles.py +++ b/tests/test_titles.py @@ -188,10 +188,9 @@ class TestTitles(testtools.TestCase): "Found trailing spaces on line %s of %s" % (i + 1, tpl)) def test_template(self): + # Global repository template with open("doc/source/test_plans/template.rst") as f: - template = f.read() - test_plan_tmpl = docutils.core.publish_doctree(template) - template_titles = self._get_titles(test_plan_tmpl) + global_template = f.read() files = glob.glob("doc/source/test_plans/*/plan.rst") files = [os.path.abspath(filename) for filename in files] @@ -201,6 +200,17 @@ class TestTitles(testtools.TestCase): data = f.read() os.chdir(os.path.dirname(filename)) + # Try to use template in directory where plan.rst is located + try: + with open("template.rst") as f: + # use local template + template = f.read() + except Exception: + # use global template + template = global_template + pass + test_plan_tmpl = docutils.core.publish_doctree(template) + template_titles = self._get_titles(test_plan_tmpl) test_plan = docutils.core.publish_doctree(data) self._check_titles(filename, template_titles,