Add basic tools for benchmarking

Adds a horribly written, just hacked together little tool to help provide sizing insight into an ironic deployment's state and underlying performance. Key data: * Queries the list of nodes from a pure python interface level with the database and reports timing for the list of nodes to be returned. This information helps convey how long a periodic task hits the database just for the query. * Requests *all* nodes using the query pattern/structure of the nova resource tracker, and uses the marker to make any additional requests. The data is parsed and collected, and identified vendors, if any, are counted. * Collects basic data on conductors in terms of running, conductor groups as well as currently loaded drivers in the deployment. All of this information provides operational insight into *what* conditions exist within the deployment allowing developers to try and identify solutions based on the unique circumstances of larger deployments. Also adds a utility to generate and semi-randomize data to allow us to create a benchmark job in CI. Change-Id: Iae660aea82db8f1c4567ee2982595ccfdf434fe3
2021-04-27 10:22:42 -07:00 · 2021-04-27 10:22:42 -07:00 · ffff76a682
commit ffff76a682
parent 97ceb7bd15
3 changed files with 307 additions and 0 deletions
--- a/tools/benchmark/README
+++ b/tools/benchmark/README
@ -0,0 +1,13 @@
+This folder contains two files:
+
+* do_not_run_create_benchmark_data.py - This script will destroy your
+  ironic database. DO NOT RUN IT. You have been warned!
+  It is intended to generate a semi-random database of node data
+  which can be used for benchmarks, instead of crafting a raw SQL file
+  representing a test model.
+
+* generate-statistics.py - This is a utility that generates some statistics
+  to both aid in basic benchmarking of ironic operations *and* provide
+  developers with conceptual information regarding a deployment's size. It
+  operates only by reading the data present and timing how long the results
+  take to return, as well as isolating some key details about the deployment.
--- a/tools/benchmark/do_not_run_create_benchmark_data.py
+++ b/tools/benchmark/do_not_run_create_benchmark_data.py
@ -0,0 +1,99 @@
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import sys
+import time
+
+from oslo_db.sqlalchemy import enginefacade
+from sqlalchemy import sql
+
+from ironic.common import service
+from ironic.conf import CONF  # noqa To Load Configuration
+from ironic.objects import node
+
+
+def _create_test_nodes():
+    print("Starting creation of fake nodes.")
+    start = time.time()
+    node_count = 10000
+    checkin = time.time()
+    for i in range(0, node_count):
+
+        new_node = node.Node({
+            'power_state': 'power off',
+            'driver': 'ipmi',
+            'driver_internal_info': {'test-meow': i},
+            'name': 'BenchmarkTestNode-%s' % i,
+            'driver_info': {
+                'ipmi_username': 'admin',
+                'ipmi_password': 'admin',
+                'ipmi_address': 'testhost%s.env.top.level.domain' % i},
+            'resource_class': 'CUSTOM_BAREMETAL',
+            'properties': {
+                'cpu': 4,
+                'memory': 32,
+                'cats': i,
+                'meowing': True}})
+        new_node.create()
+        delta = time.time() - checkin
+        if delta > 10:
+            checkin = time.time()
+            print('* At %s nodes, %0.02f seconds. Total elapsed: %s'
+                  % (i, delta, time.time() - start))
+    created = time.time()
+    elapse = created - start
+    print('Created %s nodes in %s seconds.\n' % (node_count, elapse))
+
+
+def _mix_up_nodes_data():
+    engine = enginefacade.writer.get_engine()
+    conn = engine.connect()
+
+    # A list of commands to mix up indexed field data a bit to emulate what
+    # a production database may somewhat look like.
+    commands = [
+        "UPDATE nodes set maintenance = True where RAND() < 0.1",  # noqa Easier to read this way
+        "UPDATE nodes set driver = 'redfish' where RAND() < 0.5",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor01' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor02' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor03' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor04' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor05' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set reservation = 'fake_conductor06' where RAND() < 0.02",  # noqa Easier to read this way
+        "UPDATE nodes set provision_state = 'active' where RAND() < 0.8",  # noqa Easier to read this way
+        "UPDATE nodes set power_state = 'power on' where provision_state = 'active' and RAND() < 0.95",  # noqa Easier to read this way
+        "UPDATE nodes set provision_state = 'available' where RAND() < 0.1",  # noqa Easier to read this way
+        "UPDATE nodes set provision_state = 'manageable' where RAND() < 0.1",  # noqa Easier to read this way
+        "UPDATE nodes set provision_state = 'clean wait' where RAND() < 0.05",  # noqa Easier to read this way
+        "UPDATE nodes set provision_state = 'error' where RAND() < 0.05",  # noqa Easier to read this way
+        "UPDATE nodes set owner = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
+        "UPDATE nodes set lessee = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
+        "UPDATE nodes set instance_uuid = (select UUID()) where RAND() < 0.95 and provision_state = 'active'",  # noqa Easier to read this way
+        "UPDATE nodes set last_error = (select UUID()) where RAND() <0.05",  # noqa Easier to read this way
+    ]
+    start = time.time()
+    for command in commands:
+        print("Executing SQL command: \\" + command + ";\n")
+        conn.execute(sql.text(command))
+        print("* Completed command. %0.04f elapsed since start of commands."
+              % (time.time() - start))
+
+
+def main():
+    """Entry point: prepare the ironic service and create the fake nodes.
+
+    Only node creation is performed here; _mix_up_nodes_data() is defined
+    in this file but is not invoked by this entry point.
+    """
+    # NOTE(review): presumably performs the standard ironic service setup
+    # (configuration parsing, logging) - confirm against ironic.common.service.
+    service.prepare_service()
+    # Force the 'debug' option off for the duration of the run.
+    CONF.set_override('debug', False)
+    _create_test_nodes()
+
+
+if __name__ == '__main__':
+    # main() returns None, so a run that does not raise exits with status 0.
+    sys.exit(main())
--- a/tools/benchmark/generate-statistics.py
+++ b/tools/benchmark/generate-statistics.py
@ -0,0 +1,195 @@
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import datetime
+import sys
+import time
+from unittest import mock
+
+from ironic_lib import metrics_utils
+import oslo_policy
+from oslo_utils import timeutils
+
+from ironic.api.controllers.v1 import node as node_api
+from ironic.api.controllers.v1 import utils as api_utils
+from ironic.common import context
+from ironic.common import service
+from ironic.conf import CONF  # noqa To Load Configuration
+from ironic.db import api as db_api
+from ironic.objects import conductor
+from ironic.objects import node
+
+
+def _calculate_delta(start, finish):
+    return finish - start
+
+
+def _add_a_line():
+    print('------------------------------------------------------------')
+
+
+def _assess_db_performance():
+    start = time.time()
+    dbapi = db_api.get_instance()
+    print('Phase - Assess DB performance')
+    _add_a_line()
+    got_connection = time.time()
+    nodes = dbapi.get_node_list()
+    node_count = len(nodes)
+    query_complete = time.time()
+    delta = _calculate_delta(start, got_connection)
+    print('Obtained DB client in %s seconds.' % delta)
+    delta = _calculate_delta(got_connection, query_complete)
+    print('Returned %s nodes in python %s seconds from the DB.\n' %
+          (node_count, delta))
+    # return node count for future use.
+    return node_count
+
+
+def _assess_db_and_object_performance():
+    print('Phase - Assess DB & Object conversion Performance')
+    _add_a_line()
+    start = time.time()
+    node_list = node.Node().list(context.get_admin_context())
+    got_list = time.time()
+    delta = _calculate_delta(start, got_list)
+    print('Obtained list of node objects in %s seconds.' % delta)
+    count = 0
+    tbl_size = 0
+    # In a sense, this helps provide a relative understanding if the
+    # database is the bottleneck, or the objects post conversion.
+    # converting completely to json and then measuring the size helps
+    # ensure that everything is "assessed" while not revealing too
+    # much detail.
+    for node_obj in node_list:
+        # Just looping through the entire set to count should be
+        # enough to ensure that the entry is loaded from the db
+        # and then converted to an object.
+        tbl_size = tbl_size + sys.getsizeof(node_obj.as_dict(secure=True))
+        count = count + 1
+    delta = _calculate_delta(got_list, time.time())
+    print('Took %s seconds to iterate through %s node objects.' %
+          (delta, count))
+    print('Nodes table is roughly %s bytes of JSON.\n' % tbl_size)
+    observed_vendors = []
+    for node_obj in node_list:
+        vendor = node_obj.driver_internal_info.get('vendor')
+        if vendor:
+            observed_vendors.append(vendor)
+
+
+@mock.patch('ironic.api.request')  # noqa patch needed for the object model
+@mock.patch.object(metrics_utils, 'get_metrics_logger', lambda *_: mock.Mock)
+@mock.patch.object(api_utils, 'check_list_policy', lambda *_: None)
+@mock.patch.object(api_utils, 'check_allow_specify_fields', lambda *_: None)
+@mock.patch.object(api_utils, 'check_allowed_fields', lambda *_: None)
+@mock.patch.object(oslo_policy.policy, 'LOG', autospec=True)
+def _assess_db_object_and_api_performance(mock_log, mock_request):
+    print('Phase - Assess DB & Object conversion Performance')
+    _add_a_line()
+    # Just mock it to silence it since getting the logger to update
+    # config seems like not a thing once started. :\
+    mock_log.debug = mock.Mock()
+    # Internal logic requires major/minor versions and a context to
+    # proceed. This is just to make the NodesController respond properly.
+    mock_request.context = context.get_admin_context()
+    mock_request.version.major = 1
+    mock_request.version.minor = 71
+
+    start = time.time()
+    node_api_controller = node_api.NodesController()
+    node_api_controller.context = context.get_admin_context()
+    fields = ("uuid,power_state,target_power_state,provision_state,"
+              "target_provision_state,last_error,maintenance,properties,"
+              "instance_uuid,traits,resource_class")
+
+    total_nodes = 0
+
+    res = node_api_controller._get_nodes_collection(
+        chassis_uuid=None,
+        instance_uuid=None,
+        associated=None,
+        maintenance=None,
+        retired=None,
+        provision_state=None,
+        marker=None,
+        limit=None,
+        sort_key="id",
+        sort_dir="asc",
+        fields=fields.split(','))
+    total_nodes = len(res['nodes'])
+    while len(res['nodes']) != 1:
+        print(" ** Getting nodes ** %s Elapsed: %s seconds." %
+              (total_nodes, _calculate_delta(start, time.time())))
+        res = node_api_controller._get_nodes_collection(
+            chassis_uuid=None,
+            instance_uuid=None,
+            associated=None,
+            maintenance=None,
+            retired=None,
+            provision_state=None,
+            marker=res['nodes'][-1]['uuid'],
+            limit=None,
+            sort_key="id",
+            sort_dir="asc",
+            fields=fields.split(','))
+        new_nodes = len(res['nodes'])
+        if new_nodes == 0:
+            break
+        total_nodes = total_nodes + new_nodes
+
+    delta = _calculate_delta(start, time.time())
+    print('Took %s seconds to return all %s nodes via '
+          'nodes API call pattern.\n' % (delta, total_nodes))
+
+
+def _report_conductors():
+    print('Phase - identifying conductors/drivers')
+    _add_a_line()
+    conductors = conductor.Conductor().list(
+        context.get_admin_context(),
+    )
+    drivers = []
+    groups = []
+    online_count = 0
+    online_by = timeutils.utcnow(with_timezone=True) - \
+        datetime.timedelta(seconds=90)
+    for conductor_obj in conductors:
+        if conductor_obj.conductor_group:
+            groups.append(conductor_obj.conductor_group)
+        if conductor_obj.updated_at > online_by:
+            online_count = online_count + 1
+            for driver in conductor_obj.drivers:
+                drivers.append(driver)
+    conductor_count = len(conductors)
+    print('Conductor count: %s' % conductor_count)
+    print('Online conductor count: %s' % online_count)
+    running_with_groups = len(groups)
+    print('Conductors with conductor_groups: %s' % running_with_groups)
+    group_count = len(set(groups))
+    print('Conductor group count: %s' % group_count)
+    driver_list = list(set(drivers))
+    print('Presently supported drivers: %s' % driver_list)
+
+
+def main():
+    """Entry point: run every reporting phase, in order, and print results."""
+    # NOTE(review): presumably performs the standard ironic service setup
+    # (configuration parsing, logging) - confirm against ironic.common.service.
+    service.prepare_service()
+    # Force the 'debug' option off for the duration of the run.
+    CONF.set_override('debug', False)
+    # The node count returned by this phase is not used here.
+    _assess_db_performance()
+    _assess_db_and_object_performance()
+    _assess_db_object_and_api_performance()
+    _report_conductors()
+
+
+if __name__ == '__main__':
+    # main() returns None, so a run that does not raise exits with status 0.
+    sys.exit(main())