Add basic tools for benchmarking

Adds a rough, hacked-together little tool to help provide sizing
insight into an ironic deployment's state and underlying performance.

Key data:
* Queries the list of nodes from the pure Python interface level with
  the database and reports the timing for the list of nodes to be
  returned. This helps convey how long a periodic task hits the
  database just for that query.
* Requests *all* nodes using the query pattern/structure of the nova
  resource tracker, and uses the marker to make any additional
  requests (see the paging sketch after this list). The data is
  parsed and collected, and any identified vendors are counted.
* Collects basic data on conductors: how many are running, which
  conductor groups exist, and which drivers are currently loaded in
  the deployment.
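
For reference, below is a minimal, self-contained sketch of the
marker-based paging pattern mentioned in the second item above;
page_all_nodes and _fake_fetch are illustrative stand-ins for this
tool's use of the nodes API controller, not part of this change:

    def page_all_nodes(fetch_page, page_size=1000):
        # Follow markers, resource-tracker style, until a page comes
        # back empty.
        seen = []
        marker = None
        while True:
            page = fetch_page(marker=marker, limit=page_size)
            if not page:
                break
            seen.extend(page)
            marker = page[-1]['uuid']
        return seen

    def _fake_fetch(marker=None, limit=1000):
        # Stand-in backend serving 2500 fake node records.
        nodes = [{'uuid': 'node-%04d' % i} for i in range(2500)]
        start = 0
        if marker is not None:
            start = next(i for i, n in enumerate(nodes)
                         if n['uuid'] == marker) + 1
        return nodes[start:start + limit]

    print(len(page_all_nodes(_fake_fetch)))  # -> 2500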

All of this information provides operational insight into *what*
conditions exist within the deployment, allowing developers to
identify solutions based on the unique circumstances of larger
deployments.

Also adds a utility to generate and semi-randomize data to allow us to
create a benchmark job in CI.

Change-Id: Iae660aea82db8f1c4567ee2982595ccfdf434fe3
Julia Kreger 2021-04-27 10:22:42 -07:00
parent 97ceb7bd15
commit ffff76a682
3 changed files with 307 additions and 0 deletions

tools/benchmark/README

@@ -0,0 +1,13 @@
This folder contains two files:
* do_not_run_create_benchmark_data.py - This script will destroy your
ironic database. DO NOT RUN IT. You have been warned!
It is intended to generate a semi-random database of node data
which can be used for benchmarks, instead of crafting a raw SQL file
representing a test model.
* generate-statistics.py - This is a utility that gathers some
statistics to both aid in basic benchmarking of ironic operations
*and* provide developers with conceptual information regarding a
deployment's size. It operates only by reading the data present and
timing how long the results take to return, as well as isolating some
key details about the deployment. A rough invocation sketch follows
below.
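
An illustrative way to invoke the statistics tool (the --config-file
flag is the standard oslo.config option consumed via
service.prepare_service(); the wrapper and path below are examples
only, not part of these scripts):

    # Illustrative only: run the read-only statistics tool against an
    # existing deployment's configuration.
    import subprocess

    subprocess.run(
        ['python3', 'tools/benchmark/generate-statistics.py',
         '--config-file', '/etc/ironic/ironic.conf'],
        check=True)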

tools/benchmark/do_not_run_create_benchmark_data.py

@@ -0,0 +1,99 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import sys
import time

from oslo_db.sqlalchemy import enginefacade
from sqlalchemy import sql

from ironic.common import service
from ironic.conf import CONF  # noqa To Load Configuration
from ironic.objects import node


def _create_test_nodes():
print("Starting creation of fake nodes.")
start = time.time()
node_count = 10000
checkin = time.time()
for i in range(0, node_count):
new_node = node.Node({
'power_state': 'power off',
'driver': 'ipmi',
'driver_internal_info': {'test-meow': i},
'name': 'BenchmarkTestNode-%s' % i,
'driver_info': {
'ipmi_username': 'admin',
'ipmi_password': 'admin',
'ipmi_address': 'testhost%s.env.top.level.domain' % i},
'resource_class': 'CUSTOM_BAREMETAL',
'properties': {
'cpu': 4,
'memory': 32,
'cats': i,
'meowing': True}})
new_node.create()
delta = time.time() - checkin
if delta > 10:
checkin = time.time()
print('* At %s nodes, %0.02f seconds. Total elapsed: %s'
% (i, delta, time.time() - start))
created = time.time()
elapse = created - start
print('Created %s nodes in %s seconds.\n' % (node_count, elapse))


def _mix_up_nodes_data():
    engine = enginefacade.writer.get_engine()
    conn = engine.connect()
    # A list of commands to mix up indexed field data a bit to emulate what
    # a production database may somewhat look like.
    commands = [
        "UPDATE nodes set maintenance = True where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set driver = 'redfish' where RAND() < 0.5",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor01' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor02' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor03' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor04' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor05' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor06' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'active' where RAND() < 0.8",  # noqa Easier to read this way
        "UPDATE nodes set power_state = 'power on' where provision_state = 'active' and RAND() < 0.95",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'available' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'manageable' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'clean wait' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'error' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set owner = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set lessee = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set instance_uuid = (select UUID()) where RAND() < 0.95 and provision_state = 'active'",  # noqa Easier to read this way
        "UPDATE nodes set last_error = (select UUID()) where RAND() < 0.05",  # noqa Easier to read this way
    ]
    start = time.time()
    for command in commands:
        print("Executing SQL command: \\" + command + ";\n")
        conn.execute(sql.text(command))
        print("* Completed command. %0.04f elapsed since start of commands."
              % (time.time() - start))


def main():
    service.prepare_service()
    CONF.set_override('debug', False)
    _create_test_nodes()
    _mix_up_nodes_data()


if __name__ == '__main__':
    sys.exit(main())

tools/benchmark/generate-statistics.py

@@ -0,0 +1,195 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import sys
import time
from unittest import mock

from ironic_lib import metrics_utils
import oslo_policy
from oslo_utils import timeutils

from ironic.api.controllers.v1 import node as node_api
from ironic.api.controllers.v1 import utils as api_utils
from ironic.common import context
from ironic.common import service
from ironic.conf import CONF  # noqa To Load Configuration
from ironic.db import api as db_api
from ironic.objects import conductor
from ironic.objects import node


def _calculate_delta(start, finish):
    return finish - start


def _add_a_line():
    print('------------------------------------------------------------')


def _assess_db_performance():
    start = time.time()
    dbapi = db_api.get_instance()
    print('Phase - Assess DB performance')
    _add_a_line()
    got_connection = time.time()
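    # Time the raw DB-layer query on its own; object conversion is
    # measured separately in the next phase.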
    nodes = dbapi.get_node_list()
    node_count = len(nodes)
    query_complete = time.time()
    delta = _calculate_delta(start, got_connection)
    print('Obtained DB client in %s seconds.' % delta)
    delta = _calculate_delta(got_connection, query_complete)
    print('Returned %s nodes in python %s seconds from the DB.\n' %
          (node_count, delta))
    # return node count for future use.
    return node_count


def _assess_db_and_object_performance():
    print('Phase - Assess DB & Object conversion Performance')
    _add_a_line()
    start = time.time()
    node_list = node.Node().list(context.get_admin_context())
    got_list = time.time()
    delta = _calculate_delta(start, got_list)
    print('Obtained list of node objects in %s seconds.' % delta)
    count = 0
    tbl_size = 0
    # In a sense, this helps provide a relative understanding if the
    # database is the bottleneck, or the objects post conversion.
    # converting completely to json and then measuring the size helps
    # ensure that everything is "assessed" while not revealing too
    # much detail.
    for node_obj in node_list:
        # Just looping through the entire set to count should be
        # enough to ensure that the entry is loaded from the db
        # and then converted to an object.
        tbl_size = tbl_size + sys.getsizeof(node_obj.as_dict(secure=True))
        count = count + 1
    delta = _calculate_delta(got_list, time.time())
    print('Took %s seconds to iterate through %s node objects.' %
          (delta, count))
    print('Nodes table is roughly %s bytes of JSON.\n' % tbl_size)
    observed_vendors = []
    for node_obj in node_list:
        vendor = node_obj.driver_internal_info.get('vendor')
        if vendor:
            observed_vendors.append(vendor)
    print('Observed %s unique hardware vendor(s) among the nodes.\n'
          % len(set(observed_vendors)))


@mock.patch('ironic.api.request')  # noqa patch needed for the object model
@mock.patch.object(metrics_utils, 'get_metrics_logger', lambda *_: mock.Mock)
@mock.patch.object(api_utils, 'check_list_policy', lambda *_: None)
@mock.patch.object(api_utils, 'check_allow_specify_fields', lambda *_: None)
@mock.patch.object(api_utils, 'check_allowed_fields', lambda *_: None)
@mock.patch.object(oslo_policy.policy, 'LOG', autospec=True)
def _assess_db_object_and_api_performance(mock_log, mock_request):
    print('Phase - Assess DB, Object, and API Performance')
    _add_a_line()
    # Just mock it to silence it since getting the logger to update
    # config seems like not a thing once started. :\
    mock_log.debug = mock.Mock()
    # Internal logic requires major/minor versions and a context to
    # proceed. This is just to make the NodesController respond properly.
    mock_request.context = context.get_admin_context()
    mock_request.version.major = 1
    mock_request.version.minor = 71
    start = time.time()
    node_api_controller = node_api.NodesController()
    node_api_controller.context = context.get_admin_context()
    fields = ("uuid,power_state,target_power_state,provision_state,"
              "target_provision_state,last_error,maintenance,properties,"
              "instance_uuid,traits,resource_class")
    total_nodes = 0
    res = node_api_controller._get_nodes_collection(
        chassis_uuid=None,
        instance_uuid=None,
        associated=None,
        maintenance=None,
        retired=None,
        provision_state=None,
        marker=None,
        limit=None,
        sort_key="id",
        sort_dir="asc",
        fields=fields.split(','))
    total_nodes = len(res['nodes'])
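    # Page through the collection the way the nova resource tracker
    # does: feed the last node's uuid back in as the marker until no
    # new nodes are returned.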
    while len(res['nodes']) != 1:
        print(" ** Getting nodes ** %s Elapsed: %s seconds." %
              (total_nodes, _calculate_delta(start, time.time())))
        res = node_api_controller._get_nodes_collection(
            chassis_uuid=None,
            instance_uuid=None,
            associated=None,
            maintenance=None,
            retired=None,
            provision_state=None,
            marker=res['nodes'][-1]['uuid'],
            limit=None,
            sort_key="id",
            sort_dir="asc",
            fields=fields.split(','))
        new_nodes = len(res['nodes'])
        if new_nodes == 0:
            break
        total_nodes = total_nodes + new_nodes
    delta = _calculate_delta(start, time.time())
    print('Took %s seconds to return all %s nodes via '
          'nodes API call pattern.\n' % (delta, total_nodes))


def _report_conductors():
    print('Phase - identifying conductors/drivers')
    _add_a_line()
    conductors = conductor.Conductor().list(
        context.get_admin_context(),
    )
    drivers = []
    groups = []
    online_count = 0
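    # Count a conductor as online if it has checked in within the
    # last 90 seconds.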
    online_by = timeutils.utcnow(with_timezone=True) - \
        datetime.timedelta(seconds=90)
    for conductor_obj in conductors:
        if conductor_obj.conductor_group:
            groups.append(conductor_obj.conductor_group)
        if conductor_obj.updated_at > online_by:
            online_count = online_count + 1
        for driver in conductor_obj.drivers:
            drivers.append(driver)
    conductor_count = len(conductors)
    print('Conductor count: %s' % conductor_count)
    print('Online conductor count: %s' % online_count)
    running_with_groups = len(groups)
    print('Conductors with conductor_groups: %s' % running_with_groups)
    group_count = len(set(groups))
    print('Conductor group count: %s' % group_count)
    driver_list = list(set(drivers))
    print('Presently supported drivers: %s' % driver_list)


def main():
    service.prepare_service()
    CONF.set_override('debug', False)
    _assess_db_performance()
    _assess_db_and_object_performance()
    _assess_db_object_and_api_performance()
    _report_conductors()


if __name__ == '__main__':
    sys.exit(main())