distil-collector concurrency support

As the number of our cloud customers and the resources they use
increases, distil usage collection takes longer and longer (more than
one hour per run). Running multiple collector processes to compensate
causes database problems: more than one process may operate on the
same table for the same project at the same time, which also degrades
performance.

This change adds a table used for locking, so extra collectors can be
added without any code or configuration modification.

Change-Id: I25ebd19e8eae9dcc7ee3c2d2389d5ad2658f5e46
Lingxian Kong 2017-01-12 00:11:45 +13:00
parent 6d1dec6cd9
commit dac4cb1fcf
13 changed files with 257 additions and 23 deletions
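
The approach, end to end: each collector derives a stable identifier from
its hostname and a configured suffix, then claims a project by inserting a
row into the new project_locks table before collecting. A hedged sketch of
the coordination loop (names taken from the diffs below; collect_one() is a
placeholder, not distil code):

    from distil.common import general
    from distil.common import openstack
    from distil.db import api as db_api

    identifier = general.get_process_identifier()  # e.g. "host1_group_a"

    for project in openstack.get_projects():
        locks = db_api.get_project_locks(project['id'])
        if locks and locks[0].owner != identifier:
            continue  # claimed by another collector; skip it
        with db_api.project_lock(project['id'], identifier):
            collect_one(project)  # placeholder for the real collection call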

@@ -13,6 +13,7 @@
 # under the License.

 import abc
+from datetime import timedelta
 import re

 import yaml
@@ -64,11 +65,16 @@ class BaseCollector(object):
         resources = {}
         usage_entries = []

+        # NOTE(kong): Set 10 min as lead-in time when getting samples; this
+        # helps us get the correct instance uptime in case the instance
+        # status in the first sample is not in our tracked status list.
+        actual_start = window_start - timedelta(minutes=10)
+
         try:
             for mapping in self.meter_mappings:
                 # Invoke get_meter function of specific collector.
                 usage = self.get_meter(project['id'], mapping['meter'],
-                                       window_start, window_end)
+                                       actual_start, window_end)
                 usage_by_resource = {}
                 self._filter_and_group(usage, usage_by_resource)
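
The 10-minute lead-in above only widens the sample query window;
window_start itself still marks the billing boundary. For example:

    from datetime import datetime, timedelta

    window_start = datetime(2017, 1, 12, 10, 0)
    actual_start = window_start - timedelta(minutes=10)
    # actual_start == datetime(2017, 1, 12, 9, 50): samples from 09:50 on
    # are fetched, so the instance status entering the 10:00 window is known.
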
@@ -125,7 +131,9 @@ class BaseCollector(object):
                 os_distro = image_meta.get('os_distro', 'unknown')
             else:
                 # Boot from image
-                image_id = entry['metadata']['image.id']
+                image_id = entry['metadata'].get('image.id', None)
+                image_id = image_id or entry['metadata'].get('image_ref', None)
                 os_distro = getattr(
                     openstack.get_image(image_id),
                     'os_distro',
@@ -180,6 +188,11 @@ class BaseCollector(object):
                         service, entries, window_start, window_end
                     )

+                    LOG.debug(
+                        'After transformation, usage for resource %s: %s' %
+                        (res_id, transformed)
+                    )
+
                     if transformed:
                         res_id = mapping.get('res_id_template', '%s') % res_id
                         res_info = self._get_resource_info(
@@ -192,7 +205,6 @@ class BaseCollector(object):

                         res = resources.setdefault(res_id, res_info)
                         res.update(res_info)
-                        LOG.debug('resource info: %s', res)

                         for service, volume in transformed.items():
                             entry = {
@@ -205,4 +217,3 @@ class BaseCollector(object):
                                 'tenant_id': project_id
                             }
                             usage_entries.append(entry)
-                            LOG.debug('new entry: %s', entry)

@@ -17,6 +17,7 @@ from datetime import timedelta
 from decimal import Decimal
 import functools
 import math
+import socket
 import warnings

 import yaml
@@ -101,3 +102,8 @@ def convert_to(value, from_unit, to_unit):
     if from_unit == to_unit:
         return value
     return conversions[from_unit][to_unit](value)
+
+
+def get_process_identifier():
+    """Gets current running process identifier."""
+    return "%s_%s" % (socket.gethostname(), CONF.collector.partitioning_suffix)

@@ -16,6 +16,7 @@ from keystoneauth1 import loading as ka_loading
 from oslo_cache import core as cache
 from oslo_config import cfg
 from oslo_log import log
+from oslo_utils import uuidutils

 from distil import version
@ -61,6 +62,9 @@ COLLECTOR_OPTS = [
help=('The list of resources that handled by collector.')), help=('The list of resources that handled by collector.')),
cfg.StrOpt('dawn_of_time', default='2014-04-01 00:00:00', cfg.StrOpt('dawn_of_time', default='2014-04-01 00:00:00',
help=('The earlist starting time for new tenant.')), help=('The earlist starting time for new tenant.')),
cfg.StrOpt('partitioning_suffix',
help=('Collector partitioning group suffix. It is used when '
'running multiple collectors in favor of lock.'))
] ]
ODOO_OPTS = [ ODOO_OPTS = [
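
For illustration, a hypothetical distil.conf fragment for two collectors
sharing one host (the suffix values are made up):

    [collector]
    partitioning_suffix = group_a

    # A second collector would set partitioning_suffix = group_b, so the two
    # processes get distinct identifiers ("myhost_group_a" vs
    # "myhost_group_b") and never mistake each other's locks for their own.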

@@ -31,6 +31,7 @@ interface.
 `sqlite:///var/lib/distil/distil.sqlite`.
 """

+import contextlib

 from oslo_config import cfg
 from oslo_db import api as db_api
@@ -119,3 +120,27 @@ def project_get(project_id):

 def project_get_all():
     return IMPL.project_get_all()
+
+
+# Project Locks.
+
+def create_project_lock(project_id, owner):
+    return IMPL.create_project_lock(project_id, owner)
+
+
+def get_project_locks(project_id):
+    return IMPL.get_project_locks(project_id)
+
+
+def ensure_project_lock(project_id, owner):
+    return IMPL.ensure_project_lock(project_id, owner)
+
+
+def delete_project_lock(project_id):
+    return IMPL.delete_project_lock(project_id)
+
+
+@contextlib.contextmanager
+def project_lock(project_id, owner):
+    with IMPL.project_lock(project_id, owner):
+        yield
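
The context manager is the intended entry point for callers; a minimal
usage sketch (the ids are illustrative):

    from distil.db import api as db_api

    # The lock row exists only for the duration of the block and is removed
    # even if the body raises.
    with db_api.project_lock('fake_project_id', 'host1_group_a'):
        pass  # do per-project collection here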

@@ -0,0 +1,39 @@
+# Copyright 2017 OpenStack Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""add-project-lock-table
+
+Revision ID: 002
+Revises: 001
+Create Date: 2017-01-11 11:45:05.107888
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '002'
+down_revision = '001'
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    op.create_table(
+        'project_locks',
+        sa.Column('created', sa.DateTime(), nullable=False),
+        sa.Column('project_id', sa.String(length=100), nullable=False),
+        sa.Column('owner', sa.String(length=100), nullable=False),
+        sa.PrimaryKeyConstraint('project_id'),
+    )

@@ -15,21 +15,23 @@
 """Implementation of SQLAlchemy backend."""

+import contextlib
 from datetime import datetime
 import json
-import six
 import sys
-import sqlalchemy as sa
-from sqlalchemy import func
 import threading

 from oslo_config import cfg
 from oslo_db import exception as db_exception
 from oslo_db.sqlalchemy import session as db_session
 from oslo_log import log as logging
+from retrying import retry
+import six
+import sqlalchemy as sa
+from sqlalchemy import func

 from distil.db.sqlalchemy import models as m
+from distil.db.sqlalchemy.models import ProjectLock
 from distil.db.sqlalchemy.models import Resource
 from distil.db.sqlalchemy.models import Tenant
 from distil.db.sqlalchemy.models import UsageEntry
@@ -76,7 +78,7 @@ def get_backend():

 def setup_db():
     try:
         engine = get_engine()
-        m.Cluster.metadata.create_all(engine)
+        m.Tenant.metadata.create_all(engine)
     except sa.exc.OperationalError as e:
         LOG.exception("Database registration exception: %s", e)
         return False
@@ -226,8 +228,9 @@ def usages_add(project_id, resources, usage_entries, last_collect):
         for (id, res_info) in six.iteritems(resources):
             res_db = _get_resource(session, project_id, id)
             if res_db:
-                orig_info = json.loads(res_db.info)
-                res_db.info = json.dumps(orig_info.update(res_info))
+                orig_info = json.loads(res_db.info) or {}
+                orig_info.update(res_info)
+                res_db.info = json.dumps(orig_info)
             else:
                 resource_ref = Resource(
                     id=id,
@@ -306,3 +309,82 @@ def _merge_resource_metadata(md_dict, entry, md_def):
             pass

     return md_dict
+
+
+def get_project_locks(project_id):
+    session = get_session()
+    query = session.query(ProjectLock)
+    query = query.filter(ProjectLock.project_id == project_id)
+
+    try:
+        return query.all()
+    except Exception as e:
+        raise exceptions.DBException(
+            "Failed when querying database, error type: %s, "
+            "error message: %s" % (e.__class__.__name__, str(e))
+        )
+
+
+@retry(stop_max_attempt_number=3, wait_fixed=5000)
+def create_project_lock(project_id, owner):
+    """Creates project lock record.
+
+    This method has to work without a SQLAlchemy session, because a session
+    may not issue an SQL query to the database immediately and instead just
+    schedule it, whereas we need the operation to be issued immediately.
+
+    If several transactions try to create the same project lock at once
+    (however unlikely), only one will succeed after the first one commits;
+    the others will fail with an exception. The retry mechanism here copes
+    with that.
+    """
+    session = get_session()
+    session.flush()
+
+    insert = ProjectLock.__table__.insert()
+    session.execute(insert.values(project_id=project_id, owner=owner,
+                                  created=datetime.utcnow()))
+
+    session.flush()
+
+
+def ensure_project_lock(project_id, owner):
+    """Make sure a project lock record exists."""
+    session = get_session()
+    query = session.query(ProjectLock)
+    query = query.filter(ProjectLock.project_id == project_id,
+                         ProjectLock.owner == owner)
+
+    # If a lock already exists for this process, do not recreate it. This
+    # helps the process continue with the project it was handling before it
+    # was killed.
+    if not query.all():
+        create_project_lock(project_id, owner)
+
+
+def delete_project_lock(project_id):
+    """Deletes project lock record.
+
+    This method has to work without a SQLAlchemy session, because a session
+    may not issue an SQL query to the database immediately and instead just
+    schedule it, whereas we need the operation to be issued immediately.
+    """
+    session = get_session()
+    session.flush()

+    table = ProjectLock.__table__
+    delete = table.delete()
+    session.execute(delete.where(table.c.project_id == project_id))
+
+    session.flush()
+
+
+@contextlib.contextmanager
+def project_lock(project_id, owner):
+    try:
+        ensure_project_lock(project_id, owner)
+        yield
+    finally:
+        delete_project_lock(project_id)
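
The duplicate-key race that the create_project_lock docstring describes can
be reproduced in isolation; a hedged sketch against an in-memory SQLite
table shaped like project_locks (illustrative, not distil code):

    import sqlalchemy as sa

    engine = sa.create_engine('sqlite://')
    meta = sa.MetaData()
    locks = sa.Table(
        'project_locks', meta,
        sa.Column('project_id', sa.String(100), primary_key=True),
        sa.Column('owner', sa.String(100), nullable=False),
    )
    meta.create_all(engine)

    conn = engine.connect()
    conn.execute(locks.insert().values(project_id='p1', owner='host1_a'))
    try:
        # A collector losing the race hits the primary-key constraint;
        # this is the failure the @retry decorator above absorbs.
        conn.execute(locks.insert().values(project_id='p1', owner='host2_a'))
    except sa.exc.IntegrityError:
        pass  # back off and retry, or skip the project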

@@ -119,3 +119,11 @@ class SalesOrder(DistilBase):
     @hybrid_method
     def intersects(self, other):
         return (self.start <= other.end and other.start <= self.end)
+
+
+class ProjectLock(DistilBase):
+    __tablename__ = 'project_locks'
+
+    project_id = Column(String(100), primary_key=True, nullable=False)
+    owner = Column(String(100), nullable=False)
+    created = Column(DateTime, nullable=False)

@@ -22,6 +22,7 @@ from stevedore import driver

 from distil.db import api as db_api
 from distil import exceptions
+from distil.common import general
 from distil.common import openstack

 LOG = logging.getLogger(__name__)
@@ -53,6 +54,8 @@ class CollectorService(service.Service):
         self.validate_config()

+        self.identifier = general.get_process_identifier()
+
         collector_args = {}
         self.collector = driver.DriverManager(
             'distil.collector',
@@ -93,15 +96,36 @@ class CollectorService(service.Service):
         logging.setup(CONF, 'distil-collector')

     def collect_usage(self):
-        LOG.info("Begin to collect usage for all projects...")
+        LOG.info("Starting to collect usage...")

         projects = filter_projects(openstack.get_projects())
         end = datetime.utcnow().replace(minute=0, second=0, microsecond=0)
+        count = 0

         for project in projects:
-            # Add a project or get last_collected of existing project.
-            db_project = db_api.project_add(project)
-            start = db_project.last_collected
-            self.collector.collect_usage(project, start, end)
+            # Check if the project is being processed by another collector
+            # instance. If not, grab a lock and continue processing;
+            # otherwise just skip it.
+            locks = db_api.get_project_locks(project['id'])
+            if locks and locks[0].owner != self.identifier:
+                LOG.debug(
+                    "Project %s is being processed by collector %s." %
+                    (project['id'], locks[0].owner)
+                )
+                continue
+
+            try:
+                with db_api.project_lock(project['id'], self.identifier):
+                    # Add a project or get last_collected of existing
+                    # project.
+                    db_project = db_api.project_add(project)
+                    start = db_project.last_collected
+
+                    self.collector.collect_usage(project, start, end)
+
+                count = count + 1
+            except Exception:
+                LOG.warning('Failed to get lock. Process: %s' %
+                            self.identifier)
+
+        LOG.info("Finished collecting usage for %s projects." % count)

@@ -22,6 +22,7 @@ from oslo_log import log

 from distil import context
 from distil import config
+from distil.db import api as db_api


 class DistilTestCase(base.BaseTestCase):
@@ -34,7 +35,7 @@ class DistilTestCase(base.BaseTestCase):

         if self.config_file:
             self.conf = self.load_conf(self.config_file)
         else:
-            self.conf = cfg.ConfigOpts()
+            self.conf = cfg.CONF
             self.conf.register_opts(config.DEFAULT_OPTIONS)
@@ -71,12 +72,12 @@ class DistilTestCase(base.BaseTestCase):
         :returns: Project's config object.
         """
-        conf = cfg.ConfigOpts()
+        conf = cfg.CONF
         log.register_options(conf)
         conf(args=[], default_config_files=[cls.conf_path(filename)])

         return conf

-    def config(self, group=None, **kw):
+    def override_config(self, group=None, **kw):
         """Override some configuration values.

         The keyword arguments are the names of configuration options to
@@ -99,6 +100,6 @@ class DistilWithDbTestCase(DistilTestCase):

     def setUp(self):
         super(DistilWithDbTestCase, self).setUp()
-        self.override_config('connection', "sqlite://", group='database')
+        self.conf.set_default('connection', 'sqlite://', group='database')

         db_api.setup_db()
         self.addCleanup(db_api.drop_db)


@@ -0,0 +1,29 @@
+# Copyright (C) 2017 Catalyst IT Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from distil.db.sqlalchemy import api as db_api
+from distil.tests.unit import base
+
+
+class ProjectLockTest(base.DistilWithDbTestCase):
+    def test_with_project_lock(self):
+        project_id = 'fake_project_id'
+        owner = 'fake_owner'
+
+        with db_api.project_lock(project_id, owner):
+            # The lock record exists inside the 'with' block.
+            self.assertEqual(1, len(db_api.get_project_locks(project_id)))
+
+        # The lock record is gone once the 'with' block exits.
+        self.assertEqual(0, len(db_api.get_project_locks(project_id)))
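
A companion case one might add (a sketch, not part of this change): assert
that the owner recorded in the lock is visible to other would-be
collectors, which is what collect_usage relies on to skip claimed projects:

    def test_lock_owner_visible(self):
        project_id = 'fake_project_id'

        with db_api.project_lock(project_id, 'owner1'):
            locks = db_api.get_project_locks(project_id)
            # A second collector would compare this owner against its own
            # identifier and skip the project.
            self.assertEqual('owner1', locks[0].owner)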

@@ -11,6 +11,7 @@ odoorpc==0.4.2
 SQLAlchemy<1.1.0,>=1.0.10 # MIT
 keystonemiddleware!=4.1.0,>=4.0.0 # Apache-2.0
 keystoneauth1>=2.1.0 # Apache-2.0
+retrying>=1.2.3,!=1.3.0 # Apache-2.0
 python-cinderclient>=1.6.0 # Apache-2.0
 python-keystoneclient!=1.8.0,!=2.1.0,>=1.6.0 # Apache-2.0

tox.ini

@@ -11,10 +11,14 @@ setenv =
    DISTIL_TESTS_CONFIGS_DIR={toxinidir}/distil/tests/etc/
    DISCOVER_DIRECTORY=distil/tests/unit
 deps =
+    -r{toxinidir}/requirements.txt
     -r{toxinidir}/test-requirements.txt
-commands = python setup.py testr --slowest --testr-args="{posargs}"
-whitelist_externals = bash
+commands =
+    find . -type f -name "*.pyc" -delete
+    python setup.py testr --slowest --testr-args="{posargs}"
+whitelist_externals =
+    bash
+    rm
+    find

 [testenv:py33]
 deps = -r{toxinidir}/requirements-py3.txt