Implement Prometheus metric integration
This implements Prometheus metric integration, including metric definition, collection, and exportation. End user documentation for supported metric data and exportation interface is included. Change-Id: Ia0837f28073d6cd8e0220ac84cdd261b32704ae4
This commit is contained in:
parent
77deecc294
commit
0721ed43aa
11
README.rst
11
README.rst
@ -90,13 +90,16 @@ Which should output something like this::
|
||||
For more information on how to install and use Armada, please reference:
|
||||
`Armada Quickstart`_.
|
||||
|
||||
|
||||
Integration Points
|
||||
------------------
|
||||
|
||||
Armada CLI component has the following integration points:
|
||||
|
||||
* `Tiller`_ manages Armada chart installations.
|
||||
* `Deckhand`_ supplies storage and management of site designs and secrets.
|
||||
* `Deckhand`_ is one of the supported control document sources for Armada.
|
||||
* `Prometheus`_ exporter is provided for metric data related to application
|
||||
of charts and collections of charts. See `metrics`_.
|
||||
|
||||
In addition, Armada's API component has the following integration points:
|
||||
|
||||
@ -110,10 +113,12 @@ Further Reading
|
||||
|
||||
.. _Manual Install Guide: https://airship-armada.readthedocs.io/en/latest/development/getting-started.html#developer-install-guide
|
||||
.. _Armada Quickstart: https://airship-armada.readthedocs.io/en/latest/operations/guide-use-armada.html
|
||||
.. _metrics: https://airship-armada.readthedocs.io/en/latest/operations/metrics.html#metrics
|
||||
.. _kubectl: https://kubernetes.io/docs/user-guide/kubectl/kubectl_config/
|
||||
.. _Tiller: https://docs.helm.sh/using_helm/#easy-in-cluster-installation
|
||||
.. _Deckhand: https://opendev.org/airship/deckhand
|
||||
.. _Keystone: https://opendev.org/openstack/keystone
|
||||
.. _Deckhand: https://github.com/openstack/airship-deckhand
|
||||
.. _Prometheus: https://prometheus.io
|
||||
.. _Keystone: https://github.com/openstack/keystone
|
||||
|
||||
.. |Docker Repository on Quay| image:: https://quay.io/repository/airshipit/armada/status
|
||||
:target: https://quay.io/repository/airshipit/armada
|
||||
|
38
armada/api/controller/metrics.py
Normal file
38
armada/api/controller/metrics.py
Normal file
@ -0,0 +1,38 @@
|
||||
# Copyright 2019 The Armada Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import falcon
|
||||
import prometheus_client
|
||||
|
||||
from armada import api
|
||||
from armada.handlers import metrics
|
||||
|
||||
|
||||
class Metrics(api.BaseResource):
    """API resource that exports Armada's Prometheus metrics."""

    def on_get(self, req, resp):
        """Render the contents of the metrics registry for a GET request.

        The encoder and the response content type are both selected by
        ``prometheus_client`` from the request's ``Accept`` header.

        :param req: the incoming request
        :param resp: the response to populate
        :return: the result of ``return_error`` if encoding fails,
            otherwise ``None``
        """
        accept_header = req.get_header('Accept')
        encode, negotiated_type = (
            prometheus_client.exposition.choose_encoder(accept_header))

        try:
            payload = encode(metrics.REGISTRY)
        except Exception as exc:
            # Any failure while encoding is logged with its traceback and
            # reported to the client as a 500.
            failure = 'Failed to generate metric output'
            self.logger.error(failure, exc_info=exc)
            return self.return_error(
                resp, falcon.HTTP_500, message=failure)

        resp.status = falcon.HTTP_200
        resp.content_type = negotiated_type
        resp.body = payload
|
@ -27,6 +27,7 @@ from armada.api.controller.rollback import Rollback
|
||||
from armada.api.controller.test import TestReleasesReleaseNameController
|
||||
from armada.api.controller.test import TestReleasesManifestController
|
||||
from armada.api.controller.health import Health
|
||||
from armada.api.controller.metrics import Metrics
|
||||
from armada.api.controller.tiller import Release
|
||||
from armada.api.controller.tiller import Status
|
||||
from armada.api.controller.validation import Validate
|
||||
@ -59,7 +60,7 @@ def create(enable_middleware=CONF.middleware):
|
||||
logging.setup(CONF, 'armada')
|
||||
|
||||
# Configure API routing
|
||||
url_routes_v1 = (
|
||||
url_routes_v1 = [
|
||||
(HEALTH_PATH, Health()),
|
||||
('apply', Apply()),
|
||||
('releases', Release()),
|
||||
@ -68,7 +69,8 @@ def create(enable_middleware=CONF.middleware):
|
||||
('tests', TestReleasesManifestController()),
|
||||
('test/{release}', TestReleasesReleaseNameController()),
|
||||
('validatedesign', Validate()),
|
||||
)
|
||||
('metrics', Metrics()),
|
||||
]
|
||||
|
||||
for route, service in url_routes_v1:
|
||||
api.add_route("/api/v1.0/{}".format(route), service)
|
||||
|
@ -14,10 +14,12 @@
|
||||
|
||||
import click
|
||||
from oslo_config import cfg
|
||||
import prometheus_client
|
||||
import yaml
|
||||
|
||||
from armada.cli import CliAction
|
||||
from armada.exceptions.source_exceptions import InvalidPathException
|
||||
from armada.handlers import metrics
|
||||
from armada.handlers.armada import Armada
|
||||
from armada.handlers.document import ReferenceResolver
|
||||
from armada.handlers.lock import lock_and_thread
|
||||
@ -81,6 +83,12 @@ SHORT_DESC = "Command installs manifest charts."
|
||||
'--dry-run', help="Run charts without installing them.", is_flag=True)
|
||||
@click.option(
|
||||
'--enable-chart-cleanup', help="Clean up unmanaged charts.", is_flag=True)
|
||||
@click.option(
|
||||
'--metrics-output',
|
||||
help=(
|
||||
"Output path for prometheus metric data, should end in .prom. By "
|
||||
"default, no metric data is output."),
|
||||
default=None)
|
||||
@click.option(
|
||||
'--use-doc-ref', help="Use armada manifest file reference.", is_flag=True)
|
||||
@click.option(
|
||||
@ -121,7 +129,7 @@ SHORT_DESC = "Command installs manifest charts."
|
||||
'--wait',
|
||||
help=(
|
||||
"Force Tiller to wait until all charts are deployed, "
|
||||
"rather than using each chart's specified wait policy. "
|
||||
"rather than using each charts specified wait policy. "
|
||||
"This is equivalent to sequenced chartgroups."),
|
||||
is_flag=True)
|
||||
@click.option(
|
||||
@ -135,22 +143,22 @@ SHORT_DESC = "Command installs manifest charts."
|
||||
@click.pass_context
|
||||
def apply_create(
|
||||
ctx, locations, api, disable_update_post, disable_update_pre, dry_run,
|
||||
enable_chart_cleanup, use_doc_ref, set, tiller_host, tiller_port,
|
||||
tiller_namespace, timeout, values, wait, target_manifest, bearer_token,
|
||||
debug):
|
||||
enable_chart_cleanup, metrics_output, use_doc_ref, set, tiller_host,
|
||||
tiller_port, tiller_namespace, timeout, values, wait, target_manifest,
|
||||
bearer_token, debug):
|
||||
CONF.debug = debug
|
||||
ApplyManifest(
|
||||
ctx, locations, api, disable_update_post, disable_update_pre, dry_run,
|
||||
enable_chart_cleanup, use_doc_ref, set, tiller_host, tiller_port,
|
||||
tiller_namespace, timeout, values, wait, target_manifest,
|
||||
enable_chart_cleanup, metrics_output, use_doc_ref, set, tiller_host,
|
||||
tiller_port, tiller_namespace, timeout, values, wait, target_manifest,
|
||||
bearer_token).safe_invoke()
|
||||
|
||||
|
||||
class ApplyManifest(CliAction):
|
||||
def __init__(
|
||||
self, ctx, locations, api, disable_update_post, disable_update_pre,
|
||||
dry_run, enable_chart_cleanup, use_doc_ref, set, tiller_host,
|
||||
tiller_port, tiller_namespace, timeout, values, wait,
|
||||
dry_run, enable_chart_cleanup, metrics_output, use_doc_ref, set,
|
||||
tiller_host, tiller_port, tiller_namespace, timeout, values, wait,
|
||||
target_manifest, bearer_token):
|
||||
super(ApplyManifest, self).__init__()
|
||||
self.ctx = ctx
|
||||
@ -161,6 +169,7 @@ class ApplyManifest(CliAction):
|
||||
self.disable_update_pre = disable_update_pre
|
||||
self.dry_run = dry_run
|
||||
self.enable_chart_cleanup = enable_chart_cleanup
|
||||
self.metrics_output = metrics_output
|
||||
self.use_doc_ref = use_doc_ref
|
||||
self.set = set
|
||||
self.tiller_host = tiller_host
|
||||
@ -210,8 +219,16 @@ class ApplyManifest(CliAction):
|
||||
bearer_token=self.bearer_token,
|
||||
dry_run=self.dry_run) as tiller:
|
||||
|
||||
resp = self.handle(documents, tiller)
|
||||
self.output(resp)
|
||||
try:
|
||||
resp = self.handle(documents, tiller)
|
||||
self.output(resp)
|
||||
finally:
|
||||
if self.metrics_output:
|
||||
path = self.metrics_output
|
||||
self.logger.info(
|
||||
'Storing metrics output in path: {}'.format(path))
|
||||
prometheus_client.write_to_textfile(
|
||||
path, metrics.REGISTRY)
|
||||
else:
|
||||
if len(self.values) > 0:
|
||||
self.logger.error(
|
||||
|
@ -24,6 +24,7 @@ from armada.exceptions import override_exceptions
|
||||
from armada.exceptions import source_exceptions
|
||||
from armada.exceptions import tiller_exceptions
|
||||
from armada.exceptions import validate_exceptions
|
||||
from armada.handlers import metrics
|
||||
from armada.handlers.chart_deploy import ChartDeploy
|
||||
from armada.handlers.manifest import Manifest
|
||||
from armada.handlers.override import Override
|
||||
@ -92,8 +93,9 @@ class Armada(object):
|
||||
self.documents, target_manifest=target_manifest).get_manifest()
|
||||
self.chart_cache = {}
|
||||
self.chart_deploy = ChartDeploy(
|
||||
disable_update_pre, disable_update_post, self.dry_run,
|
||||
k8s_wait_attempts, k8s_wait_attempt_sleep, timeout, self.tiller)
|
||||
self.manifest, disable_update_pre, disable_update_post,
|
||||
self.dry_run, k8s_wait_attempts, k8s_wait_attempt_sleep, timeout,
|
||||
self.tiller)
|
||||
|
||||
def pre_flight_ops(self):
|
||||
"""Perform a series of checks and operations to ensure proper
|
||||
@ -113,6 +115,12 @@ class Armada(object):
|
||||
self.get_chart(ch)
|
||||
|
||||
def get_chart(self, ch):
|
||||
manifest_name = self.manifest['metadata']['name']
|
||||
chart_name = ch['metadata']['name']
|
||||
with metrics.CHART_DOWNLOAD.get_context(manifest_name, chart_name):
|
||||
return self._get_chart(ch)
|
||||
|
||||
def _get_chart(self, ch):
|
||||
chart = ch.get(const.KEYWORD_DATA)
|
||||
chart_source = chart.get('source', {})
|
||||
location = chart_source.get('location')
|
||||
@ -171,6 +179,11 @@ class Armada(object):
|
||||
'''
|
||||
Synchronize Helm with the Armada Config(s)
|
||||
'''
|
||||
manifest_name = self.manifest['metadata']['name']
|
||||
with metrics.APPLY.get_context(manifest_name):
|
||||
return self._sync()
|
||||
|
||||
def _sync(self):
|
||||
if self.dry_run:
|
||||
LOG.info('Armada is in DRY RUN mode, no changes being made.')
|
||||
|
||||
@ -207,11 +220,12 @@ class Armada(object):
|
||||
|
||||
cg_charts = chartgroup.get(const.KEYWORD_CHARTS, [])
|
||||
|
||||
def deploy_chart(chart):
|
||||
def deploy_chart(chart, concurrency):
|
||||
set_current_chart(chart)
|
||||
try:
|
||||
return self.chart_deploy.execute(
|
||||
chart, cg_test_all_charts, prefix, known_releases)
|
||||
chart, cg_test_all_charts, prefix, known_releases,
|
||||
concurrency)
|
||||
finally:
|
||||
set_current_chart(None)
|
||||
|
||||
@ -233,13 +247,14 @@ class Armada(object):
|
||||
|
||||
if cg_sequenced:
|
||||
for chart in cg_charts:
|
||||
if (handle_result(chart, lambda: deploy_chart(chart))):
|
||||
if (handle_result(chart, lambda: deploy_chart(chart, 1))):
|
||||
break
|
||||
else:
|
||||
with ThreadPoolExecutor(
|
||||
max_workers=len(cg_charts)) as executor:
|
||||
future_to_chart = {
|
||||
executor.submit(deploy_chart, chart): chart
|
||||
executor.submit(deploy_chart, chart, len(cg_charts)):
|
||||
chart
|
||||
for chart in cg_charts
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@ import yaml
|
||||
|
||||
from armada import const
|
||||
from armada.exceptions import armada_exceptions
|
||||
from armada.handlers import metrics
|
||||
from armada.handlers.chartbuilder import ChartBuilder
|
||||
from armada.handlers.release_diff import ReleaseDiff
|
||||
from armada.handlers.chart_delete import ChartDelete
|
||||
@ -33,8 +34,9 @@ LOG = logging.getLogger(__name__)
|
||||
|
||||
class ChartDeploy(object):
|
||||
def __init__(
|
||||
self, disable_update_pre, disable_update_post, dry_run,
|
||||
self, manifest, disable_update_pre, disable_update_post, dry_run,
|
||||
k8s_wait_attempts, k8s_wait_attempt_sleep, timeout, tiller):
|
||||
self.manifest = manifest
|
||||
self.disable_update_pre = disable_update_pre
|
||||
self.disable_update_post = disable_update_post
|
||||
self.dry_run = dry_run
|
||||
@ -43,25 +45,26 @@ class ChartDeploy(object):
|
||||
self.timeout = timeout
|
||||
self.tiller = tiller
|
||||
|
||||
def execute(self, ch, cg_test_all_charts, prefix, known_releases):
|
||||
def execute(
|
||||
self, ch, cg_test_all_charts, prefix, known_releases, concurrency):
|
||||
chart_name = ch['metadata']['name']
|
||||
manifest_name = self.manifest['metadata']['name']
|
||||
with metrics.CHART_HANDLE.get_context(concurrency, manifest_name,
|
||||
chart_name):
|
||||
return self._execute(
|
||||
ch, cg_test_all_charts, prefix, known_releases)
|
||||
|
||||
def _execute(self, ch, cg_test_all_charts, prefix, known_releases):
|
||||
manifest_name = self.manifest['metadata']['name']
|
||||
chart = ch[const.KEYWORD_DATA]
|
||||
chart_name = ch['metadata']['name']
|
||||
namespace = chart.get('namespace')
|
||||
release = chart.get('release')
|
||||
release_name = r.release_prefixer(prefix, release)
|
||||
LOG.info('Processing Chart, release=%s', release_name)
|
||||
|
||||
values = chart.get('values', {})
|
||||
pre_actions = {}
|
||||
post_actions = {}
|
||||
|
||||
result = {}
|
||||
|
||||
old_release = self.find_chart_release(known_releases, release_name)
|
||||
|
||||
status = None
|
||||
if old_release:
|
||||
status = r.get_release_status(old_release)
|
||||
|
||||
chart_wait = ChartWait(
|
||||
self.tiller.k8s,
|
||||
release_name,
|
||||
@ -70,18 +73,32 @@ class ChartDeploy(object):
|
||||
k8s_wait_attempts=self.k8s_wait_attempts,
|
||||
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
|
||||
timeout=self.timeout)
|
||||
|
||||
native_wait_enabled = chart_wait.is_native_enabled()
|
||||
wait_timeout = chart_wait.get_timeout()
|
||||
|
||||
# Begin Chart timeout deadline
|
||||
deadline = time.time() + chart_wait.get_timeout()
|
||||
deadline = time.time() + wait_timeout
|
||||
old_release = self.find_chart_release(known_releases, release_name)
|
||||
action = metrics.ChartDeployAction.NOOP
|
||||
|
||||
def noop():
|
||||
pass
|
||||
|
||||
deploy = noop
|
||||
|
||||
# Resolve action
|
||||
values = chart.get('values', {})
|
||||
pre_actions = {}
|
||||
post_actions = {}
|
||||
|
||||
status = None
|
||||
if old_release:
|
||||
status = r.get_release_status(old_release)
|
||||
|
||||
native_wait_enabled = chart_wait.is_native_enabled()
|
||||
|
||||
chartbuilder = ChartBuilder(ch)
|
||||
new_chart = chartbuilder.get_helm_chart()
|
||||
|
||||
# TODO(mark-burnett): It may be more robust to directly call
|
||||
# tiller status to decide whether to install/upgrade rather
|
||||
# than checking for list membership.
|
||||
if status == const.STATUS_DEPLOYED:
|
||||
|
||||
# indicate to the end user what path we are taking
|
||||
@ -135,36 +152,37 @@ class ChartDeploy(object):
|
||||
if not diff:
|
||||
LOG.info("Found no updates to chart release inputs")
|
||||
else:
|
||||
action = metrics.ChartDeployAction.UPGRADE
|
||||
LOG.info("Found updates to chart release inputs")
|
||||
LOG.debug("%s", diff)
|
||||
result['diff'] = {chart['release']: str(diff)}
|
||||
|
||||
# TODO(MarshM): Add tiller dry-run before upgrade and
|
||||
# consider deadline impacts
|
||||
def upgrade():
|
||||
# do actual update
|
||||
timer = int(round(deadline - time.time()))
|
||||
LOG.info(
|
||||
"Upgrading release %s in namespace %s, wait=%s, "
|
||||
"timeout=%ss", release_name, namespace,
|
||||
native_wait_enabled, timer)
|
||||
tiller_result = self.tiller.update_release(
|
||||
new_chart,
|
||||
release_name,
|
||||
namespace,
|
||||
pre_actions=pre_actions,
|
||||
post_actions=post_actions,
|
||||
disable_hooks=disable_hooks,
|
||||
values=yaml.safe_dump(values),
|
||||
wait=native_wait_enabled,
|
||||
timeout=timer,
|
||||
force=force,
|
||||
recreate_pods=recreate_pods)
|
||||
|
||||
# do actual update
|
||||
timer = int(round(deadline - time.time()))
|
||||
LOG.info(
|
||||
"Upgrading release %s in namespace %s, wait=%s, "
|
||||
"timeout=%ss", release_name, namespace,
|
||||
native_wait_enabled, timer)
|
||||
tiller_result = self.tiller.update_release(
|
||||
new_chart,
|
||||
release_name,
|
||||
namespace,
|
||||
pre_actions=pre_actions,
|
||||
post_actions=post_actions,
|
||||
disable_hooks=disable_hooks,
|
||||
values=yaml.safe_dump(values),
|
||||
wait=native_wait_enabled,
|
||||
timeout=timer,
|
||||
force=force,
|
||||
recreate_pods=recreate_pods)
|
||||
LOG.info(
|
||||
'Upgrade completed with results from Tiller: %s',
|
||||
tiller_result.__dict__)
|
||||
result['upgrade'] = release_name
|
||||
|
||||
LOG.info(
|
||||
'Upgrade completed with results from Tiller: %s',
|
||||
tiller_result.__dict__)
|
||||
result['upgrade'] = release_name
|
||||
deploy = upgrade
|
||||
else:
|
||||
# Check for release with status other than DEPLOYED
|
||||
if status:
|
||||
@ -178,7 +196,6 @@ class ChartDeploy(object):
|
||||
# was started within the timeout window of the chart.
|
||||
last_deployment_age = r.get_last_deployment_age(
|
||||
old_release)
|
||||
wait_timeout = chart_wait.get_timeout()
|
||||
likely_pending = last_deployment_age <= wait_timeout
|
||||
if likely_pending:
|
||||
# Give up if a deployment is likely pending, we do not
|
||||
@ -217,35 +234,49 @@ class ChartDeploy(object):
|
||||
release_name, status)
|
||||
else:
|
||||
# Purge the release
|
||||
LOG.info(
|
||||
'Purging release %s with status %s', release_name,
|
||||
status)
|
||||
chart_delete = ChartDelete(
|
||||
chart, release_name, self.tiller)
|
||||
chart_delete.delete()
|
||||
result['purge'] = release_name
|
||||
with metrics.CHART_DELETE.get_context(manifest_name,
|
||||
chart_name):
|
||||
|
||||
LOG.info(
|
||||
'Purging release %s with status %s', release_name,
|
||||
status)
|
||||
chart_delete = ChartDelete(
|
||||
chart, release_name, self.tiller)
|
||||
chart_delete.delete()
|
||||
result['purge'] = release_name
|
||||
|
||||
action = metrics.ChartDeployAction.INSTALL
|
||||
|
||||
def install():
|
||||
timer = int(round(deadline - time.time()))
|
||||
LOG.info(
|
||||
"Installing release %s in namespace %s, wait=%s, "
|
||||
"timeout=%ss", release_name, namespace,
|
||||
native_wait_enabled, timer)
|
||||
tiller_result = self.tiller.install_release(
|
||||
new_chart,
|
||||
release_name,
|
||||
namespace,
|
||||
values=yaml.safe_dump(values),
|
||||
wait=native_wait_enabled,
|
||||
timeout=timer)
|
||||
|
||||
LOG.info(
|
||||
'Install completed with results from Tiller: %s',
|
||||
tiller_result.__dict__)
|
||||
result['install'] = release_name
|
||||
|
||||
deploy = install
|
||||
|
||||
# Deploy
|
||||
with metrics.CHART_DEPLOY.get_context(wait_timeout, manifest_name,
|
||||
chart_name,
|
||||
action.get_label_value()):
|
||||
deploy()
|
||||
|
||||
# Wait
|
||||
timer = int(round(deadline - time.time()))
|
||||
LOG.info(
|
||||
"Installing release %s in namespace %s, wait=%s, "
|
||||
"timeout=%ss", release_name, namespace, native_wait_enabled,
|
||||
timer)
|
||||
tiller_result = self.tiller.install_release(
|
||||
new_chart,
|
||||
release_name,
|
||||
namespace,
|
||||
values=yaml.safe_dump(values),
|
||||
wait=native_wait_enabled,
|
||||
timeout=timer)
|
||||
|
||||
LOG.info(
|
||||
'Install completed with results from Tiller: %s',
|
||||
tiller_result.__dict__)
|
||||
result['install'] = release_name
|
||||
|
||||
# Wait
|
||||
timer = int(round(deadline - time.time()))
|
||||
chart_wait.wait(timer)
|
||||
chart_wait.wait(timer)
|
||||
|
||||
# Test
|
||||
just_deployed = ('install' in result) or ('upgrade' in result)
|
||||
@ -260,7 +291,9 @@ class ChartDeploy(object):
|
||||
run_test = test_handler.test_enabled and (
|
||||
just_deployed or not last_test_passed)
|
||||
if run_test:
|
||||
self._test_chart(release_name, test_handler)
|
||||
with metrics.CHART_TEST.get_context(test_handler.timeout,
|
||||
manifest_name, chart_name):
|
||||
self._test_chart(release_name, test_handler)
|
||||
|
||||
return result
|
||||
|
||||
|
175
armada/handlers/metrics.py
Normal file
175
armada/handlers/metrics.py
Normal file
@ -0,0 +1,175 @@
|
||||
# Copyright 2019 The Armada Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from contextlib import ExitStack
|
||||
from enum import Enum
|
||||
import os
|
||||
|
||||
import prometheus_client
|
||||
from prometheus_client import multiprocess, values, context_managers
|
||||
|
||||
|
||||
class ActionMetrics():
    """Define and observe the metrics for one kind of action.

    Four metrics are created per action: a gauge of in-progress attempts,
    a counter of total attempts, a counter of failures, and a histogram of
    durations in seconds.
    """

    _PREFIX = 'armada'

    def __init__(self, prefix, description, labels):
        """
        :param prefix: prefix to use for each metric name
        :param description: description of action to use in metric description
        :param labels: label names to define for each metric
        """
        self.full_prefix = '{}_{}'.format(self.__class__._PREFIX, prefix)
        self.progress = prometheus_client.Gauge(
            '{}_attempt_inprogress'.format(self.full_prefix),
            'In progress attempts to {}'.format(description),
            labels,
            registry=REGISTRY,
            multiprocess_mode='livesum')
        self.attempt_total = prometheus_client.Counter(
            '{}_attempt_total'.format(self.full_prefix),
            'Total attempts to {}'.format(description),
            labels,
            registry=REGISTRY)
        self.failure_total = prometheus_client.Counter(
            '{}_failure_total'.format(self.full_prefix),
            'Total failures to {}'.format(description),
            labels,
            registry=REGISTRY)
        self.duration = prometheus_client.Histogram(
            '{}_duration_seconds'.format(self.full_prefix),
            'Seconds to {}'.format(description),
            labels,
            registry=REGISTRY)

    def get_context(self, *args, **kwargs):
        """Start observing one attempt of the action.

        Any args are used as metric label values. The attempt counter is
        incremented and the in-progress, failure and duration observers
        are entered *before* this method returns; leaving the returned
        context manager exits them.

        :return: a context manager for the action which observes the desired
            metrics.
        :rtype: contextlib.ExitStack
        """
        progress = self.progress.labels(*args, **kwargs)
        attempt_total = self.attempt_total.labels(*args, **kwargs)
        attempt_total.inc()
        failure_total = self.failure_total.labels(*args, **kwargs)
        duration = self.duration.labels(*args, **kwargs)

        contexts = [
            progress.track_inprogress(),
            failure_total.count_exceptions(),
            duration.time()
        ]
        # Enter under a guarding stack so that, if entering a later context
        # raises, the ones already entered are exited instead of leaked
        # (e.g. the in-progress gauge staying incremented). On success,
        # ownership is handed to the caller via pop_all().
        with ExitStack() as stack:
            for ctx in contexts:
                stack.enter_context(ctx)
            return stack.pop_all()
|
||||
|
||||
|
||||
class ChartHandleMetrics(ActionMetrics):
    """Action metrics for chart handling, plus a concurrency histogram."""

    def __init__(self, prefix, description, labels):
        """
        :param prefix: prefix to use for each metric name
        :param description: description of action to use in metric description
        :param labels: label names to define for each metric
        """
        super().__init__(prefix, description, labels)
        concurrency_name = '{}_concurrency_count'.format(self.full_prefix)
        self.concurrency = prometheus_client.Histogram(
            concurrency_name,
            'Count of charts being handled concurrently for chart',
            labels,
            registry=REGISTRY)

    def get_context(self, concurrency_value, *args, **kwargs):
        """Record the concurrency value, then start observing the action.

        :param concurrency_value: value to observe in the concurrency
            histogram
        :return: the context manager produced by the parent class; the
            remaining args are used as metric label values
        """
        self.concurrency.labels(*args, **kwargs).observe(concurrency_value)
        return super().get_context(*args, **kwargs)
|
||||
|
||||
|
||||
class ActionWithTimeoutMetrics(ActionMetrics):
    """Action metrics for actions that run under a configured timeout.

    Adds a histogram of the configured timeout and a histogram of the
    ratio of the observed duration to that timeout.
    """

    def __init__(self, prefix, description, labels):
        """
        :param prefix: prefix to use for each metric name
        :param description: description of action to use in metric description
        :param labels: label names to define for each metric
        """
        super().__init__(prefix, description, labels)
        timeout_name = '{}_timeout_duration_seconds'.format(self.full_prefix)
        usage_name = '{}_timeout_usage_ratio'.format(self.full_prefix)
        self.timeout = prometheus_client.Histogram(
            timeout_name,
            'Configured timeout (in seconds) to {}'.format(description),
            labels,
            registry=REGISTRY)
        self.timeout_usage = prometheus_client.Histogram(
            usage_name,
            'Ratio of duration to timeout to {}'.format(description),
            labels,
            registry=REGISTRY)

    def get_context(self, timeout_value, *args, **kwargs):
        """Record the timeout, then start observing the action.

        :param timeout_value: configured timeout to observe; when falsy
            (e.g. 0) no usage ratio is recorded
        :return: the parent's context manager with an extra timer entered
            that feeds the usage-ratio histogram; the remaining args are
            used as metric label values
        """
        timeout_child = self.timeout.labels(*args, **kwargs)
        usage_child = self.timeout_usage.labels(*args, **kwargs)
        timeout_child.observe(timeout_value)

        def record_usage(elapsed):
            # A falsy timeout would mean dividing by zero; skip the ratio.
            if not timeout_value:
                return
            usage_child.observe(elapsed / timeout_value)

        stack = super().get_context(*args, **kwargs)
        stack.enter_context(context_managers.Timer(record_usage))
        return stack
|
||||
|
||||
|
||||
class ChartDeployAction(Enum):
    """Sub-actions of the chart deploy action.

    Members are turned into metric label values via ``get_label_value``.
    """

    INSTALL = 1
    UPGRADE = 2
    NOOP = 3

    def get_label_value(self):
        """
        :return: the member name in lower case, for use as a label value
        :rtype: str
        """
        member_name = self.name
        return member_name.lower()
|
||||
|
||||
|
||||
# Registry that every metric defined in this module is attached to.
REGISTRY = prometheus_client.CollectorRegistry()

if "prometheus_multiproc_dir" in os.environ:
    # For why this is needed see:
    # https://github.com/prometheus/client_python/issues/275#issuecomment-504755024
    import uwsgi

    prometheus_client.values.ValueClass = values.MultiProcessValue(
        uwsgi.worker_id)

    multiprocess.MultiProcessCollector(REGISTRY)

APPLY = ActionMetrics(
    prefix='apply',
    description='apply a manifest',
    labels=['manifest'])

# TODO: Ideally include an action (ChartDeployAction) label, but that's not
# determined until after chart handling starts.
CHART_HANDLE = ChartHandleMetrics(
    prefix='chart_handle',
    description=(
        'handle a chart (including delete, deploy, test (all as necessary) but '
        'not download)'),
    labels=['manifest', 'chart'])

CHART_DOWNLOAD = ActionMetrics(
    prefix='chart_download',
    description='download a chart (will be noop if previously cached)',
    labels=['manifest', 'chart'])

CHART_DELETE = ActionMetrics(
    prefix='chart_delete',
    description='delete a chart',
    labels=['manifest', 'chart'])

CHART_DEPLOY = ActionWithTimeoutMetrics(
    prefix='chart_deploy',
    description=(
        'deploy a chart (including install/upgrade and wait (all as necessary))'
    ),
    labels=['manifest', 'chart', 'action'])

CHART_TEST = ActionWithTimeoutMetrics(
    prefix='chart_test',
    description='test a chart',
    labels=['manifest', 'chart'])
|
@ -397,6 +397,7 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
|
||||
mock_test_release.side_effect = fail
|
||||
else:
|
||||
mock_test_release.return_value = test_success
|
||||
mock_test.return_value.timeout = const.DEFAULT_TEST_TIMEOUT
|
||||
|
||||
# Stub out irrelevant methods called by `armada.sync()`.
|
||||
mock_chartbuilder.get_source_path.return_value = None
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
{{- $envAll := . }}
|
||||
{{- $mounts_armada_api := .Values.pod.mounts.armada_api.armada_api }}
|
||||
{{- $mounts_armada_api_init := .Values.pod.mounts.armada_api.init_container }}
|
||||
{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.armada }}
|
||||
{{- $serviceAccountName := "armada-api" }}
|
||||
{{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||
---
|
||||
@ -79,9 +80,9 @@ spec:
|
||||
labels:
|
||||
{{ tuple $envAll "armada" "api" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
|
||||
annotations:
|
||||
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" | indent 8 }}
|
||||
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||
{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_pod_annotations" | indent 8 }}
|
||||
spec:
|
||||
{{ dict "envAll" $envAll "application" "armada" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
|
||||
serviceAccountName: {{ $serviceAccountName }}
|
||||
@ -123,6 +124,8 @@ spec:
|
||||
volumeMounts:
|
||||
- name: pod-tmp
|
||||
mountPath: /tmp
|
||||
- name: pod-tmp-metrics
|
||||
mountPath: /tmp/armada/metrics
|
||||
- name: pod-etc-armada
|
||||
mountPath: /etc/armada
|
||||
- name: armada-etc
|
||||
@ -193,6 +196,9 @@ spec:
|
||||
volumes:
|
||||
- name: pod-tmp
|
||||
emptyDir: {}
|
||||
- name: pod-tmp-metrics
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: pod-etc-armada
|
||||
emptyDir: {}
|
||||
- name: armada-bin
|
||||
|
@ -216,6 +216,13 @@ conf:
|
||||
# greater than that will have no effect.
|
||||
prestop_sleep: 30
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
armada:
|
||||
scrape: true
|
||||
path: /api/v1.0/metrics
|
||||
port: 8000
|
||||
|
||||
pod:
|
||||
security_context:
|
||||
armada:
|
||||
|
@ -7,29 +7,29 @@ Commands
|
||||
|
||||
.. code:: bash
|
||||
|
||||
Usage: armada apply [OPTIONS] FILENAME
|
||||
Usage: armada apply [OPTIONS] [LOCATIONS]...
|
||||
|
||||
This command installs and updates charts defined in armada manifest
|
||||
This command installs and updates charts defined in Armada manifest.
|
||||
|
||||
The apply argument must be a relative path to an Armada Manifest. Executing
|
||||
apply command once will install all charts defined in manifest. Re-
|
||||
executing apply command will execute upgrade.
|
||||
|
||||
To see how to create an Armada manifest:
|
||||
https://airship-armada.readthedocs.io/en/latest/operations/
|
||||
To see how to create an Armada manifest: https://airship-
|
||||
armada.readthedocs.io/en/latest/operations/
|
||||
|
||||
To install or upgrade charts, run:
|
||||
|
||||
$ armada apply examples/simple.yaml
|
||||
$ armada apply examples/simple.yaml
|
||||
|
||||
To override a specific value in a Manifest, run:
|
||||
|
||||
$ armada apply examples/simple.yaml --set manifest:simple-armada:release="wordpress"
|
||||
$ armada apply examples/simple.yaml --set manifest:simple-armada:release="wordpress"
|
||||
|
||||
Or to override several values in a Manifest, reference a values.yaml-
|
||||
formatted file:
|
||||
|
||||
$ armada apply examples/simple.yaml --values examples/simple-ovr-values.yaml
|
||||
$ armada apply examples/simple.yaml --values examples/simple-ovr-values.yaml
|
||||
|
||||
Options:
|
||||
--api Contacts service endpoint.
|
||||
@ -37,6 +37,8 @@ Commands
|
||||
--disable-update-pre Disable pre-update Tiller operations.
|
||||
--dry-run Run charts without installing them.
|
||||
--enable-chart-cleanup Clean up unmanaged charts.
|
||||
--metrics-output TEXT The output path for metric data
|
||||
--use-doc-ref Use armada manifest file reference.
|
||||
--set TEXT Use to override Armada Manifest values.
|
||||
Accepts overrides that adhere to the format
|
||||
<path>:<to>:<property>=<value> to specify a
|
||||
@ -46,15 +48,19 @@ Commands
|
||||
--tiller-host TEXT Tiller host IP.
|
||||
--tiller-port INTEGER Tiller host port.
|
||||
-tn, --tiller-namespace TEXT Tiller namespace.
|
||||
--timeout INTEGER Specifies time to wait for charts to deploy.
|
||||
--timeout INTEGER Specifies time to wait for each chart to fully
|
||||
finish deploying.
|
||||
-f, --values TEXT Use to override multiple Armada Manifest
|
||||
values by reading overrides from a
|
||||
values.yaml-type file.
|
||||
--wait Wait until all charts deployed.
|
||||
--wait Force Tiller to wait until all charts are
|
||||
deployed, rather than using each chart's
|
||||
specified wait policy. This is equivalent to
|
||||
sequenced chartgroups.
|
||||
--target-manifest TEXT The target manifest to run. Required for
|
||||
specifying which manifest to run when multiple
|
||||
are available.
|
||||
--bearer-token User bearer token.
|
||||
--bearer-token TEXT User Bearer token
|
||||
--debug Enable debug logging.
|
||||
--help Show this message and exit.
|
||||
|
||||
|
@ -14,6 +14,7 @@ Operations Guide
|
||||
guide-configure
|
||||
guide-troubleshooting
|
||||
guide-use-armada
|
||||
metrics
|
||||
exceptions/index
|
||||
guide-helm-plugin
|
||||
sampleconf
|
||||
|
85
doc/source/operations/metrics.rst
Normal file
85
doc/source/operations/metrics.rst
Normal file
@ -0,0 +1,85 @@
|
||||
.. _metrics:
|
||||
|
||||
Metrics
|
||||
=======
|
||||
|
||||
Armada exposes metric data, for consumption by `Prometheus`_.
|
||||
|
||||
Exporting
|
||||
---------
|
||||
|
||||
Metric data can be exported via:
|
||||
|
||||
* API: Prometheus exporter in the `/metrics` endpoint. The Armada chart
|
||||
includes the appropriate Prometheus scrape configurations for this endpoint.
|
||||
* CLI: `--metrics-output=<path>` of `apply` command. The
|
||||
`node exporter text file collector`_ can then be used to export the produced
|
||||
text files to Prometheus.
|
||||
|
||||
Metric Names
|
||||
------------
|
||||
|
||||
Metric names are as follows:
|
||||
|
||||
`armada_` + <action> + `_` + <metric>
|
||||
|
||||
Supported <action>s
|
||||
-------------------
|
||||
|
||||
The below tree of <action>s is measured. Supported Prometheus labels are noted.
|
||||
Labels are inherited by sub-actions except as noted.
|
||||
|
||||
* `apply`:
|
||||
|
||||
* description: apply a manifest
|
||||
* labels: `manifest`
|
||||
* sub-actions:
|
||||
|
||||
* `chart_handle`:
|
||||
|
||||
* description: fully handle a chart (see below sub-actions)
|
||||
* labels:
|
||||
|
||||
* `chart`
|
||||
* `action` (install|upgrade|noop) (not included in sub-actions)
|
||||
* sub-actions:
|
||||
|
||||
* `chart_download`
|
||||
* `chart_deploy`
|
||||
* `chart_test`
|
||||
* `chart_delete`:
|
||||
|
||||
* description: delete a chart (e.g. due to `FAILED` status)
|
||||
* labels: `chart`
|
||||
|
||||
Supported <metric>s
|
||||
-------------------
|
||||
|
||||
* `failure_total`: total failed attempts
|
||||
* `attempt_total`: total attempts
|
||||
* `attempt_inprogress`: total attempts in progress
|
||||
* `duration_seconds`: duration of each attempt
|
||||
|
||||
Timeouts
|
||||
^^^^^^^^
|
||||
|
||||
The `chart_handle` and `chart_test` actions additionally include the following
|
||||
metrics:
|
||||
|
||||
* `timeout_duration_seconds`: configured chart timeout duration in seconds
|
||||
* `timeout_usage_ratio`: `= duration_seconds / timeout_duration_seconds`
|
||||
|
||||
These can help identify charts whose timeouts may need to
|
||||
be changed to avoid potential failures or to achieve faster failures.
|
||||
|
||||
Chart concurrency
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
The `chart_handle` action additionally includes the following metric:
|
||||
|
||||
* `concurrency_count`: count of charts being handled concurrently
|
||||
|
||||
This can help identify opportunities for greater chart concurrency.
|
||||
|
||||
.. _Prometheus: https://prometheus.io
|
||||
.. _`node exporter text file collector`: https://github.com/prometheus/node_exporter#textfile-collector
|
@ -26,6 +26,8 @@ ARMADA_UWSGI_TIMEOUT=${ARMADA_UWSGI_TIMEOUT:-3600}
|
||||
ARMADA_UWSGI_WORKERS=${ARMADA_UWSGI_WORKERS:-4}
|
||||
# Threads per worker
|
||||
ARMADA_UWSGI_THREADS=${ARMADA_UWSGI_THREADS:-1}
|
||||
# Prometheus multiprocess dir
|
||||
ARMADA_PROMETHEUS_MULTIPROC_DIR=${ARMADA_PROMETHEUS_MULTIPROC_DIR:-$(mktemp -d -p /tmp/armada/metrics XXXXXX)}
|
||||
|
||||
# Start Armada application
|
||||
# TODO(fmontei): Should be specifying callable too. But Armada spins up the
|
||||
@ -37,6 +39,7 @@ if [ "$1" = 'server' ]; then
|
||||
--http :"${ARMADA_UWSGI_PORT}" \
|
||||
--http-timeout "$ARMADA_UWSGI_TIMEOUT" \
|
||||
--enable-threads \
|
||||
--env prometheus_multiproc_dir="$ARMADA_PROMETHEUS_MULTIPROC_DIR" \
|
||||
-L \
|
||||
--lazy-apps \
|
||||
--master \
|
||||
|
@ -10,6 +10,7 @@ PasteDeploy>=1.5.2
|
||||
protobuf>=3.4.0
|
||||
PyYAML==3.12
|
||||
requests
|
||||
prometheus_client==0.7.0
|
||||
|
||||
# API
|
||||
falcon
|
||||
|
Loading…
x
Reference in New Issue
Block a user