portdirect 515494ca98 RBAC: Include release name in cluster roles to prevent collision
This PS includes the release name in the cluster roles to prevent
collisions if the chart is deployed multiple times in the same
cluster.

Change-Id: I7166e5ee25b3d4c89879393c5f84c869585a2681
2018-02-19 13:13:56 -06:00
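
As a rough sketch of the naming pattern this implies (the template fragment and
helper below are illustrative assumptions; the actual change lives in the chart's
RBAC templates, not in the values file shown here), the ClusterRole name is
derived from the release so that concurrent releases render distinct objects:

    # hypothetical RBAC template fragment
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: {{ printf "%s-%s" .Release.Name .Chart.Name }}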


# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for prometheus.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value
images:
tags:
prometheus: docker.io/prom/prometheus:v2.0.0
helm_tests: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
node_selector_key: openstack-control-plane
node_selector_value: enabled
pod:
affinity:
anti:
type:
default: preferredDuringSchedulingIgnoredDuringExecution
topologyKey:
default: kubernetes.io/hostname
mounts:
prometheus:
prometheus:
init_container: null
replicas:
prometheus: 1
lifecycle:
upgrades:
revision_history: 3
pod_replacement_strategy: RollingUpdate
rolling_update:
max_unavailable: 1
max_surge: 3
termination_grace_period:
prometheus:
timeout: 30
resources:
enabled: false
prometheus:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "500m"
jobs:
image_repo_sync:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
tests:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
monitoring:
name: prometheus
namespace: null
hosts:
default: prom-metrics
public: prometheus
host_fqdn_override:
default: null
path:
default: null
scheme:
default: 'http'
port:
api:
default: 9090
public: 80
alerts:
name: alertmanager
namespace: null
hosts:
default: alerts-api
public: alertmanager
host_fqdn_override:
default: null
path:
default: null
scheme:
default: 'http'
port:
api:
default: 9093
public: 80
dependencies:
prometheus:
services: null
image_repo_sync:
services:
- service: local_image_registry
endpoint: internal
conditional_dependencies:
local_image_registry:
jobs:
- prometheus-image-repo-sync
services:
- service: local_image_registry
endpoint: node
monitoring:
prometheus:
enabled: true
prometheus:
scrape: true
network:
prometheus:
ingress:
public: true
proxy_body_size: 1024M
node_port:
enabled: false
port: 30900
port: 9090
storage:
enabled: true
pvc:
name: prometheus-pvc
access_mode: [ "ReadWriteOnce" ]
requests:
storage: 5Gi
storage_class: general
manifests:
configmap_bin: true
configmap_etc: true
ingress_prometheus: true
helm_tests: true
job_image_repo_sync: true
pvc: true
service_ingress_prometheus: true
service: true
statefulset_prometheus: true
conf:
prometheus:
storage:
tsdb:
path: /var/lib/prometheus/data
retention: 7d
min_block_duration: 2h
max_block_duration: 6h
log:
format: logger:stdout?json=true
level: info
query:
max_concurrency: 20
timeout: 2m
web_admin_api:
enabled: false
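# The keys under conf.prometheus above are rendered into the Prometheus command
# line by the chart templates; as a hedged illustration (the exact rendering is
# an assumption here), they map to Prometheus 2.x flags roughly as follows:
#
#   --storage.tsdb.path=/var/lib/prometheus/data
#   --storage.tsdb.retention=7d
#   --storage.tsdb.min-block-duration=2h
#   --storage.tsdb.max-block-duration=6h
#   --log.format=logger:stdout?json=true
#   --log.level=info
#   --query.max-concurrency=20
#   --query.timeout=2m
#   --web.enable-admin-api   (only when web_admin_api.enabled is true)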
scrape_configs:
global:
scrape_interval: 60s
evaluation_interval: 60s
rule_files:
- /etc/config/rules/alertmanager.rules
- /etc/config/rules/etcd3.rules
- /etc/config/rules/kubernetes.rules
- /etc/config/rules/kube-apiserver.rules
- /etc/config/rules/kube-controller-manager.rules
- /etc/config/rules/kubelet.rules
- /etc/config/rules/kube-scheduler.rules
- /etc/config/rules/rabbitmq.rules
- /etc/config/rules/mysql.rules
- /etc/config/rules/ceph.rules
- /etc/config/rules/openstack.rules
- /etc/config/rules/custom.rules
scrape_configs:
- job_name: kubelet
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
scrape_interval: 45s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
# Scrape config for Kubelet cAdvisor.
#
# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
# (those whose names begin with 'container_') have been removed from the
# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
# retrieve those metrics.
#
# In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
# HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
# in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
# the --cadvisor-port=0 Kubelet flag).
#
# This job is not necessary and should be removed in Kubernetes 1.6 and
# earlier versions, or it will cause the metrics to be scraped twice.
- job_name: 'kubernetes-cadvisor'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
scrape_interval: 45s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
metric_relabel_configs:
- action: replace
source_labels:
- id
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels:
- id
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'apiserver'
kubernetes_sd_configs:
- role: endpoints
scrape_interval: 45s
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
# insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port. This
# will add targets for each API server for which Kubernetes adds an endpoint to
# the default/kubernetes service.
relabel_configs:
- source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep
regex: default;kubernetes;https
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
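# As a minimal sketch, a Service exposing metrics for this job would carry the
# annotations above (the name and port here are illustrative only):
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: example-exporter
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/scheme: "http"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "9102"
#   spec:
#     ports:
#       - name: metrics
#         port: 9102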
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
scrape_interval: 60s
relabel_configs:
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
action: keep
regex: true
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
action: replace
target_label: __scheme__
regex: (https?)
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
action: replace
target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
action: replace
target_label: kubernetes_name
- source_labels:
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
# pod's declared ports (default is a port-free target if none are declared).
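# As a minimal sketch, a pod scraped by this job would carry the annotations
# above in its metadata (values illustrative only):
#
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "8080"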
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: calico-etcd
honor_labels: false
kubernetes_sd_configs:
- role: service
scrape_interval: 20s
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: keep
source_labels:
- __meta_kubernetes_service_name
regex: "calico-etcd"
- action: keep
source_labels:
- __meta_kubernetes_namespace
regex: kube-system
target_label: namespace
- source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- source_labels:
- __meta_kubernetes_service_name
target_label: service
- source_labels:
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
- source_labels:
- __meta_kubernetes_service_label
target_label: job
regex: calico-etcd
replacement: ${1}
- target_label: endpoint
replacement: "calico-etcd"
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels:
- __meta_kubernetes_pod_label_name
regex: alertmanager
action: keep
- source_labels:
- __meta_kubernetes_namespace
regex: openstack
action: keep
- source_labels:
- __meta_kubernetes_pod_container_port_number
regex:
action: drop
rules:
alertmanager:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
summary: Alertmanager configurations are inconsistent
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
summary: Alertmanager down or not discovered
- alert: FailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
summary: Alertmanager configuration reload has failed
etcd3:
groups:
- name: etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down, the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
kube_apiserver:
groups:
- name: kube-apiserver.rules
rules:
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
summary: API server unreachable
- alert: K8SApiServerLatency
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
for: 10m
labels:
severity: warning
annotations:
description: 99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
summary: Kubernetes apiserver latency is high
kube_controller_manager:
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
kubelet:
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_ready{condition="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletDown
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
summary: Kubelet is close to pod limit
kubernetes:
groups:
- name: kubernetes.rules
rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.9"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
rabbitmq: null
mysql: null
ceph: null
openstack: null
custom: null
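# A minimal usage sketch (release and namespace names are illustrative, Helm v2
# style flags assumed): any value above can be overridden at install time, and
# with release-scoped cluster roles two releases of this chart can coexist in
# one cluster, e.g.
#
#   helm install ./prometheus --name prometheus-one --namespace openstack \
#     --set conf.prometheus.storage.tsdb.retention=14d
#
#   helm install ./prometheus --name prometheus-two --namespace monitoring \
#     --set network.prometheus.node_port.enabled=true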