Prometheus: Update chart to support federation
This updates the Prometheus chart to support federation. The Prometheus configuration file is now defined via a template in the values.yaml file instead of through raw YAML. This allows the chart's default configuration to be overridden wholesale, which is required for a hierarchical federated setup. For the same reason, this also strips out all of the default rules defined in the chart. Example rules for the various aspects of OSH's infrastructure are defined in the prometheus/values_overrides directory and are exercised as part of the normal CI jobs. This also adds a nonvoting federated-monitoring job that verifies the ability to federate Prometheus in a hierarchical fashion with extremely basic overrides. Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26 Signed-off-by: Steve Wilkerson <sw5822@att.com>
This commit is contained in:
parent
0edd3e18de
commit
fbd34421f2
@ -20,7 +20,7 @@ limitations under the License.
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: prometheus-bin
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
|
||||
data:
|
||||
apache.sh: |
|
||||
{{ tuple "bin/_apache.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
|
@ -16,34 +16,14 @@ limitations under the License.
|
||||
|
||||
{{- if .Values.manifests.configmap_etc }}
|
||||
{{- $envAll := . }}
|
||||
|
||||
{{- if empty $envAll.Values.conf.prometheus.scrape_configs.rule_files -}}
|
||||
{{- $_ := set $envAll.Values "__rule_files" ( list ) }}
|
||||
{{- $rulesKeys := keys $envAll.Values.conf.prometheus.rules -}}
|
||||
{{- range $rule := $rulesKeys }}
|
||||
{{- $rulesFile := printf "/etc/config/rules/%s.rules" $rule }}
|
||||
{{- $__rule_files := append $envAll.Values.__rule_files $rulesFile }}
|
||||
{{- $_ := set $envAll.Values "__rule_files" $__rule_files }}
|
||||
{{ end }}
|
||||
{{- $_ := set .Values.conf.prometheus.scrape_configs "rule_files" $envAll.Values.__rule_files -}}
|
||||
{{- end -}}
|
||||
|
||||
{{- if not (empty $envAll.Values.conf.prometheus.scrape_configs.scrape_configs) }}
|
||||
{{- $_ := set $envAll.Values "__updated_scrape_configs" ( list ) }}
|
||||
{{- $promScrapeTarget := first $envAll.Values.conf.prometheus.scrape_configs.scrape_configs }}
|
||||
{{- if (empty $promScrapeTarget.basic_auth) }}
|
||||
{{- $_ := set $promScrapeTarget "basic_auth" $envAll.Values.endpoints.monitoring.auth.admin }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: prometheus-etc
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }}
|
||||
type: Opaque
|
||||
data:
|
||||
prometheus.yml: {{ toYaml .Values.conf.prometheus.scrape_configs | b64enc }}
|
||||
{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.prometheus.scrape_configs.template "key" "prometheus.yml" "format" "Secret") | indent 2 }}
|
||||
{{ range $key, $value := .Values.conf.prometheus.rules }}
|
||||
{{ $key }}.rules: {{ toYaml $value | b64enc }}
|
||||
{{ end }}
|
||||
|
@ -16,7 +16,6 @@ limitations under the License.
|
||||
|
||||
{{- if .Values.manifests.helm_tests }}
|
||||
{{- $envAll := . }}
|
||||
{{- $promUserSecret := .Values.secrets.prometheus.admin }}
|
||||
|
||||
{{- $serviceAccountName := print .Release.Name "-test" }}
|
||||
{{ tuple $envAll "tests" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||
@ -47,12 +46,12 @@ spec:
|
||||
- name: PROMETHEUS_ADMIN_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ $promUserSecret }}
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
|
||||
key: PROMETHEUS_ADMIN_USERNAME
|
||||
- name: PROMETHEUS_ADMIN_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ $promUserSecret }}
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
|
||||
key: PROMETHEUS_ADMIN_PASSWORD
|
||||
- name: PROMETHEUS_ENDPOINT
|
||||
value: {{ tuple "monitoring" "internal" "http" $envAll | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
|
||||
@ -68,6 +67,6 @@ spec:
|
||||
emptyDir: {}
|
||||
- name: prometheus-bin
|
||||
configMap:
|
||||
name: prometheus-bin
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
|
||||
defaultMode: 0555
|
||||
{{- end }}
|
||||
|
@ -16,12 +16,11 @@ limitations under the License.
|
||||
|
||||
{{- if .Values.manifests.secret_prometheus }}
|
||||
{{- $envAll := . }}
|
||||
{{- $secretName := index $envAll.Values.secrets.prometheus.admin }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ $secretName }}
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
|
||||
type: Opaque
|
||||
data:
|
||||
PROMETHEUS_ADMIN_USERNAME: {{ .Values.endpoints.monitoring.auth.admin.username | b64enc }}
|
||||
|
@ -19,15 +19,14 @@ limitations under the License.
|
||||
|
||||
{{- $mounts_prometheus := .Values.pod.mounts.prometheus.prometheus }}
|
||||
{{- $mounts_prometheus_init := .Values.pod.mounts.prometheus.init_container }}
|
||||
{{- $promUserSecret := .Values.secrets.prometheus.admin }}
|
||||
|
||||
{{- $serviceAccountName := printf "%s-%s" .Release.Name "prometheus" }}
|
||||
{{ tuple $envAll "prometheus" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||
{{- $rcControllerName := printf "%s-%s" $envAll.Release.Name "prometheus" }}
|
||||
{{ tuple $envAll "prometheus" $rcControllerName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1beta1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: {{ $serviceAccountName }}
|
||||
name: {{ $rcControllerName | quote }}
|
||||
rules:
|
||||
- apiGroups:
|
||||
- ""
|
||||
@ -55,20 +54,20 @@ rules:
|
||||
apiVersion: rbac.authorization.k8s.io/v1beta1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: {{ $serviceAccountName }}
|
||||
name: {{ $rcControllerName | quote }}
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ $serviceAccountName }}
|
||||
name: {{ $rcControllerName | quote }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
roleRef:
|
||||
kind: ClusterRole
|
||||
name: {{ $serviceAccountName }}
|
||||
name: {{ $rcControllerName | quote }}
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: prometheus
|
||||
name: {{ $rcControllerName | quote }}
|
||||
annotations:
|
||||
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
|
||||
labels:
|
||||
@ -90,7 +89,7 @@ spec:
|
||||
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||
spec:
|
||||
{{ dict "envAll" $envAll "application" "api" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
|
||||
serviceAccountName: {{ $serviceAccountName }}
|
||||
serviceAccountName: {{ $rcControllerName | quote }}
|
||||
affinity:
|
||||
{{ tuple $envAll "prometheus" "api" | include "helm-toolkit.snippets.kubernetes_pod_anti_affinity" | indent 8 }}
|
||||
nodeSelector:
|
||||
@ -129,12 +128,12 @@ spec:
|
||||
- name: PROMETHEUS_ADMIN_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ $promUserSecret }}
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
|
||||
key: PROMETHEUS_ADMIN_USERNAME
|
||||
- name: PROMETHEUS_ADMIN_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ $promUserSecret }}
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
|
||||
key: PROMETHEUS_ADMIN_PASSWORD
|
||||
volumeMounts:
|
||||
- name: pod-tmp
|
||||
@ -169,6 +168,10 @@ spec:
|
||||
port: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
initialDelaySeconds: 30
|
||||
timeoutSeconds: 30
|
||||
env:
|
||||
{{- if .Values.pod.env.prometheus }}
|
||||
{{ include "helm-toolkit.utils.to_k8s_env_vars" .Values.pod.env.prometheus | indent 12 }}
|
||||
{{- end }}
|
||||
volumeMounts:
|
||||
- name: pod-tmp
|
||||
mountPath: /tmp
|
||||
@ -202,11 +205,11 @@ spec:
|
||||
emptyDir: {}
|
||||
- name: prometheus-etc
|
||||
secret:
|
||||
secretName: prometheus-etc
|
||||
secretName: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }}
|
||||
defaultMode: 0444
|
||||
- name: prometheus-bin
|
||||
configMap:
|
||||
name: prometheus-bin
|
||||
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
|
||||
defaultMode: 0555
|
||||
{{ if $mounts_prometheus.volumes }}{{ toYaml $mounts_prometheus.volumes | indent 8 }}{{ end }}
|
||||
{{- if not .Values.storage.enabled }}
|
||||
|
File diff suppressed because it is too large
Load Diff
31
prometheus/values_overrides/alertmanager.yaml
Normal file
31
prometheus/values_overrides/alertmanager.yaml
Normal file
@ -0,0 +1,31 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
alertmanager:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
|
||||
summary: Alertmanager configurations are inconsistent
|
||||
- alert: AlertmanagerDownOrMissing
|
||||
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
|
||||
summary: Alertmanager down or not discovered
|
||||
- alert: FailedReload
|
||||
expr: alertmanager_config_last_reload_successful == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
|
||||
summary: Alertmanager configuration reload has failed
|
71
prometheus/values_overrides/ceph.yaml
Normal file
71
prometheus/values_overrides/ceph.yaml
Normal file
@ -0,0 +1,71 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
ceph:
|
||||
groups:
|
||||
- name: ceph.rules
|
||||
rules:
|
||||
- alert: prom_exporter_ceph_unavailable
|
||||
expr: absent(ceph_health_status)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: Ceph exporter is not collecting metrics or is not available
|
||||
# Warns when no target of the "ceph-mgr" job has reported up == 1 for 5m.
# absent() is used instead of `count(...) == 0`: count() over an empty
# instant vector yields no sample at all, so comparing it with 0 can never
# match once every mgr is down or missing from discovery.
- alert: no_active_ceph_mgr
  expr: absent(up{job="ceph-mgr"} == 1)
  for: 5m
  labels:
    severity: warning
  annotations:
    description: 'no ceph active mgr is present or all ceph mgr are down'
    summary: 'no ceph active mgr is present'
|
||||
- alert: ceph_mon_quorum_low
|
||||
expr: ceph_mon_quorum_count < 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
|
||||
summary: 'ceph high availability is at risk'
|
||||
- alert: ceph_cluster_usage_high
|
||||
expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'ceph cluster capacity usage more than 80 percent'
|
||||
summary: 'ceph cluster usage is more than 80 percent'
|
||||
- alert: ceph_placement_group_degrade_pct_high
|
||||
expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: 'ceph placement group degradation is more than 80 percent'
|
||||
summary: 'ceph placement groups degraded'
|
||||
- alert: ceph_osd_down_pct_high
|
||||
expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: 'ceph OSDs down percent is more than 80 percent'
|
||||
summary: 'ceph OSDs down percent is high'
|
||||
- alert: ceph_osd_down
|
||||
expr: ceph_osd_up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
|
||||
summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
|
||||
- alert: ceph_osd_out
|
||||
expr: ceph_osd_in == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
|
||||
summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
|
379
prometheus/values_overrides/kubernetes.yaml
Normal file
379
prometheus/values_overrides/kubernetes.yaml
Normal file
@ -0,0 +1,379 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
kubernetes:
|
||||
groups:
|
||||
- name: calico.rules
|
||||
rules:
|
||||
- alert: prom_exporter_calico_unavailable
|
||||
expr: absent(felix_host)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Calico exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: Calico exporter is not collecting metrics or is not available
|
||||
- alert: calico_datapane_failures_high_1h
|
||||
expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
|
||||
summary: 'A high number of dataplane failures within Felix are happening'
|
||||
- alert: calico_datapane_address_msg_batch_size_high_5m
|
||||
expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
|
||||
summary: 'Felix address message batch size is higher'
|
||||
- alert: calico_datapane_iface_msg_batch_size_high_5m
|
||||
expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
|
||||
summary: 'Felix interface message batch size is higher'
|
||||
- alert: calico_ipset_errors_high_1h
|
||||
expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
|
||||
summary: 'A high number of ipset errors within Felix are happening'
|
||||
- alert: calico_iptable_save_errors_high_1h
|
||||
expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
|
||||
summary: 'A high number of iptable save errors within Felix are happening'
|
||||
- alert: calico_iptable_restore_errors_high_1h
|
||||
expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
|
||||
summary: 'A high number of iptable restore errors within Felix are happening'
|
||||
- name: etcd3.rules
|
||||
rules:
|
||||
- alert: etcd_InsufficientMembers
|
||||
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: If one more etcd member goes down the cluster will be unavailable
|
||||
summary: etcd cluster insufficient members
|
||||
- alert: etcd_NoLeader
|
||||
expr: etcd_server_has_leader{job="etcd"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: etcd member {{ $labels.instance }} has no leader
|
||||
summary: etcd member has no leader
|
||||
- alert: etcd_HighNumberOfLeaderChanges
|
||||
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
|
||||
summary: a high number of leader changes within the etcd cluster are happening
|
||||
# Warning tier: more than 1% of gRPC requests per grpc_method failed over
# the last 5m, sustained for 10m. The ratio is scaled by 100 so that
# {{ $value }} really is the percentage the description prints.
# NOTE(review): both sums aggregate BY (grpc_method), so the result carries
# no `instance` label and {{ $labels.instance }} renders empty - confirm
# whether `instance` should be added to the BY clause.
- alert: etcd_HighNumberOfFailedGRPCRequests
  expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) * 100 > 1
  for: 10m
  labels:
    severity: warning
  annotations:
    description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
    summary: a high number of gRPC requests are failing
|
||||
# Critical tier: more than 5% of gRPC requests per grpc_method failed over
# the last 5m, sustained for 5m. The ratio is scaled by 100 so that
# {{ $value }} really is the percentage the description prints.
# NOTE(review): both sums aggregate BY (grpc_method), so the result carries
# no `instance` label and {{ $labels.instance }} renders empty - confirm
# whether `instance` should be added to the BY clause.
- alert: etcd_HighNumberOfFailedGRPCRequests
  expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) * 100 > 5
  for: 5m
  labels:
    severity: critical
  annotations:
    description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
    summary: a high number of gRPC requests are failing
|
||||
- alert: etcd_GRPCRequestsSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
|
||||
summary: slow gRPC requests
|
||||
# Warning tier: more than 1% of HTTP requests per method failed over the
# last 5m, sustained for 10m. The ratio is scaled by 100 so that
# {{ $value }} really is the percentage the description prints.
# NOTE(review): both sums aggregate BY (method), so the result carries no
# `instance` label and {{ $labels.instance }} renders empty - confirm
# whether `instance` should be added to the BY clause.
- alert: etcd_HighNumberOfFailedHTTPRequests
  expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) * 100 > 1
  for: 10m
  labels:
    severity: warning
  annotations:
    description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
    summary: a high number of HTTP requests are failing
|
||||
# Critical tier: more than 5% of HTTP requests per method failed over the
# last 5m, sustained for 5m. The ratio is scaled by 100 so that
# {{ $value }} really is the percentage the description prints.
# NOTE(review): both sums aggregate BY (method), so the result carries no
# `instance` label and {{ $labels.instance }} renders empty - confirm
# whether `instance` should be added to the BY clause.
- alert: etcd_HighNumberOfFailedHTTPRequests
  expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) * 100 > 5
  for: 5m
  labels:
    severity: critical
  annotations:
    description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
    summary: a high number of HTTP requests are failing
|
||||
- alert: etcd_HTTPRequestsSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
|
||||
summary: slow HTTP requests
|
||||
- alert: etcd_EtcdMemberCommunicationSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
|
||||
summary: etcd member communication is slow
|
||||
- alert: etcd_HighNumberOfFailedProposals
|
||||
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
|
||||
summary: a high number of proposals within the etcd cluster are failing
|
||||
- alert: etcd_HighFsyncDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} fync durations are high
|
||||
summary: high fsync durations
|
||||
- alert: etcd_HighCommitDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} commit durations are high
|
||||
summary: high commit durations
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- alert: K8SNodeNotReady
|
||||
expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute
|
||||
summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}'
|
||||
# Fires when more than one node reports Ready="unknown" AND those nodes make
# up more than 20% (ratio > 0.2) of the Ready="unknown" series, for 1m.
# With `A and B` the resulting sample keeps the left-hand value, so
# {{ $value }} is the node count, not the ratio.
- alert: K8SManyNodesNotReady
  expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2
  for: 1m
  labels:
    severity: critical
  annotations:
    # The percentage quoted here must track the 0.2 threshold in expr.
    description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
    summary: Many Kubernetes nodes are Not Ready
|
||||
# Fires when more than one node reports Ready="false" AND those nodes make
# up more than 20% (ratio > 0.2) of the Ready="false" series, for 1m.
# With `A and B` the resulting sample keeps the left-hand value, so
# {{ $value }} is the node count, not the ratio.
- alert: K8SManyNodesNotReady
  expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2
  for: 1m
  labels:
    severity: critical
  annotations:
    # The percentage quoted here must track the 0.2 threshold in expr.
    description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
    summary: Many Kubernetes nodes are Not Ready
|
||||
- alert: K8SNodesNotReady
|
||||
expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{ $value }} nodes are notReady state.'
|
||||
summary: One or more Kubernetes nodes are Not Ready
|
||||
# Fires when more than 3% of the "kubelet" job targets are down for 1m.
# The ratio is scaled by 100 so that {{ $value }} is the percentage the
# description prints; the firing condition is the same as ratio > 0.03.
- alert: K8SKubeletDown
  expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
  for: 1m
  labels:
    severity: critical
  annotations:
    description: Prometheus failed to scrape {{ $value }}% of kubelets.
    summary: Many Kubelets cannot be scraped
|
||||
# Fires when no "kubelet" target is up at all, or when more than 10% of the
# "kubelet" job targets are down, for 1m. The ratio branch is scaled by 100
# so that {{ $value }} is the percentage the description prints; the firing
# condition is the same as ratio > 0.1.
# NOTE(review): on the absent() branch the sample value is 1, so the
# description reads "1%" when every kubelet has vanished.
- alert: K8SKubeletDown
  expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 10
  for: 1m
  labels:
    severity: critical
  annotations:
    description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
    summary: Many Kubelets cannot be scraped
|
||||
- alert: K8SKubeletTooManyPods
|
||||
expr: kubelet_running_pod_count > 100
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
|
||||
summary: Kubelet is close to pod limit
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- alert: K8SApiserverDown
|
||||
expr: absent(up{job="apiserver"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
|
||||
summary: API server unreachable
|
||||
- alert: K8SApiServerLatency
|
||||
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
|
||||
summary: Kubernetes apiserver latency is high
|
||||
- name: kube-controller-manager.rules
|
||||
rules:
|
||||
- alert: K8SControllerManagerDown
|
||||
expr: absent(up{job="kube-controller-manager-discovery"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
|
||||
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
|
||||
summary: Controller manager is down
|
||||
- name: kubernetes-object.rules
|
||||
rules:
|
||||
- alert: prom_exporter_kube_state_metrics_unavailable
|
||||
expr: absent(kube_node_info)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: kube-state-metrics exporter is not collecting metrics or is not available
|
||||
- alert: kube_statefulset_replicas_unavailable
|
||||
expr: kube_statefulset_status_replicas < kube_statefulset_replicas
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
|
||||
summary: '{{$labels.statefulset}}: has inssuficient replicas.'
|
||||
- alert: daemonsets_misscheduled
|
||||
expr: kube_daemonset_status_number_misscheduled > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
|
||||
summary: 'Daemonsets not scheduled correctly'
|
||||
- alert: daemonsets_not_scheduled
|
||||
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
||||
summary: 'Less than desired number of daemonsets scheduled'
|
||||
- alert: daemonset_pods_unavailable
|
||||
expr: kube_daemonset_status_number_unavailable > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
|
||||
summary: 'Daemonset pods unavailable, due to one of many reasons'
|
||||
- alert: deployment_replicas_unavailable
|
||||
expr: kube_deployment_status_replicas_unavailable > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
|
||||
summary: '{{$labels.deployment}}: has inssuficient replicas.'
|
||||
- alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
|
||||
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
|
||||
summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
|
||||
- alert: job_status_failed
|
||||
expr: kube_job_status_failed > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Job {{$labels.exported_job}} is in failed status'
|
||||
summary: '{{$labels.exported_job}} has failed status'
|
||||
- alert: pod_status_pending
|
||||
expr: kube_pod_status_phase{phase="Pending"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
|
||||
- alert: pod_error_image_pull
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_status_error_image_pull_backoff
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_error_crash_loop_back_off
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_error_config_error
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: replicaset_missing_replicas
|
||||
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
|
||||
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
|
||||
- alert: pod_container_terminated
|
||||
expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: volume_claim_capacity_high_utilization
|
||||
expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
|
||||
summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
|
105
prometheus/values_overrides/logging.yaml
Normal file
105
prometheus/values_overrides/logging.yaml
Normal file
@ -0,0 +1,105 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
logging:
|
||||
groups:
|
||||
- name: fluentd.rules
|
||||
rules:
|
||||
- alert: prom_exporter_fluentd_unavailable
|
||||
expr: absent(fluentd_up)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: Fluentd exporter is not collecting metrics or is not available
|
||||
- alert: fluentd_not_running
|
||||
expr: fluentd_up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
|
||||
summary: 'Fluentd is down'
|
||||
- name: elasticsearch.rules
|
||||
rules:
|
||||
- alert: prom_exporter_elasticsearch_unavailable
|
||||
expr: absent(elasticsearch_cluster_health_status)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: Elasticsearch exporter is not collecting metrics or is not available
|
||||
- alert: es_high_process_open_files_count
|
||||
expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.'
|
||||
summary: 'Elasticsearch has a very high process open file count.'
|
||||
- alert: es_high_process_cpu_percent
|
||||
expr: elasticsearch_process_cpu_percent > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.'
|
||||
summary: 'Elasticsearch process cpu usage is more than 95 percent.'
|
||||
- alert: es_fs_usage_high
|
||||
expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.'
|
||||
summary: 'Elasticsearch filesystem usage is high.'
|
||||
- alert: es_unassigned_shards
|
||||
expr: elasticsearch_cluster_health_unassigned_shards > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch has {{ $value }} unassigned shards.'
|
||||
summary: 'Elasticsearch has unassigned shards and hence a unhealthy cluster state.'
|
||||
# Warns when elasticsearch_cluster_health_timed_out has been above zero for
# 10 minutes, i.e. cluster health status calls are reported as timing out.
- alert: es_cluster_health_timed_out
  expr: elasticsearch_cluster_health_timed_out > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
    summary: 'Elasticsearch cluster health status calls are timing out.'
|
||||
- alert: es_cluster_health_status_alert
|
||||
expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
|
||||
summary: 'Elasticsearch cluster health status is not green.'
|
||||
- alert: es_cluster_health_too_few_nodes_running
|
||||
expr: elasticsearch_cluster_health_number_of_nodes < 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
|
||||
summary: 'ElasticSearch running on less than 3 nodes'
|
||||
# Warns when the cluster health endpoint has reported fewer than three
# Elasticsearch data nodes for 10 minutes.
# Keep exactly one definition of this alert in the group: an identical second
# copy only adds a redundant rule evaluation.
- alert: es_cluster_health_too_few_data_nodes_running
  expr: elasticsearch_cluster_health_number_of_data_nodes < 3
  for: 10m
  labels:
    severity: warning
  annotations:
    description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
    summary: 'ElasticSearch running on less than 3 data nodes'
|
240
prometheus/values_overrides/nodes.yaml
Normal file
240
prometheus/values_overrides/nodes.yaml
Normal file
@ -0,0 +1,240 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
nodes:
|
||||
groups:
|
||||
- name: nodes.rules
|
||||
rules:
|
||||
- alert: prom_exporter_node_unavailable
|
||||
expr: absent(node_uname_info)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: node exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: node exporter is not collecting metrics or is not available
|
||||
# Pages when an xfs/ext3/ext4 filesystem has had less free space than 20% of
# its size (the `* 0.2` factor) for 5 minutes. The alert value is the
# remaining free space divided by 1024^3. The description states the same
# 20% threshold that the expression enforces.
- alert: node_filesystem_full_80percent
  expr: >-
    sort(node_filesystem_free{fstype =~ "xfs|ext[34]"} < node_filesystem_size{fstype =~ "xfs|ext[34]"}
    * 0.2) / 1024 ^ 3
  for: 5m
  labels:
    severity: page
  annotations:
    description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
      got less than 20% space left on its filesystem.'
    summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
|
||||
# Pages when a linear extrapolation of the last hour of free space on an
# xfs/ext3/ext4 filesystem predicts zero (or less) free space 4 hours ahead
# (4 * 3600 seconds), sustained for 5 minutes.
- alert: node_filesystem_full_in_4h
  expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0
  for: 5m
  labels:
    severity: page
  annotations:
    description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
      is running out of space in approx. 4 hours'
    summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
|
||||
- alert: node_filedescriptors_full_in_3h
|
||||
expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
|
||||
for: 20m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is running out of available file descriptors
|
||||
in approx. 3 hours'
|
||||
summary: '{{$labels.alias}} is running out of available file descriptors in
|
||||
3 hours.'
|
||||
- alert: node_load1_90percent
|
||||
expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
|
||||
for: 1h
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is running with > 90% total load for at least
|
||||
1h.'
|
||||
summary: '{{$labels.alias}}: Running on high load.'
|
||||
- alert: node_cpu_util_90percent
|
||||
expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
|
||||
for: 1h
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has total CPU utilization over 90% for at least
|
||||
1h.'
|
||||
summary: '{{$labels.alias}}: High CPU utilization.'
|
||||
- alert: node_ram_using_90percent
|
||||
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
|
||||
* 0.1
|
||||
for: 30m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is using at least 90% of its RAM for at least
|
||||
30 minutes now.'
|
||||
summary: '{{$labels.alias}}: Using lots of RAM.'
|
||||
- alert: node_swap_using_80percent
|
||||
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
|
||||
> node_memory_SwapTotal * 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is using 80% of its swap space for at least
|
||||
10 minutes now.'
|
||||
summary: '{{$labels.alias}}: Running out of swap soon.'
|
||||
# Warns when the 15-minute load average, divided by the number of
# node_cpu{mode="system"} series for the same alias (presumably one per CPU
# core - confirm against the exporter), stays above 1 for 5 minutes.
# The threshold and hold time match the description below; a `>= 0`
# comparison would be true for every non-negative load value.
- alert: node_high_cpu_load
  expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}'
    summary: '{{$labels.alias}}: Running on high load: {{$value}}'
|
||||
- alert: node_high_memory_load
|
||||
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
|
||||
+ node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Host memory usage is {{ humanize $value }}%. Reported by
|
||||
instance {{ $labels.instance }} of job {{ $labels.job }}.
|
||||
summary: Server memory is almost full
|
||||
- alert: node_high_storage_load
|
||||
expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
|
||||
/ node_filesystem_size{mountpoint="/"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Host storage usage is {{ humanize $value }}%. Reported by
|
||||
instance {{ $labels.instance }} of job {{ $labels.job }}.
|
||||
summary: Server storage is almost full
|
||||
# Warns when used swap (SwapTotal - SwapFree) has exceeded 40% of total swap
# for 1 minute. The comparison must be `>`: with `<` the alert would fire
# whenever swap usage is low, which is the opposite of what the name and
# annotations describe.
- alert: node_high_swap
  expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal * 0.4)
  for: 1m
  labels:
    severity: warning
  annotations:
    description: >-
      Host system has a high swap usage of {{ humanize $value }}. Reported
      by instance {{ $labels.instance }} of job {{ $labels.job }}.
    summary: Server has a high swap usage
|
||||
# Network drop/error alerts for every interface except loopback. Each fires
# when the metric value has been above 3000 for 30 seconds.
# NOTE(review): these metrics look like cumulative counters; confirm whether
# a rate()/increase() over a window was intended instead of the raw value.

# Dropped packets on receive.
- alert: node_high_network_drop_rcv
  expr: node_network_receive_drop{device!="lo"} > 3000
  for: 30s
  labels:
    severity: warning
  annotations:
    description: >-
      Host system has an unusually high drop in network reception
      ({{ humanize $value }}). Reported by instance {{ $labels.instance }}
      of job {{ $labels.job }}
    summary: Server has a high receive drop
# Dropped packets on transmit.
- alert: node_high_network_drop_send
  expr: node_network_transmit_drop{device!="lo"} > 3000
  for: 30s
  labels:
    severity: warning
  annotations:
    description: >-
      Host system has an unusually high drop in network transmission
      ({{ humanize $value }}). Reported by instance {{ $labels.instance }}
      of job {{ $labels.job }}
    summary: Server has a high transmit drop
# Errors on receive.
- alert: node_high_network_errs_rcv
  expr: node_network_receive_errs{device!="lo"} > 3000
  for: 30s
  labels:
    severity: warning
  annotations:
    description: >-
      Host system has an unusually high error rate in network reception
      ({{ humanize $value }}). Reported by instance {{ $labels.instance }}
      of job {{ $labels.job }}
    summary: Server has unusual high reception errors
# Errors on transmit.
- alert: node_high_network_errs_send
  expr: node_network_transmit_errs{device!="lo"} > 3000
  for: 30s
  labels:
    severity: warning
  annotations:
    description: >-
      Host system has an unusually high error rate in network transmission
      ({{ humanize $value }}). Reported by instance {{ $labels.instance }}
      of job {{ $labels.job }}
    summary: Server has unusual high transmission errors
|
||||
- alert: node_network_conntrack_usage_80percent
|
||||
expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
|
||||
summary: '{{$labels.instance}}: available network conntrack entries are low.'
|
||||
- alert: node_entropy_available_low
|
||||
expr: node_entropy_available_bits < 300
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
|
||||
summary: '{{$labels.instance}}: is low on entropy bits.'
|
||||
- alert: node_hwmon_high_cpu_temp
|
||||
expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
|
||||
summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
|
||||
- alert: node_vmstat_paging_rate_high
|
||||
expr: irate(node_vmstat_pgpgin[5m]) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
|
||||
summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
|
||||
- alert: node_xfs_block_allocation_high
|
||||
expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
|
||||
summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
|
||||
- alert: node_network_bond_slaves_down
|
||||
expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
|
||||
summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
|
||||
- alert: node_numa_memory_used
|
||||
expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
|
||||
summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
|
||||
- alert: node_ntp_clock_skew_high
|
||||
expr: abs(node_ntp_drift_seconds) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
|
||||
summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds'
|
||||
- alert: node_disk_read_latency
|
||||
expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.device}} has a high read latency of {{ $value }}'
|
||||
summary: 'High read latency observed for device {{ $labels.device }}'
|
||||
- alert: node_disk_write_latency
|
||||
expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.device}} has a high write latency of {{ $value }}'
|
||||
summary: 'High write latency observed for device {{ $labels.device }}'
|
315
prometheus/values_overrides/openstack.yaml
Normal file
315
prometheus/values_overrides/openstack.yaml
Normal file
@ -0,0 +1,315 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
openstack:
|
||||
groups:
|
||||
- name: mariadb.rules
|
||||
rules:
|
||||
- alert: prom_exporter_mariadb_unavailable
|
||||
expr: absent(mysql_up)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: MariaDB exporter is not collecting metrics or is not available
|
||||
- alert: mariadb_table_lock_wait_high
|
||||
expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Mariadb has high table lock waits of {{ $value }} percentage'
|
||||
summary: 'Mariadb table lock waits are high'
|
||||
- alert: mariadb_node_not_ready
|
||||
expr: mysql_global_status_wsrep_ready != 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
|
||||
summary: 'Galera cluster node not ready'
|
||||
- alert: mariadb_galera_node_out_of_sync
|
||||
expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
|
||||
summary: 'Galera cluster node out of sync'
|
||||
- alert: mariadb_innodb_replication_fallen_behind
|
||||
expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'The mysql innodb replication has fallen behind and is not recovering'
|
||||
summary: 'MySQL innodb replication is lagging'
|
||||
- name: openstack.rules
|
||||
rules:
|
||||
- alert: prom_exporter_openstack_unavailable
|
||||
expr: absent(openstack_exporter_cache_refresh_duration_seconds)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: Openstack exporter is not collecting metrics or is not available
|
||||
- alert: os_glance_api_availability
|
||||
expr: openstack_check_glance_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Glance API is not available at {{$labels.url}}'
|
||||
- alert: os_nova_api_availability
|
||||
expr: openstack_check_nova_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Nova API is not available at {{$labels.url}}'
|
||||
- alert: os_keystone_api_availability
|
||||
expr: openstack_check_keystone_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Keystone API is not available at {{$labels.url}}'
|
||||
- alert: os_neutron_api_availability
|
||||
expr: openstack_check_neutron_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Neutron API is not available at {{$labels.url}}'
|
||||
- alert: os_neutron_metadata_agent_availability
|
||||
expr: openstack_services_neutron_metadata_agent_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
|
||||
summary: 'One or more neutron metadata_agents are not available'
|
||||
- alert: os_neutron_openvswitch_agent_availability
|
||||
expr: openstack_services_neutron_openvswitch_agent_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
|
||||
summary: 'One or more neutron openvswitch agents are not available'
|
||||
- alert: os_neutron_dhcp_agent_availability
|
||||
expr: openstack_services_neutron_dhcp_agent_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
|
||||
summary: 'One or more neutron dhcp agents are not available'
|
||||
- alert: os_neutron_l3_agent_availability
|
||||
expr: openstack_services_neutron_l3_agent_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'One or more neutron L3 agents are not available for more than 5 minutes'
|
||||
summary: 'One or more neutron L3 agents are not available'
|
||||
- alert: os_swift_api_availability
|
||||
expr: openstack_check_swift_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Swift API is not available at {{$labels.url}}'
|
||||
- alert: os_cinder_api_availability
|
||||
expr: openstack_check_cinder_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Cinder API is not available at {{$labels.url}}'
|
||||
- alert: os_cinder_scheduler_availability
|
||||
expr: openstack_services_cinder_cinder_scheduler != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Cinder scheduler is not available for more than 5 minutes'
|
||||
summary: 'Cinder scheduler is not available'
|
||||
- alert: os_heat_api_availability
|
||||
expr: openstack_check_heat_api != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
|
||||
summary: 'Heat API is not available at {{$labels.url}}'
|
||||
- alert: os_nova_compute_disabled
|
||||
expr: openstack_services_nova_compute_disabled_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-compute is disabled on some hosts'
|
||||
- alert: os_nova_conductor_disabled
|
||||
expr: openstack_services_nova_conductor_disabled_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
|
||||
- alert: os_nova_consoleauth_disabled
|
||||
expr: openstack_services_nova_consoleauth_disabled_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
|
||||
- alert: os_nova_scheduler_disabled
|
||||
expr: openstack_services_nova_scheduler_disabled_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
|
||||
- alert: os_nova_compute_down
|
||||
expr: openstack_services_nova_compute_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-compute is down on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-compute is down on some hosts'
|
||||
- alert: os_nova_conductor_down
|
||||
expr: openstack_services_nova_conductor_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-conductor is down on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-conductor is down on some hosts'
|
||||
- alert: os_nova_consoleauth_down
|
||||
expr: openstack_services_nova_consoleauth_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-consoleauth is down on some hosts'
|
||||
- alert: os_nova_scheduler_down
|
||||
expr: openstack_services_nova_scheduler_down_total > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
|
||||
summary: 'Openstack compute service nova-scheduler is down on some hosts'
|
||||
# Capacity alerts for OpenStack VM resources. Each computes
# used * 100 / (used + free) and pages when the result has been above 80
# for 5 minutes.

# vCPU utilisation.
- alert: os_vm_vcpu_usage_high
  expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'Openstack VM vcpu usage is high at {{$value}} percent'
    summary: 'Openstack VM vcpu usage is high'
# RAM utilisation.
- alert: os_vm_ram_usage_high
  expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'Openstack VM RAM usage is high at {{$value}} percent'
    summary: 'Openstack VM RAM usage is high'
# Disk utilisation.
- alert: os_vm_disk_usage_high
  expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'Openstack VM Disk usage is high at {{$value}} percent'
    summary: 'Openstack VM Disk usage is high'
|
||||
- name: rabbitmq.rules
|
||||
rules:
|
||||
- alert: rabbitmq_network_pratitions_detected
|
||||
expr: min(partitions) by(instance) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
|
||||
summary: 'RabbitMQ Network partitions detected'
|
||||
- alert: rabbitmq_down
|
||||
expr: min(rabbitmq_up) by(instance) != 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
|
||||
summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins'
|
||||
- alert: rabbitmq_file_descriptor_usage_high
|
||||
expr: fd_used * 100 /fd_total > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
|
||||
summary: 'RabbitMQ file descriptors usage is high for last 10 mins'
|
||||
- alert: rabbitmq_node_disk_free_alarm
|
||||
expr: node_disk_free_alarm > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
|
||||
summary: 'RabbitMQ disk space usage is high'
|
||||
- alert: rabbitmq_node_memory_alarm
|
||||
expr: node_mem_alarm > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
|
||||
summary: 'RabbitMQ memory usage is high'
|
||||
# Warns when the `running` metric has been below 3 for 10 minutes.
# NOTE(review): assumes `running` reports a cluster-wide node count rather
# than a per-node 0/1 flag - confirm against the exporter in use.
- alert: rabbitmq_less_than_3_nodes
  expr: running < 3
  for: 10m
  labels:
    severity: warning
  annotations:
    description: 'RabbitMQ Server has less than 3 nodes running.'
    summary: 'RabbitMQ server is at risk of losing data'
# Warns when returned messages have exceeded 50 percent of published
# messages (returned / published * 100) for 5 minutes.
- alert: rabbitmq_queue_messages_returned_high
  expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
  for: 5m
  labels:
    severity: warning
  annotations:
    description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
    summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
|
||||
- alert: rabbitmq_consumers_low_utilization
|
||||
expr: queue_consumer_utilisation < .4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ consumers message consumption speed is low'
|
||||
summary: 'RabbitMQ consumers message consumption speed is low'
|
||||
- alert: rabbitmq_high_message_load
|
||||
expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.'
|
||||
summary: 'RabbitMQ has high message load'
|
39
prometheus/values_overrides/postgresql.yaml
Normal file
39
prometheus/values_overrides/postgresql.yaml
Normal file
@ -0,0 +1,39 @@
|
||||
conf:
|
||||
prometheus:
|
||||
rules:
|
||||
postgresql:
|
||||
groups:
|
||||
- name: postgresql.rules
|
||||
rules:
|
||||
- alert: prom_exporter_postgresql_unavailable
|
||||
expr: absent(pg_static)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
|
||||
title: postgresql exporter is not collecting metrics or is not available
|
||||
- alert: pg_replication_fallen_behind
|
||||
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
|
||||
title: Postgres Replication lag is over 2 minutes
|
||||
# Warns when the summed pg_stat_activity_count per (environment, fqdn) has
# exceeded 95% of pg_settings_max_connections for the same fqdn for 5 minutes.
# Severity uses the `warning` spelling shared by the other rules so that
# severity-based matching treats this alert like its siblings.
- alert: pg_connections_too_high
  expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
  for: 5m
  labels:
    severity: warning
    channel: database
  annotations:
    title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
|
||||
# Warns when the per-second deadlock rate (over a 1 minute window) for a
# database has been above zero for 5 minutes.
# The aggregation keeps `instance` as well as `datname` because the
# description interpolates both labels; aggregating by `datname` alone drops
# `instance` and renders it as an empty string.
# Severity uses the `warning` spelling shared by the other rules.
- alert: pg_deadlocks_detected
  expr: sum by(instance, datname) (rate(pg_stat_database_deadlocks[1m])) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}
    title: Postgres server is experiencing deadlocks
|
1
tools/deployment/federated-monitoring/000-install-packages.sh
Symbolic link
1
tools/deployment/federated-monitoring/000-install-packages.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/000-install-packages.sh
|
1
tools/deployment/federated-monitoring/005-deploy-k8s.sh
Symbolic link
1
tools/deployment/federated-monitoring/005-deploy-k8s.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/005-deploy-k8s.sh
|
1
tools/deployment/federated-monitoring/010-ingress.sh
Symbolic link
1
tools/deployment/federated-monitoring/010-ingress.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/020-ingress.sh
|
1
tools/deployment/federated-monitoring/020-nfs-provisioner.sh
Symbolic link
1
tools/deployment/federated-monitoring/020-nfs-provisioner.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../osh-infra-monitoring/030-nfs-provisioner.sh
|
1
tools/deployment/federated-monitoring/030-ldap.sh
Symbolic link
1
tools/deployment/federated-monitoring/030-ldap.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/040-ldap.sh
|
1
tools/deployment/federated-monitoring/040-kube-state-metrics.sh
Symbolic link
1
tools/deployment/federated-monitoring/040-kube-state-metrics.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/070-kube-state-metrics.sh
|
1
tools/deployment/federated-monitoring/050-node-exporter.sh
Symbolic link
1
tools/deployment/federated-monitoring/050-node-exporter.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/080-node-exporter.sh
|
68
tools/deployment/federated-monitoring/060-prometheus.sh
Executable file
68
tools/deployment/federated-monitoring/060-prometheus.sh
Executable file
@ -0,0 +1,68 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2017 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
set -xe
|
||||
|
||||
#NOTE: Lint and package chart
|
||||
make prometheus
|
||||
|
||||
tee /tmp/prometheus-one.yaml << EOF
|
||||
endpoints:
|
||||
monitoring:
|
||||
hosts:
|
||||
default: prom-metrics-one
|
||||
public: prometheus-one
|
||||
manifests:
|
||||
network_policy: false
|
||||
EOF
|
||||
|
||||
tee /tmp/prometheus-two.yaml << EOF
|
||||
endpoints:
|
||||
monitoring:
|
||||
hosts:
|
||||
default: prom-metrics-two
|
||||
public: prometheus-two
|
||||
manifests:
|
||||
network_policy: false
|
||||
EOF
|
||||
|
||||
tee /tmp/prometheus-three.yaml << EOF
|
||||
endpoints:
|
||||
monitoring:
|
||||
hosts:
|
||||
default: prom-metrics-three
|
||||
public: prometheus-three
|
||||
manifests:
|
||||
network_policy: false
|
||||
EOF
|
||||
#NOTE: Deploy command
# Build one --values flag per file found in the chart's values_overrides
# directory. The list is identical for every release, so it is assembled once
# before the loop. A glob is used instead of parsing `ls` output; the -e guard
# skips the unexpanded pattern when the directory is empty or missing.
rules_overrides=""
for rules_file in ./prometheus/values_overrides/*; do
  [ -e "$rules_file" ] || continue
  rules_overrides="$rules_overrides --values=$rules_file"
done

# Deploy, wait for, and validate each Prometheus instance in turn, using the
# per-release values file written to /tmp above.
# NOTE(review): $release already carries the "prometheus-" prefix, so the helm
# release names come out as "prometheus-prometheus-one" etc. -- confirm this
# is intended before changing it.
for release in prometheus-one prometheus-two prometheus-three; do
  # $rules_overrides is deliberately unquoted so it splits into separate flags.
  helm upgrade --install prometheus-$release ./prometheus \
      --namespace=osh-infra \
      --values=/tmp/$release.yaml \
      $rules_overrides

  #NOTE: Wait for deploy
  ./tools/deployment/common/wait-for-pods.sh osh-infra

  #NOTE: Validate Deployment info
  helm status prometheus-$release

  helm test prometheus-$release
done
|
66
tools/deployment/federated-monitoring/070-federated-prometheus.sh
Executable file
66
tools/deployment/federated-monitoring/070-federated-prometheus.sh
Executable file
@ -0,0 +1,66 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2017 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
set -xe
|
||||
|
||||
tee /tmp/federated-prometheus.yaml << EOF
|
||||
endpoints:
|
||||
monitoring:
|
||||
hosts:
|
||||
default: prom-metrics-federate
|
||||
public: prometheus-federate
|
||||
manifests:
|
||||
network_policy: false
|
||||
conf:
|
||||
prometheus:
|
||||
scrape_configs:
|
||||
template: |
|
||||
global:
|
||||
scrape_interval: 60s
|
||||
evaluation_interval: 60s
|
||||
scrape_configs:
|
||||
- job_name: 'federate'
|
||||
scrape_interval: 15s
|
||||
|
||||
honor_labels: true
|
||||
metrics_path: '/federate'
|
||||
|
||||
params:
|
||||
'match[]':
|
||||
- '{__name__=~".+"}'
|
||||
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'prometheus-one.osh-infra.svc.cluster.local:80'
|
||||
- 'prometheus-two.osh-infra.svc.cluster.local:80'
|
||||
- 'prometheus-three.osh-infra.svc.cluster.local:80'
|
||||
EOF
|
||||
|
||||
#NOTE: Lint and package chart
|
||||
make prometheus
|
||||
|
||||
#NOTE: Deploy command
|
||||
helm upgrade --install federated-prometheus ./prometheus \
|
||||
--namespace=osh-infra \
|
||||
--values=/tmp/federated-prometheus.yaml
|
||||
|
||||
#NOTE: Wait for deploy
|
||||
./tools/deployment/common/wait-for-pods.sh osh-infra
|
||||
|
||||
#NOTE: Validate Deployment info
|
||||
helm status federated-prometheus
|
||||
|
||||
helm test federated-prometheus
|
33
tools/deployment/federated-monitoring/100-prometheus-selenium.sh
Executable file
33
tools/deployment/federated-monitoring/100-prometheus-selenium.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -xe
|
||||
|
||||
export CHROMEDRIVER="${CHROMEDRIVER:="/etc/selenium/chromedriver"}"
|
||||
export ARTIFACTS_DIR="${ARTIFACTS_DIR:="/tmp/artifacts/"}"
|
||||
|
||||
export PROMETHEUS_USER="admin"
|
||||
export PROMETHEUS_PASSWORD="changeme"
|
||||
|
||||
export PROMETHEUS_URI="prometheus-one.osh-infra.svc.cluster.local"
|
||||
python3 tools/gate/selenium/prometheusSelenium.py
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_One_Command_Line_Flags.png
|
||||
# Use the same "/" separator as the sibling mv commands so the source path is
# still correct when ARTIFACTS_DIR is overridden without a trailing slash.
mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_One_Dashboard.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_One_Runtime_Info.png
|
||||
|
||||
export PROMETHEUS_URI="prometheus-two.osh-infra.svc.cluster.local"
|
||||
python3 tools/gate/selenium/prometheusSelenium.py
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Two_Command_Line_Flags.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Two_Dashboard.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Two_Runtime_Info.png
|
||||
|
||||
export PROMETHEUS_URI="prometheus-three.osh-infra.svc.cluster.local"
|
||||
python3 tools/gate/selenium/prometheusSelenium.py
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Three_Command_Line_Flags.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Three_Dashboard.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Three_Runtime_Info.png
|
||||
|
||||
export PROMETHEUS_URI="prometheus-federate.osh-infra.svc.cluster.local"
|
||||
python3 tools/gate/selenium/prometheusSelenium.py
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Federated_Command_Line_Flags.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Federated_Dashboard.png
|
||||
mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Federated_Runtime_Info.png
|
@ -19,9 +19,15 @@ set -xe
|
||||
#NOTE: Lint and package chart
|
||||
make prometheus
|
||||
|
||||
# Build one --values flag per file found in the chart's values_overrides
# directory; the resulting string is passed to the helm command below.
# A glob is used instead of parsing `ls` output; the -e guard skips the
# unexpanded pattern when the directory is empty or missing.
rules_overrides=""
for rules_file in ./prometheus/values_overrides/*; do
  [ -e "$rules_file" ] || continue
  rules_overrides="$rules_overrides --values=$rules_file"
done
|
||||
|
||||
#NOTE: Deploy command
|
||||
helm upgrade --install prometheus ./prometheus \
|
||||
--namespace=osh-infra
|
||||
--namespace=osh-infra \
|
||||
$rules_overrides
|
||||
|
||||
#NOTE: Wait for deploy
|
||||
./tools/deployment/common/wait-for-pods.sh osh-infra
|
||||
|
@ -169,6 +169,29 @@
|
||||
- ./tools/deployment/osh-infra-monitoring/610-prometheus-selenium.sh || true
|
||||
- ./tools/deployment/osh-infra-monitoring/620-nagios-selenium.sh || true
|
||||
|
||||
- job:
|
||||
name: openstack-helm-infra-federated-monitoring
|
||||
parent: openstack-helm-infra-functional
|
||||
timeout: 7200
|
||||
pre-run:
|
||||
- playbooks/osh-infra-upgrade-host.yaml
|
||||
- playbooks/osh-infra-deploy-selenium.yaml
|
||||
run: playbooks/osh-infra-gate-runner.yaml
|
||||
post-run: playbooks/osh-infra-collect-logs.yaml
|
||||
nodeset: openstack-helm-single-node
|
||||
vars:
|
||||
gate_scripts:
|
||||
- ./tools/deployment/federated-monitoring/000-install-packages.sh
|
||||
- ./tools/deployment/federated-monitoring/005-deploy-k8s.sh
|
||||
- ./tools/deployment/federated-monitoring/010-ingress.sh
|
||||
- ./tools/deployment/federated-monitoring/020-nfs-provisioner.sh
|
||||
- ./tools/deployment/federated-monitoring/030-ldap.sh
|
||||
- ./tools/deployment/federated-monitoring/040-kube-state-metrics.sh
|
||||
- ./tools/deployment/federated-monitoring/050-node-exporter.sh
|
||||
- ./tools/deployment/federated-monitoring/060-prometheus.sh
|
||||
- ./tools/deployment/federated-monitoring/070-federated-prometheus.sh
|
||||
- ./tools/deployment/federated-monitoring/100-prometheus-selenium.sh || true
|
||||
|
||||
- job:
|
||||
name: openstack-helm-infra-aio-network-policy
|
||||
parent: openstack-helm-infra-functional
|
||||
|
@ -21,6 +21,8 @@
|
||||
- openstack-helm-lint
|
||||
- openstack-helm-infra-aio-logging
|
||||
- openstack-helm-infra-aio-monitoring
|
||||
- openstack-helm-infra-federated-monitoring:
|
||||
voting: false
|
||||
- openstack-helm-infra-aio-network-policy:
|
||||
voting: false
|
||||
- openstack-helm-infra-openstack-support
|
||||
|
Loading…
x
Reference in New Issue
Block a user