From a31bb2b04918107a08cb14201ae72f2b5696cb9d Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Tue, 2 Jul 2019 15:27:08 -0500 Subject: [PATCH] Add node-problem-detector chart This adds a chart for the node problem detector. This chart will help provide additional insight into the status of the underlying infrastructure of a deployment. Updated the chart with new yamllint checks. Change-Id: I21a24b67b121388107b20ab38ac7703c7a33f1c1 Signed-off-by: Steve Wilkerson --- kubernetes-node-problem-detector/Chart.yaml | 24 + .../requirements.yaml | 18 + .../bin/_node-problem-detector.sh.tpl | 25 + .../templates/configmap-bin.yaml | 36 ++ .../templates/configmap-etc.yaml | 31 ++ .../templates/daemonset.yaml | 135 +++++ .../templates/job-image-repo-sync.yaml | 18 + .../templates/service.yaml | 38 ++ kubernetes-node-problem-detector/values.yaml | 465 ++++++++++++++++++ .../common/node-problem-detector.sh | 38 ++ .../multinode/075-node-problem-detector.sh | 1 + .../075-node-problem-detector.sh | 1 + zuul.d/jobs.yaml | 2 + 13 files changed, 832 insertions(+) create mode 100644 kubernetes-node-problem-detector/Chart.yaml create mode 100644 kubernetes-node-problem-detector/requirements.yaml create mode 100644 kubernetes-node-problem-detector/templates/bin/_node-problem-detector.sh.tpl create mode 100644 kubernetes-node-problem-detector/templates/configmap-bin.yaml create mode 100644 kubernetes-node-problem-detector/templates/configmap-etc.yaml create mode 100644 kubernetes-node-problem-detector/templates/daemonset.yaml create mode 100644 kubernetes-node-problem-detector/templates/job-image-repo-sync.yaml create mode 100644 kubernetes-node-problem-detector/templates/service.yaml create mode 100644 kubernetes-node-problem-detector/values.yaml create mode 100755 tools/deployment/common/node-problem-detector.sh create mode 120000 tools/deployment/multinode/075-node-problem-detector.sh create mode 120000 tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh diff --git 
a/kubernetes-node-problem-detector/Chart.yaml b/kubernetes-node-problem-detector/Chart.yaml new file mode 100644 index 000000000..4064a32b9 --- /dev/null +++ b/kubernetes-node-problem-detector/Chart.yaml @@ -0,0 +1,24 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: v1 +description: OpenStack-Helm Kubernetes Node Problem Detector +name: kubernetes-node-problem-detector +version: 0.1.0 +home: https://github.com/kubernetes/node-problem-detector +sources: + - https://github.com/kubernetes/node-problem-detector + - https://opendev.org/openstack/openstack-helm-infra +maintainers: + - name: OpenStack-Helm Authors +... diff --git a/kubernetes-node-problem-detector/requirements.yaml b/kubernetes-node-problem-detector/requirements.yaml new file mode 100644 index 000000000..efd01ef7a --- /dev/null +++ b/kubernetes-node-problem-detector/requirements.yaml @@ -0,0 +1,18 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +dependencies: + - name: helm-toolkit + repository: http://localhost:8879/charts + version: 0.1.0 +... diff --git a/kubernetes-node-problem-detector/templates/bin/_node-problem-detector.sh.tpl b/kubernetes-node-problem-detector/templates/bin/_node-problem-detector.sh.tpl new file mode 100644 index 000000000..86b4ac08f --- /dev/null +++ b/kubernetes-node-problem-detector/templates/bin/_node-problem-detector.sh.tpl @@ -0,0 +1,25 @@ +#!/bin/sh +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +set -ex + +exec /node-problem-detector \ + {{- range $monitor, $monitorConfig := .Values.conf.monitors }} + {{- if $monitorConfig.enabled }} + --config.{{$monitor}}={{ include "helm-toolkit.utils.joinListWithComma" $monitorConfig.enabled }} \ + {{- end }} + {{- end }} + --logtostderr \ + --prometheus-address=0.0.0.0 diff --git a/kubernetes-node-problem-detector/templates/configmap-bin.yaml b/kubernetes-node-problem-detector/templates/configmap-bin.yaml new file mode 100644 index 000000000..83531d1a4 --- /dev/null +++ b/kubernetes-node-problem-detector/templates/configmap-bin.yaml @@ -0,0 +1,36 @@ +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if .Values.manifests.configmap_bin }} +{{- $envAll := . }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-problem-detector-bin +data: + node-problem-detector.sh: | +{{ tuple "bin/_node-problem-detector.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + image-repo-sync.sh: | +{{- include "helm-toolkit.scripts.image_repo_sync" . | indent 4 }} +{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }} +{{- $scripts := $monitorConfig.scripts }} +{{- range $script, $scriptSource := $scripts.source }} +{{- if has $script $scripts.enabled }} + {{$script}}: | +{{$scriptSource | indent 4 -}} +{{- end }} +{{- end -}} +{{- end -}} +{{- end }} diff --git a/kubernetes-node-problem-detector/templates/configmap-etc.yaml b/kubernetes-node-problem-detector/templates/configmap-etc.yaml new file mode 100644 index 000000000..1afae8faf --- /dev/null +++ b/kubernetes-node-problem-detector/templates/configmap-etc.yaml @@ -0,0 +1,31 @@ +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if .Values.manifests.configmap_etc }} + +{{- $envAll := . 
}} +--- +apiVersion: v1 +kind: Secret +metadata: + name: node-problem-detector-etc +type: Opaque +data: +{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }} +{{- $plugins := $monitorConfig.config }} +{{- range $plugin, $config := $plugins }} + {{$plugin}}.json: {{ toJson $config | b64enc }} +{{- end }} +{{ end }} +{{- end }} diff --git a/kubernetes-node-problem-detector/templates/daemonset.yaml b/kubernetes-node-problem-detector/templates/daemonset.yaml new file mode 100644 index 000000000..c0ac0fdd5 --- /dev/null +++ b/kubernetes-node-problem-detector/templates/daemonset.yaml @@ -0,0 +1,135 @@ +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if .Values.manifests.daemonset }} +{{- $envAll := . 
}} + +{{- $serviceAccountName := printf "%s-%s" .Release.Name "node-problem-detector" }} +{{ tuple $envAll "node_problem_detector" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: run-node-problem-detector +subjects: + - kind: ServiceAccount + name: {{ $serviceAccountName }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + annotations: + {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }} + labels: +{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }} +spec: + selector: + matchLabels: +{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 6 }} +{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_upgrades_daemonset" | indent 2 }} + template: + metadata: + labels: +{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }} + annotations: +{{- if .Values.monitoring.prometheus.pod.enabled }} +{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }} +{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_pod_annotations" | indent 8 }} +{{- end }} +{{ dict "envAll" $envAll "podName" "node-problem-detector" "containerNames" (list "node-problem-detector") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }} +{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" | indent 8 }} + configmap-bin-hash: {{ tuple "configmap-bin.yaml" .
| include "helm-toolkit.utils.hash" }} + spec: +{{ dict "envAll" $envAll "application" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }} + serviceAccountName: {{ $serviceAccountName }} +{{ if .Values.pod.tolerations.node_problem_detector.enabled }} +{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_tolerations" | indent 6 }} +{{ else }} + nodeSelector: + {{ .Values.labels.node_problem_detector.node_selector_key }}: {{ .Values.labels.node_problem_detector.node_selector_value | quote }} +{{ end }} + containers: + - name: node-problem-detector +{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.image" | indent 10 }} +{{ tuple $envAll $envAll.Values.pod.resources.node_problem_detector | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }} +{{ dict "envAll" $envAll "application" "node_problem_detector" "container" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }} + command: + - /tmp/node-problem-detector.sh + ports: + - name: metrics + containerPort: {{ tuple "node_problem_detector" "internal" "metrics" .
| include "helm-toolkit.endpoints.endpoint_port_lookup" }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + readOnly: true + - name: kmsg + mountPath: /dev/kmsg + readOnly: true + - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: node-problem-detector-bin + mountPath: /tmp/node-problem-detector.sh + subPath: node-problem-detector.sh + readOnly: true + {{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }} + {{- $scripts := $monitorConfig.scripts }} + {{- range $script, $scriptSource := $scripts.source }} + {{- if has $script $scripts.enabled }} + - name: node-problem-detector-bin + mountPath: /config/plugin/{{$script}} + subPath: {{$script}} + {{- end }} + {{- end }} + {{- end }} + {{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }} + {{- $plugins := $monitorConfig.config }} + {{- range $plugin, $config := $plugins }} + - name: node-problem-detector-etc + mountPath: /config/{{$plugin}}.json + subPath: {{$plugin}}.json + {{- end }} + {{- end }} + volumes: + - name: pod-tmp + emptyDir: {} + - name: log + hostPath: + path: /var/log + - name: kmsg + hostPath: + path: /dev/kmsg + - name: localtime + hostPath: + path: /etc/localtime + - name: node-problem-detector-etc + secret: + secretName: node-problem-detector-etc + defaultMode: 292 + - name: node-problem-detector-bin + configMap: + name: node-problem-detector-bin + defaultMode: 365 +{{- end }} diff --git a/kubernetes-node-problem-detector/templates/job-image-repo-sync.yaml b/kubernetes-node-problem-detector/templates/job-image-repo-sync.yaml new file mode 100644 index 000000000..c28a7d379 --- /dev/null +++ b/kubernetes-node-problem-detector/templates/job-image-repo-sync.yaml @@ -0,0 +1,18 @@ +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if and .Values.manifests.job_image_repo_sync .Values.images.local_registry.active }} +{{- $imageRepoSyncJob := dict "envAll" . "serviceName" "node-problem-detector" -}} +{{ $imageRepoSyncJob | include "helm-toolkit.manifests.job_image_repo_sync" }} +{{- end }} diff --git a/kubernetes-node-problem-detector/templates/service.yaml b/kubernetes-node-problem-detector/templates/service.yaml new file mode 100644 index 000000000..ef13af4b0 --- /dev/null +++ b/kubernetes-node-problem-detector/templates/service.yaml @@ -0,0 +1,38 @@ +{{/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if .Values.manifests.service }} +{{- $envAll := . }} +{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ tuple "node_problem_detector" "internal" . 
| include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }} + labels: +{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }} + annotations: +{{- if .Values.monitoring.prometheus.service.enabled }} +{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_service_annotations" | indent 4 }} +{{- end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - name: metrics + port: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} + targetPort: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} + selector: +{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }} +{{- end }} diff --git a/kubernetes-node-problem-detector/values.yaml b/kubernetes-node-problem-detector/values.yaml new file mode 100644 index 000000000..7ddb81eda --- /dev/null +++ b/kubernetes-node-problem-detector/values.yaml @@ -0,0 +1,465 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default values for node-problem-detector. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates.
+ +--- +images: + tags: + node_problem_detector: k8s.gcr.io/node-problem-detector:v0.7.0 + dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0 + image_repo_sync: docker.io/docker:17.07.0 + pull_policy: IfNotPresent + local_registry: + active: false + exclude: + - dep_check + - image_repo_sync + +labels: + node_problem_detector: + node_selector_key: openstack-control-plane + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + +pod: + security_context: + node_problem_detector: + container: + node_problem_detector: + privileged: true + affinity: + anti: + type: + default: preferredDuringSchedulingIgnoredDuringExecution + topologyKey: + default: kubernetes.io/hostname + mounts: + node_problem_detector: + node_problem_detector: + init_container: null + lifecycle: + upgrades: + daemonsets: + pod_replacement_strategy: RollingUpdate + node_problem_detector: + enabled: true + min_ready_seconds: 0 + revision_history: 3 + pod_replacement_strategy: RollingUpdate + rolling_update: + max_unavailable: 1 + max_surge: 3 + termination_grace_period: + node_problem_detector: + timeout: 30 + resources: + enabled: false + node_problem_detector: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + jobs: + image_repo_sync: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "1024Mi" + cpu: "2000m" + tolerations: + node_problem_detector: + enabled: false + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + - key: node-role.kubernetes.io/node + operator: Exists +dependencies: + dynamic: + common: + local_image_registry: + jobs: + - node-problem-detector-image-repo-sync + services: + - endpoint: node + service: local_image_registry + static: + image_repo_sync: + services: + - endpoint: internal + service: local_image_registry + node_problem_detector: + services: null + +monitoring: + prometheus: + pod: + enabled: true + service: + enabled: false +
node_problem_detector: + scrape: true + port: 20257 + +endpoints: + cluster_domain_suffix: cluster.local + local_image_registry: + name: docker-registry + namespace: docker-registry + hosts: + default: localhost + internal: docker-registry + node: localhost + host_fqdn_override: + default: null + port: + registry: + node: 5000 + node_problem_detector: + name: node-problem-detector + namespace: null + hosts: + default: node-problem-detector + host_fqdn_override: + default: null + path: + default: null + port: + metrics: + default: 20257 + +manifests: + configmap_bin: true + configmap_etc: true + daemonset: true + job_image_repo_sync: true + service: false + +conf: + monitors: + system-log-monitor: + enabled: + - /config/kernel-monitor.json + - /config/docker-monitor.json + - /config/systemd-monitor.json + scripts: + enabled: null + source: null + config: + kernel-monitor: + plugin: kmsg + logPath: "/dev/kmsg" + lookback: 5m + bufferSize: 10 + source: kernel-monitor + conditions: + - type: KernelDeadlock + reason: KernelHasNoDeadlock + message: kernel has no deadlock + - type: ReadonlyFilesystem + reason: FilesystemIsNotReadOnly + message: Filesystem is not read-only + rules: + - type: temporary + reason: OOMKilling + pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+ + (.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.* + - type: temporary + reason: TaskHung + pattern: task \S+:\w+ blocked for more than \w+ seconds\. + - type: temporary + reason: UnregisterNetDevice + pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+' + - type: temporary + reason: KernelOops + pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*' + - type: temporary + reason: KernelOops + pattern: 'divide error: 0000 \[#\d+\] SMP' + - type: permanent + condition: KernelDeadlock + reason: AUFSUmountHung + pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\. 
+ - type: permanent + condition: KernelDeadlock + reason: DockerHung + pattern: task docker:\w+ blocked for more than \w+ seconds\. + - type: permanent + condition: ReadonlyFilesystem + reason: FilesystemIsReadOnly + pattern: Remounting filesystem read-only + kernel-monitor-filelog: + plugin: filelog + pluginConfig: + timestamp: "^.{15}" + message: 'kernel: \[.*\] (.*)' + timestampFormat: Jan _2 15:04:05 + logPath: "/var/log/kern.log" + lookback: 5m + bufferSize: 10 + source: kernel-monitor + conditions: + - type: KernelDeadlock + reason: KernelHasNoDeadlock + message: kernel has no deadlock + rules: + - type: temporary + reason: OOMKilling + pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+ + (.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.* + - type: temporary + reason: TaskHung + pattern: task \S+:\w+ blocked for more than \w+ seconds\. + - type: temporary + reason: UnregisterNetDevice + pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+' + - type: temporary + reason: KernelOops + pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*' + - type: temporary + reason: KernelOops + pattern: 'divide error: 0000 \[#\d+\] SMP' + - type: permanent + condition: KernelDeadlock + reason: AUFSUmountHung + pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\. + - type: permanent + condition: KernelDeadlock + reason: DockerHung + pattern: task docker:\w+ blocked for more than \w+ seconds\. 
+ kernel-monitor-counter: + plugin: custom + pluginConfig: + invoke_interval: 5m + timeout: 1m + max_output_length: 80 + concurrency: 1 + source: kernel-monitor + conditions: + - type: FrequentUnregisterNetDevice + reason: NoFrequentUnregisterNetDevice + message: node is functioning properly + rules: + - type: permanent + condition: FrequentUnregisterNetDevice + reason: UnregisterNetDevice + path: "/home/kubernetes/bin/log-counter" + args: + - "--journald-source=kernel" + - "--log-path=/var/log/journal" + - "--lookback=20m" + - "--count=3" + - "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count + = \\d+" + timeout: 1m + docker-monitor: + plugin: journald + pluginConfig: + source: dockerd + logPath: "/var/log/journal" + lookback: 5m + bufferSize: 10 + source: docker-monitor + conditions: [] + rules: + - type: temporary + reason: CorruptDockerImage + pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) + /var/lib/docker/image/(.+): directory not empty.*' + docker-monitor-filelog: + plugin: filelog + pluginConfig: + timestamp: ^time="(\S*)" + message: |- + msg="([^ + ]*)" + timestampFormat: '2006-01-02T15:04:05.999999999-07:00' + logPath: "/var/log/docker.log" + lookback: 5m + bufferSize: 10 + source: docker-monitor + conditions: [] + rules: + - type: temporary + reason: CorruptDockerImage + pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) + /var/lib/docker/image/(.+): directory not empty.*' + docker-monitor-counter: + plugin: custom + pluginConfig: + invoke_interval: 5m + timeout: 1m + max_output_length: 80 + concurrency: 1 + source: docker-monitor + conditions: + - type: CorruptDockerOverlay2 + reason: NoCorruptDockerOverlay2 + message: docker overlay2 is functioning properly + rules: + - type: permanent + condition: CorruptDockerOverlay2 + reason: CorruptDockerOverlay2 + path: "/home/kubernetes/bin/log-counter" + args: + - "--journald-source=dockerd" 
+ - "--log-path=/var/log/journal" + - "--lookback=5m" + - "--count=10" + - "--pattern=returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*" + timeout: 1m + systemd-monitor: + plugin: journald + pluginConfig: + source: systemd + logPath: "/var/log/journal" + lookback: '' + bufferSize: 10 + source: systemd-monitor + conditions: [] + rules: + - type: temporary + reason: KubeletStart + pattern: Started Kubernetes kubelet. + - type: temporary + reason: DockerStart + pattern: Starting Docker Application Container Engine... + - type: temporary + reason: ContainerdStart + pattern: Starting containerd container runtime... + systemd-monitor-counter: + plugin: custom + pluginConfig: + invoke_interval: 5m + timeout: 1m + max_output_length: 80 + concurrency: 1 + source: systemd-monitor + conditions: + - type: FrequentKubeletRestart + reason: NoFrequentKubeletRestart + message: kubelet is functioning properly + - type: FrequentDockerRestart + reason: NoFrequentDockerRestart + message: docker is functioning properly + - type: FrequentContainerdRestart + reason: NoFrequentContainerdRestart + message: containerd is functioning properly + rules: + - type: permanent + condition: FrequentKubeletRestart + reason: FrequentKubeletRestart + path: "/home/kubernetes/bin/log-counter" + args: + - "--journald-source=systemd" + - "--log-path=/var/log/journal" + - "--lookback=20m" + - "--delay=5m" + - "--count=5" + - "--pattern=Started Kubernetes kubelet." + timeout: 1m + - type: permanent + condition: FrequentDockerRestart + reason: FrequentDockerRestart + path: "/home/kubernetes/bin/log-counter" + args: + - "--journald-source=systemd" + - "--log-path=/var/log/journal" + - "--lookback=20m" + - "--count=5" + - "--pattern=Starting Docker Application Container Engine..." 
+ timeout: 1m + - type: permanent + condition: FrequentContainerdRestart + reason: FrequentContainerdRestart + path: "/home/kubernetes/bin/log-counter" + args: + - "--journald-source=systemd" + - "--log-path=/var/log/journal" + - "--lookback=20m" + - "--count=5" + - "--pattern=Starting containerd container runtime..." + timeout: 1m + custom-plugin-monitor: + enabled: + - /config/network-problem-monitor.json + scripts: + enabled: + - network_problem.sh + source: + network_problem.sh: | + #!/bin/bash + + # This plugin checks for common network issues. Currently, it only checks + # if the conntrack table is full. + + OK=0 + NONOK=1 + UNKNOWN=2 + + [ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_max ] || exit $UNKNOWN + [ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_count ] || exit $UNKNOWN + + conntrack_max=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_max) + conntrack_count=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count) + + if (( conntrack_count >= conntrack_max )); then + echo "Conntrack table full" + exit $NONOK + fi + + echo "Conntrack table available" + exit $OK + config: + network-problem-monitor: + plugin: custom + pluginConfig: + invoke_interval: 30s + timeout: 5s + max_output_length: 80 + concurrency: 3 + source: network-custom-plugin-monitor + conditions: [] + rules: + - type: temporary + reason: ConntrackFull + path: "./config/plugin/network_problem.sh" + timeout: 3s + system-stats-monitor: + enabled: + - /config/system-stats-monitor.json + scripts: + enabled: null + source: null + config: + system-stats-monitor: + disk: + metricsConfigs: + disk/io_time: + displayName: disk/io_time + disk/weighted_io: + displayName: disk/weighted_io + disk/avg_queue_len: + displayName: disk/avg_queue_len + includeRootBlk: true + includeAllAttachedBlk: true + lsblkTimeout: 5s + invokeInterval: 60s +... 
diff --git a/tools/deployment/common/node-problem-detector.sh b/tools/deployment/common/node-problem-detector.sh new file mode 100755 index 000000000..031310aaf --- /dev/null +++ b/tools/deployment/common/node-problem-detector.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +set -xe + +#NOTE: Lint and package chart +make kubernetes-node-problem-detector + +#NOTE: Deploy command +tee /tmp/kubernetes-node-problem-detector.yaml << EOF +monitoring: + prometheus: + pod: + enabled: false + service: + enabled: true +manifests: + service: true +EOF +helm upgrade --install kubernetes-node-problem-detector \ + ./kubernetes-node-problem-detector --namespace=kube-system \ + --values=/tmp/kubernetes-node-problem-detector.yaml + +#NOTE: Wait for deploy +./tools/deployment/common/wait-for-pods.sh kube-system + +#NOTE: Validate Deployment info +helm status kubernetes-node-problem-detector diff --git a/tools/deployment/multinode/075-node-problem-detector.sh b/tools/deployment/multinode/075-node-problem-detector.sh new file mode 120000 index 000000000..47a0e3821 --- /dev/null +++ b/tools/deployment/multinode/075-node-problem-detector.sh @@ -0,0 +1 @@ +../common/node-problem-detector.sh \ No newline at end of file diff --git a/tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh b/tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh new file mode 120000 index 000000000..47a0e3821 --- /dev/null +++ 
b/tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh @@ -0,0 +1 @@ +../common/node-problem-detector.sh \ No newline at end of file diff --git a/zuul.d/jobs.yaml b/zuul.d/jobs.yaml index b9355f523..720f97ea3 100644 --- a/zuul.d/jobs.yaml +++ b/zuul.d/jobs.yaml @@ -67,6 +67,7 @@ - ./tools/deployment/multinode/050-prometheus.sh - ./tools/deployment/multinode/060-alertmanager.sh - ./tools/deployment/multinode/070-kube-state-metrics.sh + - ./tools/deployment/multinode/075-node-problem-detector.sh - ./tools/deployment/multinode/080-node-exporter.sh - ./tools/deployment/multinode/085-process-exporter.sh - ./tools/deployment/multinode/090-openstack-exporter.sh @@ -190,6 +191,7 @@ - ./tools/deployment/osh-infra-monitoring/050-prometheus.sh - ./tools/deployment/osh-infra-monitoring/060-alertmanager.sh - ./tools/deployment/osh-infra-monitoring/070-kube-state-metrics.sh + - ./tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh - ./tools/deployment/osh-infra-monitoring/080-node-exporter.sh - ./tools/deployment/osh-infra-monitoring/090-process-exporter.sh - ./tools/deployment/osh-infra-monitoring/100-openstack-exporter.sh