From f995680e2a64d54adcf3c1d6f3ba7d9c27d01e28 Mon Sep 17 00:00:00 2001 From: kranthi guttikonda Date: Wed, 10 Oct 2018 11:02:45 -0400 Subject: [PATCH] Prometheus kubelet.rules change kube_node_status_ready and up metrics are obsolete for checking the Kubernetes node condition. When a kubelet is down, the node itself is in the NotReady state. With the 1.3.1 kube-state-metrics exporter, the kube_node_status_condition metric provides the status value of the kubelet (essentially the node). https://github.com/kubernetes/kube-state-metrics/blob/master/Documentation/node-metrics.md kube_node_status_condition includes condition=Ready and status as true, false and unknown. When a kubelet is stopped the status will be unknown since the kubelet itself will be unable to talk to the API. In other cases it will be false. When the node is registered and available it will be set to true. Replaced the kube_node_status_ready with kube_node_status_condition and changed the 1h to 1m and increased the severity to "critical". Also modified the K8SKubeletDown definitions with 1m and critical severity. Implements: Bug 1797133 Closes-Bug: #1797133 Change-Id: I025adb13c9d8642a218dfda1ff30f1577fa8c826 Signed-off-by: Kranthi Kiran Guttikonda --- prometheus/values.yaml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 09b2fa0ff..d590cffb0 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -1177,15 +1177,23 @@ conf: - name: kubelet.rules rules: - alert: K8SNodeNotReady - expr: kube_node_status_ready{condition="true"} == 0 - for: 1h + expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1 + for: 1m labels: - severity: warning + severity: critical annotations: - description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour - summary: Node status is 
NotReady + description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute + summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}' - alert: K8SManyNodesNotReady - expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2 for: 1m labels: severity: critical @@ -1193,7 +1201,7 @@ conf: description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' summary: Many Kubernetes nodes are Not Ready - alert: K8SNodesNotReady - expr: count(kube_node_status_ready{condition="true"} == 0) > 0 + expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0 for: 1m labels: severity: critical @@ -1202,15 +1210,15 @@ conf: summary: One or more Kubernetes nodes are Not Ready - alert: K8SKubeletDown expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - for: 1h + for: 1m labels: - severity: warning + severity: critical annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. 
summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - for: 1h + for: 1m labels: severity: critical annotations: