diff --git a/monitor-helm-elastic/centos/monitor-helm-elastic.spec b/monitor-helm-elastic/centos/monitor-helm-elastic.spec index b019fb1..1be1423 100644 --- a/monitor-helm-elastic/centos/monitor-helm-elastic.spec +++ b/monitor-helm-elastic/centos/monitor-helm-elastic.spec @@ -23,6 +23,7 @@ Patch02: 0002-Add-compatibility-for-k8s-1.16.patch Patch03: 0003-use-oss-image.patch Patch04: 0004-Update-to-Elastic-7.4.0-Release.patch Patch05: 0005-set-initial-masters-to-master-0.patch +Patch06: 0006-readiness-probe-enhancements.patch BuildRequires: helm @@ -36,6 +37,7 @@ Monitor Helm elasticsearch charts %patch03 -p1 %patch04 -p1 %patch05 -p1 +%patch06 -p1 %build # initialize helm and build the toolkit diff --git a/monitor-helm-elastic/files/0006-readiness-probe-enhancements.patch b/monitor-helm-elastic/files/0006-readiness-probe-enhancements.patch new file mode 100644 index 0000000..5a1f362 --- /dev/null +++ b/monitor-helm-elastic/files/0006-readiness-probe-enhancements.patch @@ -0,0 +1,91 @@ +From 36ea0e2a2fd6cf6ac8cb19411c14c5ef4d0618f9 Mon Sep 17 00:00:00 2001 +From: Kevin Smith +Date: Mon, 23 Mar 2020 10:43:07 -0400 +Subject: [PATCH 1/1] readiness probe enhancements + +--- + elasticsearch/templates/statefulset.yaml | 46 +++++++++++++++++++++++++++----- + elasticsearch/values.yaml | 2 ++ + 2 files changed, 41 insertions(+), 7 deletions(-) + +diff --git a/elasticsearch/templates/statefulset.yaml b/elasticsearch/templates/statefulset.yaml +index e17d39e..483e1f4 100644 +--- a/elasticsearch/templates/statefulset.yaml ++++ b/elasticsearch/templates/statefulset.yaml +@@ -194,7 +194,7 @@ spec: + # If the node is starting up wait for the cluster to be ready (request params: '{{ .Values.clusterHealthCheckParams }}' ) + # Once it has started only check that the node itself is responding + START_FILE=/tmp/.es_start_file +- ++ + http () { + local path="${1}" + if [ -n "${ELASTIC_USERNAME}" ] && [ -n "${ELASTIC_PASSWORD}" ]; then +@@ -209,13 +209,45 @@ spec: + echo 'Elasticsearch is already running, lets check the node is healthy' + http "/" + else +- echo 'Waiting for elasticsearch cluster to become cluster to be ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' +- if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then +- touch ${START_FILE} +- exit 0 ++ DATA_NODE=$(printenv node.data) ++ if [[ "$DATA_NODE" == true ]]; then ++ # This is a data node, check for health depending on whether we can ++ # reach the master node and how many data nodes there are. ++ DATA_NODE_COUNT=$(http "/_cat/nodes?master_timeout=1s" | grep -c data) ++ echo "data node count = $DATA_NODE_COUNT" ++ if [[ $DATA_NODE_COUNT -gt 1 ]]; then ++ # We connected to master and there is more than one data node. ++ echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' ++ if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then ++ touch ${START_FILE} ++ exit 0 ++ else ++ echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' ++ exit 1 ++ fi ++ else ++ # Cannot connect to the master or we are the only data node ++ # found. Could be DOR, AIO-SX, other host is locked and we ++ # experienced a pod restart or other similar scenario. ++ echo "Cannot connect to master or less than 2 data nodes" ++ echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParamsBasic }}" )' ++ if http "/_cluster/health?{{ .Values.clusterHealthCheckParamsBasic }}" ; then ++ touch ${START_FILE} ++ exit 0 ++ else ++ echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParamsBasic }}" )' ++ exit 1 ++ fi ++ fi + else +- echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' +- exit 1 ++ echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' ++ if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then ++ touch ${START_FILE} ++ exit 0 ++ else ++ echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )' ++ exit 1 ++ fi + fi + fi + ports: +diff --git a/elasticsearch/values.yaml b/elasticsearch/values.yaml +index 0d983eb..ebbae6c 100755 +--- a/elasticsearch/values.yaml ++++ b/elasticsearch/values.yaml +@@ -197,6 +197,8 @@ readinessProbe: + + # https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html#request-params wait_for_status + clusterHealthCheckParams: "wait_for_status=green&timeout=1s" ++# Used for readiness probe when on a data node and only a basic health check is needed. ++clusterHealthCheckParamsBasic: "local=true" + + ## Use an alternate scheduler. + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ +-- +1.8.3.1 + diff --git a/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml b/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml index 73ccca2..aa6cc43 100644 --- a/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml +++ b/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml @@ -106,7 +106,7 @@ data: esMajorVersion: 7 masterService: 'mon-elasticsearch-data-headless, mon-elasticsearch-master' podManagementPolicy: OrderedReady - clusterHealthCheckParams: 'local=true' + clusterHealthCheckParams: 'wait_for_no_relocating_shards&wait_for_no_initializing_shards&timeout=1s' maxUnavailable: 1 extraEnvs: - name: DATA_PRESTOP_SLEEP