[mariadb] Add cluster wait job

Add a job that waits until the initial bootstrapping of the cluster is
completed. This is required to pause database creation and initialization
while the cluster is not yet fully bootstrapped.

Change-Id: I705df1a1b1a34f464dc36a36dd7964f8a7bf72d9
This commit is contained in:
Vasyl Saienko 2024-09-16 05:39:48 +00:00
parent 243289aae3
commit 9e5fea6e18
6 changed files with 337 additions and 1 deletions

View File

@ -15,7 +15,7 @@ apiVersion: v1
appVersion: v10.6.7
description: OpenStack-Helm MariaDB
name: mariadb
version: 0.2.49
version: 0.2.50
home: https://mariadb.com/kb/en/
icon: http://badges.mariadb.org/mariadb-badge-180x60.png
sources:

View File

@ -0,0 +1,190 @@
#!/usr/bin/env python3
import datetime
from enum import Enum
import functools
import logging
import os
import sys
import time

import pykube
import pymysql
MARIADB_HOST = os.getenv("MARIADB_HOST")
MARIADB_PASSWORD = os.getenv("MARIADB_PASSWORD")
MARIADB_REPLICAS = os.getenv("MARIADB_REPLICAS")
MARIADB_CLUSTER_STATE_LOG_LEVEL = os.getenv("MARIADB_CLUSTER_STATE_LOG_LEVEL", "INFO")
MARIADB_CLUSTER_STABILITY_COUNT = int(
os.getenv("MARIADB_CLUSTER_STABILITY_COUNT", "30")
)
MARIADB_CLUSTER_STABILITY_WAIT = int(os.getenv("MARIADB_CLUSTER_STABILITY_WAIT", "4"))
MARIADB_CLUSTER_CHECK_WAIT = int(os.getenv("MARIADB_CLUSTER_CHECK_WAIT", "30"))
MARIADB_CLUSTER_STATE_CONFIGMAP = os.getenv("MARIADB_CLUSTER_STATE_CONFIGMAP")
MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE = os.getenv(
"MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE", "openstack"
)
MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT = int(
os.getenv("MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT", 60)
)
log_level = MARIADB_CLUSTER_STATE_LOG_LEVEL
logging.basicConfig(
stream=sys.stdout,
format="%(asctime)s %(levelname)s %(name)s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
LOG = logging.getLogger("mariadb-cluster-wait")
LOG.setLevel(log_level)
def login():
    """Create a Kubernetes API client from the environment kubeconfig.

    Uses ``pykube.KubeConfig.from_env()`` (in-cluster service account or
    ``KUBECONFIG``) and applies the configured request timeout.

    :returns: a ready-to-use ``pykube.HTTPClient`` instance.
    """
    config = pykube.KubeConfig.from_env()
    client = pykube.HTTPClient(
        config=config, timeout=MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT
    )
    LOG.info(f"Created k8s api client from context {config.current_context}")
    return client
# Module-level Kubernetes client shared by everything below.
api = login()
# Fetch the cluster state configmap once at startup; this also makes the
# job fail fast when the configmap does not exist yet.
# NOTE(review): the fetched object is otherwise unused — state handling is
# done through the initalClusterState class further down; verify whether
# this lookup is intentional.
cluster_state_map = (
    pykube.ConfigMap.objects(api)
    .filter(namespace=MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE)
    .get_by_name(MARIADB_CLUSTER_STATE_CONFIGMAP)
)
def get_current_state(cluster_state_map):
    """Return the initial-bootstrap state recorded in the state configmap.

    Reads the ``initial-bootstrap-completed.cluster`` key from the
    configmap's ``data`` section, falling back to ``"False"`` when the key
    (or the whole ``data`` section) is absent.

    The original implementation referenced an undefined module constant and
    discarded the looked-up value (no ``return``); both are fixed here.

    :param cluster_state_map: a ``pykube.ConfigMap`` (any object exposing a
        dict-like ``obj`` attribute).
    :returns: the stored state string, or ``"False"`` when unset.
    """
    return cluster_state_map.obj.get("data", {}).get(
        "initial-bootstrap-completed.cluster", "False"
    )
def retry(times, exceptions):
    """Decorator that retries the wrapped callable on the given exceptions.

    The callable is attempted at most ``times`` times in total; every
    failure is logged with a traceback and the exception raised by the
    final attempt is re-raised to the caller.

    The original version ran ``times`` attempts inside the loop and then an
    extra, unlogged attempt after it (``times + 1`` calls in total); this
    version performs exactly ``times`` attempts, matching the logged
    "attempt X of Y" message.

    :param times: maximum number of attempts (must be >= 1 to retry at all).
    :param exceptions: exception class or tuple of classes to retry on.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            attempt = 0
            while True:
                attempt += 1
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    LOG.exception(
                        f"Exception thrown when attempting to run {func}, attempt {attempt} of {times}"
                    )
                    # Out of attempts: surface the final failure.
                    if attempt >= times:
                        raise
        return wrapper
    return decorator
class initalClusterState:
    """Tracks whether the initial galera cluster bootstrap has completed.

    The state is persisted as a key in a configmap so that reruns of this
    job (and other consumers) can short-circuit once the cluster has been
    bootstrapped successfully one time.

    NOTE(review): the class name typo ("inital") is kept for backward
    compatibility with existing callers.
    """

    # Configmap data key that records bootstrap completion.
    initial_state_key = "initial-bootstrap-completed.cluster"

    @retry(times=100, exceptions=(Exception,))
    def __init__(self, api, namespace, name):
        """Fetch the state configmap, retrying since the API may be flaky.

        :param api: ``pykube.HTTPClient`` instance.
        :param namespace: namespace of the state configmap.
        :param name: name of the state configmap.
        """
        self.namespace = namespace
        self.name = name
        self.cm = (
            pykube.ConfigMap.objects(api)
            .filter(namespace=self.namespace)
            .get_by_name(self.name)
        )

    def get_default(self):
        """Return the assumed state when the key is missing from the map.

        We have deployments with a completed job that is not reflected in
        the configmap state.  When the configmap is older than one hour and
        the key is absent (i.e. we are doing an update/restart of an
        existing environment) assume the cluster was initially bootstrapped
        and record that, to avoid manual actions.
        """
        # NOTE(review): creationTimestamp is parsed as a naive UTC value,
        # so the naive utcnow() comparison is consistent here even though
        # utcnow() is deprecated in newer Python releases.
        now = datetime.datetime.utcnow()
        created_at = datetime.datetime.strptime(
            self.cm.obj["metadata"]["creationTimestamp"], "%Y-%m-%dT%H:%M:%SZ"
        )
        if now - created_at > datetime.timedelta(seconds=3600):
            self.complete()
            return "COMPLETED"
        return "NOT_COMPLETED"

    @property
    @retry(times=10, exceptions=(Exception,))
    def is_completed(self):
        """True once the initial bootstrap is recorded as completed."""
        self.cm.reload()
        # Guard against a configmap without a data section.
        data = self.cm.obj.get("data") or {}
        if self.initial_state_key in data:
            # Compare explicitly: the stored value is a string, and any
            # non-empty string (even "NOT_COMPLETED") would otherwise be
            # truthy to callers.
            return data[self.initial_state_key] == "COMPLETED"
        return self.get_default() == "COMPLETED"

    @retry(times=100, exceptions=(Exception,))
    def complete(self):
        """Persist bootstrap completion in the configmap (retried)."""
        self.cm.patch({"data": {self.initial_state_key: "COMPLETED"}})
# Load (with retries) the state object recording whether the cluster's
# initial bootstrap already finished.
ics = initalClusterState(
    api, MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE, MARIADB_CLUSTER_STATE_CONFIGMAP
)
# Nothing to wait for on an already-bootstrapped cluster: exit successfully
# so dependent jobs can proceed immediately.
if ics.is_completed:
    LOG.info("The initial bootstrap was completed, skipping wait...")
    sys.exit(0)
LOG.info("Checking for mariadb cluster state.")
def is_mariadb_stabe():
    """Check whether mariadb reports a fully healthy galera cluster.

    Queries the wsrep status variables and compares them against the
    expected healthy values (ready/connected, Primary component, Synced,
    and a cluster size equal to MARIADB_REPLICAS).

    :returns: True when every variable matches; False on any mismatch or
        on a connection/query error.

    NOTE(review): the function name typo ("stabe") is kept so existing
    callers keep working.
    """
    wsrep_OK = {
        "wsrep_ready": "ON",
        "wsrep_connected": "ON",
        "wsrep_cluster_status": "Primary",
        "wsrep_local_state_comment": "Synced",
        "wsrep_cluster_size": str(MARIADB_REPLICAS),
    }
    try:
        wsrep_vars = ",".join(["'" + var + "'" for var in wsrep_OK.keys()])
        # Close the connection deterministically instead of leaking it on
        # every poll iteration.
        connection = pymysql.connect(
            host=MARIADB_HOST,
            password=MARIADB_PASSWORD,
            read_default_file="/etc/mysql/admin_user.cnf",
        )
        try:
            with connection.cursor() as db_cursor:
                db_cursor.execute(
                    f"SHOW GLOBAL STATUS WHERE Variable_name IN ({wsrep_vars})"
                )
                status = db_cursor.fetchall()
        finally:
            connection.close()
        diff = set(status).difference(set(wsrep_OK.items()))
        if diff:
            LOG.error(f"The wsrep is not OK: {diff}")
            # Explicit False instead of the implicit None fall-through.
            return False
        LOG.info("The wsrep is ready")
        return True
    except Exception as e:
        LOG.error(f"Got exception while checking state. {e}")
    return False
# Consecutive successful stability checks observed so far.  The cluster is
# declared bootstrapped only after MARIADB_CLUSTER_STABILITY_COUNT checks
# in a row succeed; any failure resets the streak.
# (The unused `count`/`ready` variables are removed, and the counter now
# starts at 0 — it previously started at 1, so only COUNT-1 consecutive
# successes were actually required.)
stable_for = 0
while True:
    if is_mariadb_stabe():
        stable_for += 1
        LOG.info(
            f"The cluster is stable for {stable_for} out of {MARIADB_CLUSTER_STABILITY_COUNT}"
        )
        if stable_for >= MARIADB_CLUSTER_STABILITY_COUNT:
            # Persist completion so future job runs skip the wait entirely.
            ics.complete()
            sys.exit(0)
        LOG.info(f"Sleeping for {MARIADB_CLUSTER_STABILITY_WAIT}")
        time.sleep(MARIADB_CLUSTER_STABILITY_WAIT)
    else:
        LOG.info("Resetting stable_for count.")
        stable_for = 0
        LOG.info(f"Sleeping for {MARIADB_CLUSTER_CHECK_WAIT}")
        time.sleep(MARIADB_CLUSTER_CHECK_WAIT)

View File

@ -57,4 +57,6 @@ data:
mariadb_controller.py: |
{{ tuple "bin/_mariadb_controller.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }}
mariadb-wait-for-cluster.py: |
{{ tuple "bin/_mariadb-wait-for-cluster.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }}

View File

@ -0,0 +1,123 @@
{{/*
Copyright 2019 Mirantis inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
{{- if .Values.manifests.job_cluster_wait }}
{{- $envAll := . }}
{{- $serviceAccountName := print .Release.Name "-cluster-wait" }}
{{ tuple $envAll "cluster_wait" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
namespace: {{ $envAll.Release.Namespace }}
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- update
- patch
- get
- list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
namespace: {{ $envAll.Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
subjects:
- kind: ServiceAccount
name: {{ $serviceAccountName }}
namespace: {{ $envAll.Release.Namespace }}
---
apiVersion: batch/v1
kind: Job
metadata:
name: "{{.Release.Name}}-cluster-wait"
labels:
{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
spec:
backoffLimit: {{ .Values.jobs.cluster_wait.clusterCheckRetries }}
template:
metadata:
labels:
{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
spec:
{{ dict "envAll" $envAll "application" "cluster_wait" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
serviceAccountName: {{ $serviceAccountName }}
restartPolicy: OnFailure
nodeSelector:
{{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value }}
initContainers:
{{ tuple $envAll "cluster_wait" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
containers:
- name: {{.Release.Name}}-mariadb-cluster-wait
{{ tuple $envAll "mariadb_scripted_test" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ dict "envAll" $envAll "application" "cluster_wait" "container" "mariadb_cluster_wait" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
env:
- name: MARIADB_HOST
value: {{ tuple "oslo_db" "internal" $envAll | include "helm-toolkit.endpoints.endpoint_host_lookup" }}
- name: MARIADB_REPLICAS
value: {{ .Values.pod.replicas.server | quote }}
- name: MARIADB_CLUSTER_CHECK_WAIT
value: {{ .Values.jobs.cluster_wait.clusterCheckWait | quote }}
- name: MARIADB_CLUSTER_STABILITY_COUNT
value: {{ .Values.jobs.cluster_wait.clusterStabilityCount | quote }}
- name: MARIADB_CLUSTER_STABILITY_WAIT
value: {{ .Values.jobs.cluster_wait.clusterStabilityWait | quote }}
- name: MARIADB_CLUSTER_STATE_CONFIGMAP
value: {{ printf "%s-%s" .Release.Name "mariadb-state" | quote }}
- name: MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE
value: {{ $envAll.Release.Namespace }}
- name: MARIADB_PASSWORD
valueFrom:
secretKeyRef:
name: mariadb-dbadmin-password
key: MYSQL_DBADMIN_PASSWORD
command:
- /tmp/mariadb-wait-for-cluster.py
volumeMounts:
- name: pod-tmp
mountPath: /tmp
- name: mariadb-bin
mountPath: /tmp/mariadb-wait-for-cluster.py
subPath: mariadb-wait-for-cluster.py
readOnly: true
- name: mariadb-secrets
mountPath: /etc/mysql/admin_user.cnf
subPath: admin_user.cnf
readOnly: true
volumes:
- name: pod-tmp
emptyDir: {}
- name: mariadb-bin
configMap:
name: mariadb-bin
defaultMode: 0555
- name: mariadb-secrets
secret:
secretName: mariadb-secrets
defaultMode: 0444
{{- end }}

View File

@ -130,6 +130,16 @@ pod:
controller:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
cluster_wait:
pod:
runAsUser: 65534
runAsNonRoot: true
container:
mariadb_cluster_wait:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
affinity:
anti:
type:
@ -238,6 +248,10 @@ dependencies:
service: oslo_db
controller:
services: null
cluster_wait:
services:
- endpoint: internal
service: oslo_db
volume:
# this value is used for single pod deployments of mariadb to prevent losing all data
# if the pod is restarted
@ -254,6 +268,11 @@ volume:
size: 5Gi
jobs:
cluster_wait:
clusterCheckWait: 30
clusterCheckRetries: 30
clusterStabilityCount: 30
clusterStabilityWait: 4
exporter_create_sql_user:
backoffLimit: 87600
activeDeadlineSeconds: 3600
@ -672,4 +691,5 @@ manifests:
statefulset: true
deployment_controller: true
service_master: true
job_cluster_wait: false
...

View File

@ -65,4 +65,5 @@ mariadb:
- 0.2.47 Deploy exporter as sidecar
- 0.2.48 Switch to mariadb controller deployment
- 0.2.49 Remove ingress deployment
- 0.2.50 Add cluster-wait job
...