Fix lifecycle pre apply checks
When one host has its availability changed, the application lifecycle framework will call a hook with an 'evaluate-reapply' operation and with the 'relative_timing' field missing. This hook should block the 'apply' and 'evaluate-reapply' operations if Ceph is unresponsive. Not blocking these operations will lead to a scenario where the ceph-csi configuration gets broken and the PVCs fail to attach, mount and unmount. Example of hook_info dict when the 'evaluate-reapply' operation is started: {'mode': 'auto', 'lifecycle_type': 'check', 'operation': \ 'evaluate-reapply', 'extra': {'trigger': {'type': \ 'host-availability-updated', 'availability': 'available'}}} Example of hook_info dict when the 'apply' operation is started: {'mode': 'auto', 'lifecycle_type': 'check', 'relative_timing': 'pre',\ 'operation': 'apply', 'extra': {}} This change fixes the semantic check for 'evaluate-reapply' and 'apply' operations, considering the 'relative_timing' field. Test-Plan: PASS: On AIO-DX, restart the standby controller and, when it is back 'available', verify that the auto reapply is blocked if Ceph status is not HEALTH_OK PASS: Verify application update is working PASS: Verify application auto apply is working PASS: Add a new storage tier and verify the application is re-applied correctly. PASS: Manual and auto application apply is blocked when Ceph status is not HEALTH_OK Closes-bug: 2097570 Change-Id: I09bc9b0bfe53ddae3008ae164a177f3ea5a6ae63 Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
This commit is contained in:
parent
d896ab6a41
commit
bc01cdee11
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -153,6 +153,8 @@ class CephFSProvisionerHelm(base.FluxCDBaseHelm):
|
||||
# Get tier info.
|
||||
tiers = self.dbapi.storage_tier_get_list()
|
||||
cluster_id = cutils.get_ceph_fsid()
|
||||
if not cluster_id:
|
||||
raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
|
||||
storage_classes = []
|
||||
|
||||
for bk in ceph_bks:
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -71,6 +71,8 @@ class RbdProvisionerHelm(base.FluxCDBaseHelm):
|
||||
# Get tier info.
|
||||
tiers = self.dbapi.storage_tier_get_list()
|
||||
cluster_id = cutils.get_ceph_fsid()
|
||||
if not cluster_id:
|
||||
raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
|
||||
storage_classes = []
|
||||
|
||||
for bk in ceph_bks:
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2021-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2021-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -12,6 +12,7 @@
|
||||
# pylint: disable=no-member
|
||||
# pylint: disable=no-name-in-module
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from oslo_log import log as logging
|
||||
from sysinv.common import constants
|
||||
@ -38,11 +39,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
"""
|
||||
# Semantic checks
|
||||
if hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_SEMANTIC_CHECK:
|
||||
if hook_info.mode == LifecycleConstants.APP_LIFECYCLE_MODE_AUTO and \
|
||||
((hook_info.operation == constants.APP_APPLY_OP and
|
||||
hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE) or
|
||||
hook_info.mode == constants.APP_EVALUATE_REAPPLY_OP):
|
||||
return self.pre_auto_apply_check(conductor_obj)
|
||||
# The kube_app logic does not send the hook_info.relative_timing value
|
||||
# when this is an APP_EVALUATE_REAPPLY_OP operation.
|
||||
# Therefore, check the hook_info.operation first and validate if the
|
||||
# relative_timing is provided. If it is not, run the pre-apply checks.
|
||||
if hook_info.operation in [constants.APP_APPLY_OP,
|
||||
constants.APP_EVALUATE_REAPPLY_OP]:
|
||||
if "relative_timing" not in hook_info or \
|
||||
hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE:
|
||||
return self.pre_apply_check(conductor_obj)
|
||||
|
||||
# Rbd
|
||||
elif hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_RBD:
|
||||
@ -67,14 +72,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
# Use the default behaviour for other hooks
|
||||
super(PlatformAppLifecycleOperator, self).app_lifecycle_actions(context, conductor_obj, app_op, app, hook_info)
|
||||
|
||||
def pre_auto_apply_check(self, conductor_obj):
|
||||
""" Semantic check for auto-apply
|
||||
def pre_apply_check(self, conductor_obj):
|
||||
""" Semantic check for apply
|
||||
|
||||
Check:
|
||||
- ceph access
|
||||
- ceph health
|
||||
- crushmap applied
|
||||
- replica count is non-zero so that manifest apply will not timeout
|
||||
- ceph cli is responsive as it will be used by the application during the apply
|
||||
|
||||
:param conductor_obj: conductor object
|
||||
|
||||
@ -96,7 +102,7 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
"CephOperator is not initialized yet")
|
||||
if not conductor_obj._ceph.have_ceph_monitor_access():
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Monitor access error")
|
||||
"Ceph monitor is unreacheable")
|
||||
if not conductor_obj._ceph.ceph_status_ok():
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Ceph status is not HEALTH_OK")
|
||||
@ -110,6 +116,13 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Not enough hosts in desired state")
|
||||
|
||||
# Check if ceph cli is responsive.
|
||||
ceph_fsid_cmd = ["timeout", "10", "ceph", "fsid"]
|
||||
result = subprocess.run(ceph_fsid_cmd, check=False)
|
||||
if (result.returncode != 0):
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Ceph CLI is not responsive")
|
||||
|
||||
def pre_apply(self, app_op, app, hook_info):
|
||||
"""Pre Apply actions
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user