Fix lifecycle pre-apply checks

When a host's availability changes, the application lifecycle
framework calls a hook with an 'evaluate-reapply' operation and
with the 'relative_timing' field missing.

This hook should block the 'apply' and 'evaluate-reapply' operations
if Ceph is unresponsive. Not blocking these operations leads to a
scenario where the ceph-csi configuration gets broken and the PVCs
fail to attach, mount and unmount.
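
For reference, "unresponsive" here means the Ceph CLI does not answer:
the change below probes it by running 'ceph fsid' under a 10-second
timeout. A minimal sketch of that probe (the helper name
ceph_cli_is_responsive is illustrative only and not part of the change):

import subprocess

def ceph_cli_is_responsive(timeout_s=10):
    """Return True when the ceph CLI answers within timeout_s seconds."""
    # 'ceph fsid' is a cheap, read-only query; the external 'timeout'
    # command keeps a hung monitor connection from stalling the hook.
    result = subprocess.run(["timeout", str(timeout_s), "ceph", "fsid"],
                            check=False)
    return result.returncode == 0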

Example of hook_info dict when the 'evaluate-reapply' operation is
started:

{'mode': 'auto', 'lifecycle_type': 'check', 'operation': \
 'evaluate-reapply', 'extra': {'trigger': {'type': \
 'host-availability-updated', 'availability': 'available'}}}

Example of hook_info dict when the 'apply' operation is started:

{'mode': 'auto', 'lifecycle_type': 'check', 'relative_timing': 'pre',\
 'operation': 'apply', 'extra': {}}

This change fixes the semantic check for the 'evaluate-reapply' and
'apply' operations so that it takes the possibly missing
'relative_timing' field into account.
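
In other words, a missing 'relative_timing' is now treated the same as
an explicit 'pre' timing. A minimal, self-contained sketch of that
decision (the helper name should_run_pre_apply_check and the plain-dict
hook_info are illustrative only; the real hook_info is an
attribute-style object and the operations are compared against the
APP_APPLY_OP / APP_EVALUATE_REAPPLY_OP constants):

def should_run_pre_apply_check(hook_info):
    """Return True when the pre-apply Ceph checks must run."""
    if hook_info.get('operation') in ('apply', 'evaluate-reapply'):
        # 'evaluate-reapply' hooks arrive without 'relative_timing',
        # so a missing field counts as a 'pre' timing.
        return hook_info.get('relative_timing', 'pre') == 'pre'
    return False

# Both example hook_info dicts above trigger the pre-apply checks:
assert should_run_pre_apply_check(
    {'mode': 'auto', 'operation': 'evaluate-reapply'})
assert should_run_pre_apply_check(
    {'mode': 'auto', 'relative_timing': 'pre', 'operation': 'apply'})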

Test-Plan:
  PASS: On AIO-DX, restart the standby controller and, once it is back
        'available', verify that the auto reapply is blocked when the
        Ceph status is not HEALTH_OK
  PASS: Verify application update is working
  PASS: Verify application auto apply is working
  PASS: Add a new storage tier and verify the application is
        re-applied correctly.
  PASS: Manual and auto application apply is blocked when Ceph status
        is not HEALTH_OK

Closes-bug: 2097570

Change-Id: I09bc9b0bfe53ddae3008ae164a177f3ea5a6ae63
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2024 Wind River Systems, Inc.
+# Copyright (c) 2020-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -153,6 +153,8 @@ class CephFSProvisionerHelm(base.FluxCDBaseHelm):
         # Get tier info.
         tiers = self.dbapi.storage_tier_get_list()
         cluster_id = cutils.get_ceph_fsid()
+        if not cluster_id:
+            raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
 
         storage_classes = []
         for bk in ceph_bks:


@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2024 Wind River Systems, Inc.
+# Copyright (c) 2020-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -71,6 +71,8 @@ class RbdProvisionerHelm(base.FluxCDBaseHelm):
         # Get tier info.
         tiers = self.dbapi.storage_tier_get_list()
         cluster_id = cutils.get_ceph_fsid()
+        if not cluster_id:
+            raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
 
         storage_classes = []
         for bk in ceph_bks:


@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2024 Wind River Systems, Inc.
+# Copyright (c) 2021-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -12,6 +12,7 @@
 # pylint: disable=no-member
 # pylint: disable=no-name-in-module
 import os
+import subprocess
 
 from oslo_log import log as logging
 from sysinv.common import constants
@@ -38,11 +39,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
         """
         # Semantic checks
         if hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_SEMANTIC_CHECK:
-            if hook_info.mode == LifecycleConstants.APP_LIFECYCLE_MODE_AUTO and \
-                    ((hook_info.operation == constants.APP_APPLY_OP and
-                      hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE) or
-                     hook_info.mode == constants.APP_EVALUATE_REAPPLY_OP):
-                return self.pre_auto_apply_check(conductor_obj)
+            # The kube_app logic does not send the hook_info.relative_timing value
+            # when this is an APP_EVALUATE_REAPPLY_OP operation.
+            # Therefore, check the hook_info.operation first and validate if the
+            # relative_timing is provided. If it is not, run the pre-apply checks.
+            if hook_info.operation in [constants.APP_APPLY_OP,
+                                       constants.APP_EVALUATE_REAPPLY_OP]:
+                if "relative_timing" not in hook_info or \
+                        hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE:
+                    return self.pre_apply_check(conductor_obj)
 
         # Rbd
         elif hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_RBD:
@@ -67,14 +72,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
         # Use the default behaviour for other hooks
         super(PlatformAppLifecycleOperator, self).app_lifecycle_actions(context, conductor_obj, app_op, app, hook_info)
 
-    def pre_auto_apply_check(self, conductor_obj):
-        """ Semantic check for auto-apply
+    def pre_apply_check(self, conductor_obj):
+        """ Semantic check for apply
 
         Check:
          - ceph access
          - ceph health
          - crushmap applied
          - replica count is non-zero so that manifest apply will not timeout
+         - ceph cli is responsive as it will be used by the application during the apply
 
         :param conductor_obj: conductor object
@@ -96,7 +102,7 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
                 "CephOperator is not initialized yet")
         if not conductor_obj._ceph.have_ceph_monitor_access():
             raise exception.LifecycleSemanticCheckException(
-                "Monitor access error")
+                "Ceph monitor is unreachable")
         if not conductor_obj._ceph.ceph_status_ok():
             raise exception.LifecycleSemanticCheckException(
                 "Ceph status is not HEALTH_OK")
@@ -110,6 +116,13 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
             raise exception.LifecycleSemanticCheckException(
                 "Not enough hosts in desired state")
 
+        # Check if ceph cli is responsive.
+        ceph_fsid_cmd = ["timeout", "10", "ceph", "fsid"]
+        result = subprocess.run(ceph_fsid_cmd, check=False)
+        if (result.returncode != 0):
+            raise exception.LifecycleSemanticCheckException(
+                "Ceph CLI is not responsive")
+
     def pre_apply(self, app_op, app, hook_info):
         """Pre Apply actions