From 1335bc484df331771e995ae822df3af84cc5739d Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 4 Jun 2024 19:42:54 +0000 Subject: [PATCH] Add auto run goenabled and start hosts services to mtcClient The 'mtcClient' currently automatically runs the main function's 'goenabled' scripts on process startup for all nodes if and when their run preconditions are met. However, that is not true for 'start host services' and, in the AIO system type case, the subfunction 'goenabled' scripts. Typically, this is acceptable because the 'mtcAgent' will request these scripts to be run during unlock and failure recovery scenarios. However, if the system administrator reconfigures the maintenance heartbeat fault handling action from the default 'fail' to any other setting [degrade,alarm,none] and a node reboots outside of maintenance control, then upon reboot recovery, the 'start host services' and, if the node is an AIO controller, the required subfunction 'goenabled' scripts are not executed. In such a case, the missing subfunction 'goenabled' flag file (/var/run/goenabled_subf) prevents the hbsAgent and hbsClient on that node from entering their in-service mode of operation. Instead, they run waiting for the node's In-Test phase to complete ; which never happens. This can lead to what appear to be stuck maintenance heartbeat alarms. However, it's really caused by the maintenance heartbeat processes on that node being gated from performing their mission mode function. The /var/run/goenabled_subf flag file is the AIO In-Test complete gate. It is set if the subfunction 'goenabled' tests pass. However, because this flag file is in /var/run (a volatile directory) it is lost/cleared over a reboot. This update adds the automatic execution of the AIO controller's subfunction 'goenabled' scripts and the 'start host services' for all nodes. 
Once all the required preconditions are met, the scripts are run and that node is ready for service, regardless of how and under what conditions it rebooted. Testing of this update is focused on - Verify the originating issue is resolved. - Verify the changed behavior over the install of all system types. - Verify the changed behavior with an uncontrolled reboot of each node type for all the supported maintenance heartbeat failure action modes. Test Plan: PASS: Verify install of the following system types PASS: - AIO SX PASS: - AIO DX and AIO DX Plus PASS: - Standard DX with worker and storage nodes (vbox) PASS: - System Controller with 1 subcloud (dc-libvirt) PASS: Verify spontaneous reboot of unlocked active AIO controller with PASS: - heartbeat_failure_action=fail PASS: - heartbeat_failure_action=degrade PASS: - heartbeat_failure_action=alarm PASS: - heartbeat_failure_action=none PASS: Verify spontaneous reboot of unlocked standby AIO controller with PASS: - heartbeat_failure_action=fail PASS: - heartbeat_failure_action=degrade PASS: - heartbeat_failure_action=alarm PASS: - heartbeat_failure_action=none PASS: Verify reboot recovery after spontaneous reboot of worker PASS: Verify reboot recovery after spontaneous reboot of storage PASS: Verify start host services is run on mtcClient process startup. PASS: Verify start host services is run on worker and storage nodes when rebooted with all heartbeat failure recovery action modes. 
Regression: PASS: Verify degrade and alarm management over in-service heartbeat failure when heartbeat_failure_action=fail PASS: Verify degrade and alarm management over in-service heartbeat failure when heartbeat_failure_action=degrade PASS: Verify degrade and alarm management over in-service heartbeat failure when heartbeat_failure_action=alarm PASS: Verify no alarm or degrade over in-service heartbeat failure when heartbeat_failure_action=none PASS: Verify mtcClient over AIO standby controller lock/unlock PASS: Verify start host services is run on mtcClient on every node by command from mtcAgent process startup. PASS: Verify start host services is run on mtcClient over a unlock or graceful recovery by command from mtcAgent. PASS: Verify start host services check follows goenabled test completion on process startup. PASS: Verify stop host services is run over a node lock. PASS: Verify goenable main and subfunction failure handling PASS: Verify start host services failure handling PASS: Verify no coredump or crashdumps PASS: Verify no stuck alarms Closes-Bug: 2067917 Change-Id: Ie8aaf5da20b092267f637ad3df125019c244991b Signed-off-by: Eric MacDonald --- mtce/src/maintenance/mtcNodeComp.cpp | 84 ++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index f5bad581..df2330ee 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -1386,9 +1386,17 @@ int daemon_init ( string iface, string nodetype_str ) * restart in the no-reboot patching case */ if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) == false ) { + ilog ("posting main-function goenable tests"); ctrl.posted_script_set.push_front(GOENABLED_MAIN_SCRIPTS); } + if (( ctrl.nodetype & CONTROLLER_TYPE ) && + ( ctrl.system_type != SYSTEM_TYPE__NORMAL ) && + ( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == false )) + { + ilog ("posting sub-function 
goenable tests"); + ctrl.posted_script_set.push_back(GOENABLED_SUBF_SCRIPTS); + } return (rc) ; } @@ -1398,6 +1406,10 @@ void daemon_service_run ( void ) int rc = PASS ; int file_not_present_count = 0 ; + /* Bool to track whether the start host services scripts run has + * been attempted at least once since last process startup. */ + bool start_host_services_needs_to_be_run = true ; + if ( daemon_is_file_present ( NODE_RESET_FILE ) ) { wlog ("mtce reboot required"); @@ -1897,6 +1909,78 @@ void daemon_service_run ( void ) daemon_signal_hdlr (); } } + + if (( start_host_services_needs_to_be_run == true) && + ( ctrl.posted_script_set.size() == 0 )) + { + bool run_start_host_services = false ; + dlog1 ("Start Host Services needs to be run"); + if ( ctrl.system_type == SYSTEM_TYPE__NORMAL ) + { + /* Any node on a standard system */ + if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) ) + { + ilog ("start host services on standard system accepted"); + run_start_host_services = true ; + } + else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ) + { + /* Don't run start host services if any goenabled failed */ + wlog ("start host services on standard system rejected ; goenabled failed"); + start_host_services_needs_to_be_run = false ; + } + } + else if ( ctrl.nodetype & CONTROLLER_TYPE ) + { + /* AIO controller */ + if ( daemon_is_file_present ( GOENABLED_SUBF_PASS ) ) + { + ilog ("start host services on all-in-one controller accepted"); + run_start_host_services = true ; + } + else if (( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) || + ( daemon_is_file_present ( GOENABLED_SUBF_FAIL )))) + { + /* Don't run start host services if any goenabled failed */ + wlog ("start host services on all-in-one controller rejected ; goenabled failed "); + start_host_services_needs_to_be_run = false ; + } + } + else + { + /* AIO plus : worker and storage */ + if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) ) + { + ilog ("start host services on all-in-one plus node accepted"); + 
run_start_host_services = true ; + } + else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ) + { + /* Don't run start host services if any goenabled failed */ + wlog ("start host services on all-in-one plus node rejected ; goenabled failed"); + start_host_services_needs_to_be_run = false ; + } + } + + if ( run_start_host_services ) + { + ctrl.posted_script_set.push_back ( HOSTSERVICES_SCRIPTS ); + + int cmd = MTC_CMD_NONE ; + if ( ctrl.nodetype & CONTROLLER_TYPE) + cmd = MTC_CMD_START_CONTROL_SVCS ; + else if ( ctrl.nodetype & WORKER_TYPE ) + cmd = MTC_CMD_START_WORKER_SVCS ; + else if ( ctrl.nodetype & STORAGE_TYPE ) + cmd = MTC_CMD_START_STORAGE_SVCS ; + + ctrl.hostservices.posted = cmd ; + ctrl.hostservices.monitor = MTC_CMD_NONE ; + ilog ("posted start host services ; from process startup ; cmd:%s", get_mtcNodeCommand_str(cmd)); + + start_host_services_needs_to_be_run = false ; + } + } daemon_signal_hdlr (); } daemon_exit();