From ba6c61584d62f4584dab6c8187f5973211e7e720 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 11 May 2021 12:15:42 -0400 Subject: [PATCH] Refactor background in-service start host services handling The maintenance add_handler fsm loads inventory and recovers host state over a process restart. If the active controller's uptime is less than 15 minutes the restart event is treated as a Dead Office Recovery (DOR) and is more forgiving to host recovery by scheduling the 'start host services' as a background operation so as to not hold up the add operation. The current implementation of the background handling of 'start host services' is not handling the AIO subfunction case properly in DOR mode as well as being difficult to follow and therefore to fix and maintain. This mishandling leads to maintenance incorrectly failing the node with a subfunction configuration error over the DOR case. This update refactors the background handling of 'start host services' to fix the issue and improve its clarity and maintainability. Test Cases: PASS: Verify AIO DX DOR handling PASS: Verify AIO DX active controller reboot handling - standby with uptime ; < 15 min and > 15 min PASS: Verify AIO DX standby controller reboot handling PASS: Verify subfunction configuration error handling Regression: PASS: Verify start host services wait/retry handling. PASS: Verify start host services failure handling. 
PASS: Verify DOR of Standard system PASS: Verify DOR of AIO Plus system PASS: Verify AIO System Install PASS: Verify Standard System Install PASS: Verify AIO plus system install Change-Id: Ia4683672e3a2852b5b4837167b2dcd2a1e4e6d57 Closes-Bug: 1928095 Signed-off-by: Eric MacDonald --- mtce/src/maintenance/mtcNodeHdlrs.cpp | 165 ++++++++++++++++---------- 1 file changed, 101 insertions(+), 64 deletions(-) diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 49ac2684..6871bf93 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -7400,27 +7400,44 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } /************************************************************* - * Handle Start Host Services if its posted for execution + * Handle Main Function Start Host Services if it's 'needed' ************************************************************/ else if ( node_ptr->start_services_needed == true ) { - /* If Main Start Host Services is not already running then launch it */ - if (( node_ptr->start_services_running_main == false ) && - ( node_ptr->start_services_running_subf == false )) + /* If Main Start Host Services is not already running + * then launch it */ + if ( node_ptr->start_services_running_main == false ) { - bool start = true ; - if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS ) + /* Only launch if the node is successfully configured + * and tested */ + if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && + ( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) && + ( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED )) { - node_ptr->hostservices_failed = true ; - node_ptr->start_services_retries++ ; + /* Launch 'start' for this node type */ + bool start = true ; + if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS ) + { + /* failed -> retry */ + node_ptr->hostservices_failed = true ; + 
node_ptr->start_services_running_main = false ; + node_ptr->start_services_retries++ ; + } + else + { + /* launched successfully */ + node_ptr->start_services_running_main = true ; + node_ptr->hostservices_failed = false ; + } } else { - node_ptr->start_services_running_main = true ; + ilog("%s start host services ; waiting to launch (%x)", + node_ptr->hostname.c_str(), + node_ptr->mtce_flags); } } - /* Handle start host services response for both main and - * subfunction levels */ + /* Handle Main start host services response */ else { /* Wait for host services to complete - pass or fail. @@ -7431,23 +7448,81 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) /* wait for the mtcClient's response ... */ break ; } - - node_ptr->start_services_running_main = false ; - - if ( rc != PASS ) + else if ( rc != PASS ) { - - /* set the correct failed flag */ - if ( node_ptr->start_services_needed_subf == true ) + node_ptr->hostservices_failed = true ; + node_ptr->start_services_retries++ ; + wlog ("%s %s request failed ; (retry %d)\n", + node_ptr->hostname.c_str(), + node_ptr->host_services_req.name.c_str(), + node_ptr->start_services_retries); + } + else /* success path */ + { + node_ptr->start_services_needed = false ; + node_ptr->hostservices_failed = false ; + node_ptr->start_services_retries = 0 ; + } + node_ptr->start_services_running_main = false ; + } + } + /************************************************************* + * Handle Sub Function Start Host Services if it's 'needed' + ************************************************************/ + else if ( node_ptr->start_services_needed_subf == true ) + { + /* If Subf Start Host Services is not already running + * then launch it */ + if ( node_ptr->start_services_running_subf == false ) + { + /* Only launch if the node and subfunction are + * successfully configured and tested */ + if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && + ( node_ptr->mtce_flags & 
MTC_FLAG__I_AM_CONFIGURED ) && + ( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ) && + ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) && + ( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLED )) + { + /* Launch 'start' for this subfunction type */ + bool start = true ; + bool subf = true ; + if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) { - node_ptr->start_services_running_subf = false ; + /* failed -> retry */ node_ptr->hostservices_failed_subf = true ; + node_ptr->start_services_running_subf = false ; + node_ptr->start_services_retries++ ; } else { - node_ptr->hostservices_failed = true ; + /* launched successfully */ + node_ptr->hostservices_failed_subf = false ; + node_ptr->start_services_running_subf = true ; } - + } + else + { + ilog("%s subf start host services ; waiting to launch (%x)", + node_ptr->hostname.c_str(), + node_ptr->mtce_flags); + } + } + /* Handle Subf start host services response */ + else + { + /* Wait for host services to complete - pass or fail. + * The host_services_handler manages timeout. */ + int rc = this->host_services_handler ( node_ptr ); + if ( rc == RETRY ) + { + /* wait for the mtcClient's response ... 
*/ + break ; + } + node_ptr->start_services_running_subf = false ; + if ( rc != PASS ) + { + node_ptr->start_services_running_subf = false ; + node_ptr->hostservices_failed_subf = true ; node_ptr->start_services_retries++ ; wlog ("%s %s request failed ; (retry %d)\n", @@ -7457,52 +7532,14 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } else /* success path */ { - /* clear the correct fail flag */ - if (( node_ptr->start_services_needed_subf == true ) && - ( node_ptr->start_services_running_subf == true )) - { - node_ptr->start_services_needed_subf = false ; - node_ptr->start_services_running_subf = false ; - node_ptr->hostservices_failed_subf = false ; - } - else - { - node_ptr->hostservices_failed = false ; - } - - /************************************************* - * Handle running the subfunction start worker - * host services command as a background operation - * after the controller start result has come in - * as a PASS. - ************************************************/ - if ( node_ptr->start_services_needed_subf == true ) - { - bool start = true ; - bool subf = node_ptr->start_services_needed_subf ; - if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) - { - node_ptr->hostservices_failed_subf = true ; - - /* try again on next audit */ - node_ptr->start_services_retries++ ; - } - else - { - node_ptr->start_services_running_subf = true ; - } - } - else - { - /* All host service scripts pass ; done */ - clear_hostservices_ctls ( node_ptr ); - node_ptr->hostservices_failed_subf = false ; - node_ptr->hostservices_failed = false ; - } + node_ptr->start_services_needed_subf = false ; + node_ptr->hostservices_failed_subf = false ; + node_ptr->start_services_running_subf = false ; + node_ptr->start_services_retries = 0 ; } + node_ptr->start_services_running_subf = false ; } } - if ( NOT_THIS_HOST ) { if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||