diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index e1a0572a..046db72a 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -397,10 +397,9 @@ void mtc_stages_init ( void ) recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start"; recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak"; recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change"; - recoveryStages_str[MTC_RECOVERY__ENABLE_START ] = "Enable-Start"; recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure"; recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait"; - recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT ] = "Enable-Wait"; + recoveryStages_str[MTC_RECOVERY__ENABLE ] = "Enable"; recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown"; disableStages_str [MTC_DISABLE__START ] = "Disable-Start"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index bf7abb8e..875f3551 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -948,7 +948,7 @@ typedef enum string get_delStages_str ( mtc_delStages_enum stage ); -#define MTC_MAX_FAST_ENABLES (3) +#define MTC_MAX_FAST_ENABLES (5) typedef enum { MTC_RECOVERY__START = 0, @@ -974,10 +974,9 @@ typedef enum MTC_RECOVERY__HEARTBEAT_START, MTC_RECOVERY__HEARTBEAT_SOAK, MTC_RECOVERY__STATE_CHANGE, - MTC_RECOVERY__ENABLE_START, MTC_RECOVERY__FAILURE, MTC_RECOVERY__WORKQUEUE_WAIT, - MTC_RECOVERY__ENABLE_WAIT, + MTC_RECOVERY__ENABLE, MTC_RECOVERY__STAGES, } mtc_recoveryStages_enum ; diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 8d77fb3a..f705f095 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -4927,17 +4927,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa } if ( temp_count != mnfa_host_count[iface] ) - { + { slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), mnfa_host_count[iface], temp_count ); mnfa_host_count[iface] = temp_count ; mnfa_host_count[iface] = temp_count ; - } + } else { - wlog ("%s MNFA host tally (%s:%d)\n", + dlog ("%s MNFA host tally (%s:%d)\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), mnfa_host_count[iface] ); @@ -5903,9 +5903,6 @@ int nodeLinkClass::critical_process_failed( string & hostname, node_ptr->hostname.c_str()); /* dlog */ } - /* Start fresh the next time we enter graceful recovery handler */ - node_ptr->graceful_recovery_counter = 0 ; - /* Set node as unlocked-disabled-failed */ allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, MTC_OPER_STATE__DISABLED, @@ -6863,7 +6860,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr, } /** Validate and log Recovery stage changes */ -int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, +int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, mtc_recoveryStages_enum newHdlrStage ) { int rc = PASS ; @@ -6871,14 +6868,14 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, if (( newHdlrStage >= MTC_RECOVERY__STAGES ) || ( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES )) { - slog ("%s Invalid recovery stage (%d:%d)\n", + slog ("%s Invalid recovery stage (%d:%d)\n", node_ptr->hostname.c_str(), - node_ptr->recoveryStage, + node_ptr->recoveryStage, newHdlrStage ); if ( newHdlrStage < MTC_RECOVERY__STAGES ) { - clog ("%s ? -> %s\n", + clog ("%s ? -> %s\n", node_ptr->hostname.c_str(), get_recoveryStages_str(newHdlrStage).c_str()); @@ -6890,11 +6887,11 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, rc = FAIL ; } } - else + else { - clog ("%s %s -> %s\n", + clog ("%s %s -> %s\n", node_ptr->hostname.c_str(), - get_recoveryStages_str(node_ptr->recoveryStage).c_str(), + get_recoveryStages_str(node_ptr->recoveryStage).c_str(), get_recoveryStages_str(newHdlrStage).c_str()); node_ptr->recoveryStage = newHdlrStage ; diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index b898c375..8f6ce2f3 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1660,7 +1660,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) * 2. Setting the node operational state to Disabled * 3. Setting the Enable action */ - if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) + node_ptr->graceful_recovery_counter++ ; + if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) { /* gate off further mtcAlive messaging timme the offline * handler runs. This prevents stale messages from making it @@ -2555,7 +2556,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else if ( rc == PASS ) { /* Start Graceful Recovery */ - recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ; + recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ; break ; } else if ( rc == FAIL_WORKQ_TIMEOUT ) @@ -2571,51 +2572,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) nodeLinkClass::force_full_enable ( node_ptr ); break ; } - case MTC_RECOVERY__ENABLE_START: + case MTC_RECOVERY__ENABLE: { - /* Create the recovery enable timer. This timer is short. - * A node need to stay enabled with the hartbeat service - * running for a period of time before declaring it enabled */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); - - recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ; - break; - } - case MTC_RECOVERY__ENABLE_WAIT: - { - /* When this timer fires the host has been up for enough time */ - if ( node_ptr->mtcTimer.ring == true ) + if ( is_controller(node_ptr) ) { - if ( is_controller(node_ptr) ) + if ( mtcSmgrApi_request ( node_ptr, + CONTROLLER_ENABLED, + SMGR_MAX_RETRIES ) != PASS ) { - if ( mtcSmgrApi_request ( node_ptr, - CONTROLLER_ENABLED, - SMGR_MAX_RETRIES ) != PASS ) - { - wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n", - node_ptr->hostname.c_str()); - } + wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n", + node_ptr->hostname.c_str()); } - /* Node Has Recovered */ - node_ptr->graceful_recovery_counter = 0 ; - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - node_ptr->health_threshold_counter = 0 ; - node_ptr->enabled_count++ ; - node_ptr->http_retries_cur = 0 ; - - doneQueue_purge ( node_ptr ); - if ( node_ptr->was_dor_recovery_mode ) - { - report_dor_recovery ( node_ptr , "is ENABLED" ); - } - else - { - plog ("%s is ENABLED (Gracefully Recovered)\n", - node_ptr->hostname.c_str()); - } - alarm_enabled_clear ( node_ptr, false ); } + /* Node Has Recovered */ + node_ptr->graceful_recovery_counter = 0 ; + recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + node_ptr->health_threshold_counter = 0 ; + node_ptr->enabled_count++ ; + node_ptr->http_retries_cur = 0 ; + + doneQueue_purge ( node_ptr ); + if ( node_ptr->was_dor_recovery_mode ) + { + report_dor_recovery ( node_ptr , "is ENABLED" ); + } + else + { + plog ("%s is ENABLED (Gracefully Recovered)\n", + node_ptr->hostname.c_str()); + } + alarm_enabled_clear ( node_ptr, false ); break ; } default: diff --git a/mtce/src/maintenance/mtcNodeMnfa.cpp b/mtce/src/maintenance/mtcNodeMnfa.cpp index af2493b1..8ebbc15c 100644 --- a/mtce/src/maintenance/mtcNodeMnfa.cpp +++ b/mtce/src/maintenance/mtcNodeMnfa.cpp @@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mnfa_graceful_recovery == true ) { - /* Restart the heartbeat for this recovered host */ - // send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS ); - if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) { - ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str()); - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); + ilog ("%s graceful recovery (graceful recover count:%d)", + node_ptr->hostname.c_str(), + node_ptr->graceful_recovery_counter); } else { - wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() ); + wlog ("%s graceful recovery restart (graceful recover count:%d)", + node_ptr->hostname.c_str(), + node_ptr->graceful_recovery_counter ); } + recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); } } @@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force ) * Clear heartbeat degrades */ for ( struct node * ptr = head ; ; ptr = ptr->next ) { - if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) || - ( ptr->hbs_minor[MGMNT_IFACE] == true )) && - ( ptr->operState == MTC_OPER_STATE__ENABLED )) + std::list::iterator mnfa_awol_ptr ; + for ( mnfa_awol_ptr = mnfa_awol_list.begin() ; + mnfa_awol_ptr != mnfa_awol_list.end() ; + mnfa_awol_ptr++ ) { - ptr->hbs_minor[MGMNT_IFACE] = false ; - ptr->hbs_minor[CLSTR_IFACE] = false ; + /* skip host if not in the mnfa pool */ + if ( ptr->hostname.compare(*(mnfa_awol_ptr)) ) + continue ; - if ( force == true ) + if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) || + ( ptr->hbs_minor[MGMNT_IFACE] == true )) && + ( ptr->operState == MTC_OPER_STATE__ENABLED )) { - elog ("... %s failed ; auto-recovering\n", - ptr->hostname.c_str()); + ptr->hbs_minor[MGMNT_IFACE] = false ; + ptr->hbs_minor[CLSTR_IFACE] = false ; - /* Set node as failed */ - availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED ); - enableStageChange ( ptr, MTC_ENABLE__START ); - adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE ); - } - else - { - if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) + if ( force == true ) { - if ( ptr->degrade_mask == 0 ) - { - availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } + elog ("... %s failed ; auto-recovering\n", + ptr->hostname.c_str()); - if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) - { - recoveryStageChange ( ptr, MTC_RECOVERY__START ); - adminActionChange ( ptr, MTC_ADMIN_ACTION__RECOVER ); + /* Set node as failed */ + availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED ); + enableStageChange ( ptr, MTC_ENABLE__START ); + adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE ); } else { - wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() ); + mnfa_recover_host ( ptr ); } } + break ; } if (( ptr->next == NULL ) || ( ptr == tail )) break ;