From 08c8c657951a128c6c4a08ddc014625924bc0279 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 24 Aug 2020 19:31:42 -0400 Subject: [PATCH] Debounce SM Unhealthy state notification Maintenance doesn't give an unhealthy Service Mgmnt (SM) process enough time to attempt a self recovery before failing and rebooting the controller it runs on. This update adds a small debounce to delay mtce's reaction to SM's unhealthy state notification. Only if the failure state persists for longer than 6 back-to-back mtcAlive messages, approximately 30 secs, will maintenance fail and recover the node through reboot. Change-Id: Ica1b0925f0c767001d80e6a3b9928a6761b0c00f Closes-Bug: #1892789 Signed-off-by: Eric MacDonald --- mtce/src/maintenance/mtcCompMsg.cpp | 119 +++++++++++++++++----------- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 0d57e2e5..2cac1bf4 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -707,66 +707,89 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na ****************************************************************************/ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int interface ) { - struct timespec ts ; - clock_gettime (CLOCK_MONOTONIC, &ts ); + static int _sm_unhealthy_debounce_counter [MAX_IFACES] = {0,0} ; - /* Get health state of the host - presently limited to the following - * - * during boot = 
NODE_HEALTH_UNKNOWN + * /var/run/.config_pass = NODE_HEALTHY + * /var/run/.config_fail = NODE_UNHEALTHY + * + * */ - /* Insert the host uptime */ - msg.parm[MTC_PARM_UPTIME_IDX] = ts.tv_sec ; + /* Init the message buffer */ + MEMSET_ZERO (msg); + snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_worker_msg_header()); + msg.cmd = cmd ; + msg.num = MTC_PARM_MAX_IDX ; - /* Insert the host health - TO BE OBSOLTETED */ - msg.parm[MTC_PARM_HEALTH_IDX] = get_node_health( get_hostname() ) ; + /* Insert the host uptime */ + msg.parm[MTC_PARM_UPTIME_IDX] = ts.tv_sec ; - /* Insert the mtce flags */ - msg.parm[MTC_PARM_FLAGS_IDX] = 0 ; - if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ; - if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_NOT_HEALTHY ; - if ( daemon_is_file_present ( CONFIG_PASS_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_HEALTHY ; - if ( daemon_is_file_present ( NODE_LOCKED_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_LOCKED ; - if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLED ; - if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHING ; - if ( daemon_is_file_present ( NODE_IS_PATCHED_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHED ; + /* Insert the host health - TO BE OBSOLTETED */ + msg.parm[MTC_PARM_HEALTH_IDX] = get_node_health( get_hostname() ) ; - /* manage the worker subfunction flag */ - if ( is_subfunction_worker () == true ) + /* Insert the mtce flags */ + msg.parm[MTC_PARM_FLAGS_IDX] = 0 ; + if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ; + if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_NOT_HEALTHY ; + if ( daemon_is_file_present ( CONFIG_PASS_FILE ) ) + 
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_HEALTHY ; + if ( daemon_is_file_present ( NODE_LOCKED_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_LOCKED ; + if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLED ; + if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHING ; + if ( daemon_is_file_present ( NODE_IS_PATCHED_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHED ; + + /* manage the worker subfunction flag */ + if ( is_subfunction_worker () == true ) + { + if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) ) { - if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) ) - { - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_CONFIGURED ; + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_CONFIGURED ; - /* Only set the go enabled subfunction flag if the pass file only exists */ - if (( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) && - ( daemon_is_file_present ( GOENABLED_SUBF_FAIL ) == false )) - { - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLED ; - } + /* Only set the go enabled subfunction flag if the pass file only exists */ + if (( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) && + ( daemon_is_file_present ( GOENABLED_SUBF_FAIL ) == false )) + { + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLED ; } } + } - if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) ) - msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ; - if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) ) + if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ; + + if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) ) + { + /* debounce 6 mtcAlive messages = ~25-30 second debounce */ + #define MAX_SM_UNHEALTHY_DEBOUNCE (6) + if ( ++_sm_unhealthy_debounce_counter[interface] > MAX_SM_UNHEALTHY_DEBOUNCE ) + { + wlog("SM Unhealthy flag set (%s)", + 
get_iface_name_str(interface)); msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ; + } + else + { + wlog("SM Unhealthy debounce %d of %d (%s)", + _sm_unhealthy_debounce_counter[interface], + MAX_SM_UNHEALTHY_DEBOUNCE, + get_iface_name_str(interface)); + } + } + else + { + _sm_unhealthy_debounce_counter[interface] = 0 ; + } /* add the interface and sequence number to the mtcAlice message */ identity.append ( ",\"interface\":\"");