Debounce SM Unhealthy state notification
Maintenance doesn't give an unhealthy Service Management (SM) process enough time to attempt a self-recovery before failing and rebooting the controller it runs on. This update adds a small debounce to delay mtce's reaction to SM's unhealthy state notification. Only if the failure state persists for longer than 6 back-to-back mtcAlive messages, approximately 30 secs, will maintenance fail and recover the node through reboot. Change-Id: Ica1b0925f0c767001d80e6a3b9928a6761b0c00f Closes-Bug: #1892789 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
b4e935a631
commit
08c8c65795
@ -707,66 +707,89 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
|
||||
****************************************************************************/
|
||||
int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int interface )
|
||||
{
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
static int _sm_unhealthy_debounce_counter [MAX_IFACES] = {0,0} ;
|
||||
|
||||
/* Get health state of the host - presently limited to the following
|
||||
*
|
||||
* during boot = NODE_HEALTH_UNKNOWN
|
||||
* /var/run/.config_pass = NODE_HEALTHY
|
||||
* /var/run/.config_fail = NODE_UNHEALTHY
|
||||
*
|
||||
* */
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
|
||||
/* Init the message buffer */
|
||||
MEMSET_ZERO (msg);
|
||||
snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_worker_msg_header());
|
||||
msg.cmd = cmd ;
|
||||
msg.num = MTC_PARM_MAX_IDX ;
|
||||
/* Get health state of the host - presently limited to the following
|
||||
*
|
||||
* during boot = NODE_HEALTH_UNKNOWN
|
||||
* /var/run/.config_pass = NODE_HEALTHY
|
||||
* /var/run/.config_fail = NODE_UNHEALTHY
|
||||
*
|
||||
* */
|
||||
|
||||
/* Insert the host uptime */
|
||||
msg.parm[MTC_PARM_UPTIME_IDX] = ts.tv_sec ;
|
||||
/* Init the message buffer */
|
||||
MEMSET_ZERO (msg);
|
||||
snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_worker_msg_header());
|
||||
msg.cmd = cmd ;
|
||||
msg.num = MTC_PARM_MAX_IDX ;
|
||||
|
||||
/* Insert the host health - TO BE OBSOLETED */
|
||||
msg.parm[MTC_PARM_HEALTH_IDX] = get_node_health( get_hostname() ) ;
|
||||
/* Insert the host uptime */
|
||||
msg.parm[MTC_PARM_UPTIME_IDX] = ts.tv_sec ;
|
||||
|
||||
/* Insert the mtce flags */
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] = 0 ;
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ;
|
||||
if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_NOT_HEALTHY ;
|
||||
if ( daemon_is_file_present ( CONFIG_PASS_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_HEALTHY ;
|
||||
if ( daemon_is_file_present ( NODE_LOCKED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_LOCKED ;
|
||||
if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLED ;
|
||||
if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHING ;
|
||||
if ( daemon_is_file_present ( NODE_IS_PATCHED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHED ;
|
||||
/* Insert the host health - TO BE OBSOLETED */
|
||||
msg.parm[MTC_PARM_HEALTH_IDX] = get_node_health( get_hostname() ) ;
|
||||
|
||||
/* manage the worker subfunction flag */
|
||||
if ( is_subfunction_worker () == true )
|
||||
/* Insert the mtce flags */
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] = 0 ;
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ;
|
||||
if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_NOT_HEALTHY ;
|
||||
if ( daemon_is_file_present ( CONFIG_PASS_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_HEALTHY ;
|
||||
if ( daemon_is_file_present ( NODE_LOCKED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_LOCKED ;
|
||||
if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLED ;
|
||||
if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHING ;
|
||||
if ( daemon_is_file_present ( NODE_IS_PATCHED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__PATCHED ;
|
||||
|
||||
/* manage the worker subfunction flag */
|
||||
if ( is_subfunction_worker () == true )
|
||||
{
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) )
|
||||
{
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) )
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_CONFIGURED ;
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_CONFIGURED ;
|
||||
|
||||
/* Only set the go enabled subfunction flag if the pass file only exists */
|
||||
if (( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) &&
|
||||
( daemon_is_file_present ( GOENABLED_SUBF_FAIL ) == false ))
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLED ;
|
||||
}
|
||||
/* Only set the go enabled subfunction flag if the pass file only exists */
|
||||
if (( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) &&
|
||||
( daemon_is_file_present ( GOENABLED_SUBF_FAIL ) == false ))
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLED ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
|
||||
if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
|
||||
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
|
||||
|
||||
if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
|
||||
{
|
||||
/* debounce 6 mtcAlive messages = ~25-30 second debounce */
|
||||
#define MAX_SM_UNHEALTHY_DEBOUNCE (6)
|
||||
if ( ++_sm_unhealthy_debounce_counter[interface] > MAX_SM_UNHEALTHY_DEBOUNCE )
|
||||
{
|
||||
wlog("SM Unhealthy flag set (%s)",
|
||||
get_iface_name_str(interface));
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ;
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog("SM Unhealthy debounce %d of %d (%s)",
|
||||
_sm_unhealthy_debounce_counter[interface],
|
||||
MAX_SM_UNHEALTHY_DEBOUNCE,
|
||||
get_iface_name_str(interface));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
_sm_unhealthy_debounce_counter[interface] = 0 ;
|
||||
}
|
||||
|
||||
/* add the interface and sequence number to the mtcAlive message */
|
||||
identity.append ( ",\"interface\":\"");
|
||||
|
Loading…
Reference in New Issue
Block a user