From e62642e97f4665fb4a26ff5fe2549afe5622c367 Mon Sep 17 00:00:00 2001 From: fperez Date: Wed, 4 Sep 2024 19:41:55 -0300 Subject: [PATCH] Fix intermittent process failure alarms auto-clear issues This commit addresses the issue of intermittent failures that occur when errors are encountered while opening the files that provide extra text for specific processes. These errors led to mismatches between the entity_instance_id of the created alarm and that of the alarm being deleted. With this commit, the extra text is now appended to the alarm only when it is created, and it will not be considered when the system attempts to remove the alarm. This change helps prevent the mismatches caused by file errors and ensures that alarms are handled correctly. Test plan PASS: Build package. PASS: Install package and bootstrap system. PASS: Use Eric MacDonald's pmon regression tests to verify behavior. Closes-Bug: 2078986 Change-Id: I622450c45770d251d62a80ccb964c65ce9e4d935 Signed-off-by: fperez --- mtce/src/pmon/pmonHdlr.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 2abe1255..73d60d40 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -1496,20 +1496,12 @@ int manage_alarm ( process_config_type * ptr, int action ) pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ; string processInfo = ptr->process; - // check for extra text - if((ptr->status_monitoring ) && (ptr->status_failure_text_file)) - { - string extra_text = get_status_failure_text(ptr); - if(!extra_text.empty()) - { - processInfo.append(" ("); - processInfo.append(extra_text); - processInfo.append(")"); - } - } if ( action == PMON_CLEAR ) { + // Currently, there is no need to include the full entity_instance_id + // to delete an alarm. The process name (without additional text) will be sent instead + // to avoid mismatches due to file errors. 
if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { ilog ("%s from '%s' to 'clear'\n", ptr->process, alarmUtil_getSev_str(ptr->alarm_severity).c_str()); @@ -1534,6 +1526,17 @@ int manage_alarm ( process_config_type * ptr, int action ) } else { + // check for extra text. + if((ptr->status_monitoring ) && (ptr->status_failure_text_file)) + { + string extra_text = get_status_failure_text(ptr); + if(!extra_text.empty()) + { + processInfo.append(" ("); + processInfo.append(extra_text); + processInfo.append(")"); + } + } if ( ptr->restart == true ) { /* handle as error now rather than command */