Fix intermittent process failure alarms auto-clear issues

This commit addresses intermittent failures that occur when errors
are encountered while opening the files that hold the extra text for
specific processes. These errors led to mismatches between the
entity_instance_id of the created alarm and that of the alarm being
deleted.

With this commit, the extra text is appended to the alarm only when
the alarm is created; it is no longer considered when the system
attempts to remove the alarm. This change helps prevent the
mismatches caused by file errors and ensures alarms are handled
correctly.

Test plan
PASS: Build package.
PASS: Install package and bootstrap system.
PASS: Use Eric MacDonald's pmon regression tests to verify
      behavior.

closes-bug: 2078986

Change-Id: I622450c45770d251d62a80ccb964c65ce9e4d935
Signed-off-by: fperez <fabrizio.perez@windriver.com>
This commit is contained in:
fperez 2024-09-04 19:41:55 -03:00
parent b12122960c
commit e62642e97f

View File

@ -1496,20 +1496,12 @@ int manage_alarm ( process_config_type * ptr, int action )
pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ;
string processInfo = ptr->process;
// check for extra text
if((ptr->status_monitoring ) && (ptr->status_failure_text_file))
{
string extra_text = get_status_failure_text(ptr);
if(!extra_text.empty())
{
processInfo.append(" (");
processInfo.append(extra_text);
processInfo.append(")");
}
}
if ( action == PMON_CLEAR )
{
// Currently, there is no need to include the full entity_instance_id
// to delete an alarm. The process name (without additional text) will be sent instead
// to avoid mismatches due to file errors.
if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
ilog ("%s from '%s' to 'clear'\n", ptr->process, alarmUtil_getSev_str(ptr->alarm_severity).c_str());
@ -1534,6 +1526,17 @@ int manage_alarm ( process_config_type * ptr, int action )
}
else
{
// check for extra text.
if((ptr->status_monitoring ) && (ptr->status_failure_text_file))
{
string extra_text = get_status_failure_text(ptr);
if(!extra_text.empty())
{
processInfo.append(" (");
processInfo.append(extra_text);
processInfo.append(")");
}
}
if ( ptr->restart == true )
{
/* handle as error now rather than command */