Add alarmed process audit to Process Monitor

A failure to query process monitor alarms from
FM during process startup can lead to a stuck
failed process alarm.

Rather than hold up the process monitor startup
sequence due to an unresponsive fault manager,
this update introduces an in-service alarm audit
that looks for asserted alarms and compares that
readout to the process monitor's runtime view.

A difference in view is considered a state mismatch
that requires corrective action. The runtime state
of the process monitor always takes precedence over
what is found in the FM database.

A mismatch is declared and corrective action is
taken if:

 - FM has a process failure alarm for a process that
   pmond does not consider failed.
   Corrective Action: Clear alarm in FM database

 - FM has a process failure alarm with a severity
   that differs from the pmond runtime state.
   Corrective Action: Update severity in FM database

 - FM has a process failure alarm for a process
   that pmond does not recognize.
   Corrective Action: Clear alarm in FM database

This update only runs the audit from process startup
until the first successful FM alarm query.
A future update may enable the audit in-service.

Test Plan:

PASS: Verify all mismatch case handling
PASS: Verify handling of valid active alarm
PASS: Verify handling severity mismatch ; unsupported
PASS: Verify pmond failure handling regression soak
PASS: Verify pmond process restart regression soak
PASS: Verify alarm handling over pmond process restart
PASS: Verify alarmed state audit period and logging
PASS: Verify pmond process failure alarm remains ignored by pmond
PASS: Verify handling of persistently failed process over pmond restart
PASS: Verify audit handling while FM is not running
      - audit retries every 50 seconds until fm query is successful

COND: Verify audit handling while FM is stopped/blocked/stalled
      - alarm query blocks till fm runs again or is killed
      - this is the reason the audit is not run in-service.

Change-Id: I697faa804dc7979fbb8b6f6c63811a6dda8c3118
Closes-Bug: 1892884
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-02-24 12:36:31 -05:00
parent edafeef57a
commit 6cf5e84825
4 changed files with 176 additions and 95 deletions

View File

@ -231,6 +231,7 @@ typedef struct
recovery_method_type recovery_method ; /**< How processes are recovered */
bool reload_config ;
bool patching_in_progress ;
bool last_alarm_query_pass;
} pmon_ctrl_type ;
void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );

View File

@ -38,14 +38,14 @@ void pmonAlarm_init ( void )
alarmUtil_type * ptr ;
/** Process Failure Alarm ****************************************************/
ptr = &alarm_list[PMON_ALARM_ID__PMOND];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID);
ptr->name = "process failure" ;
ptr->instc_prefix = "process=" ;
ptr->critl_reason = "";
ptr->minor_reason = "";
ptr->major_reason = "";
@ -56,12 +56,12 @@ void pmonAlarm_init ( void )
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If problem consistently occurs after Host is locked and unlocked then "
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If problem consistently occurs after Host is locked and unlocked then "
"contact next level of support for root cause analysis and recovery.");
}
@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )
/******************************************************************************
*
* Name : manage_queried_alarms
* Name : query_alarms
*
* Description: query FM for all the existing process monitor alarms and build
* up the callers 'saved_alarm_list' with those process names and
* corresponding severity.
*
* Assumptions: If the hostname is passed in as not empty then assume the clear
* is requested.
*
* Updates : callers saved_alarm_list
*
* Returns : PASS if FM returns no error
* FAIL_REQUEST ... alarmUtil_query_identity failed
* FAIL_OPERATION ... fm_get_fault failed
* FAIL_NULL_POINTER ... failed to get memory
*
******************************************************************************/
void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
int query_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
{
static const char HOSTNAME_LABEL [] = "host=" ;
static const char PROCNAME_LABEL [] = ".process=" ;
int rc = FAIL ;
saved_alarm_list.clear();
/**
* Query all the pmon alarms and if there is an alarm for a
* process that is functioing properly then clear the alarm.
**/
SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
if ( alarm_list_ptr )
{
if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
/* Query all the pmon alarms */
rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
if ( rc == RETRY )
{
dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
rc = PASS ;
}
else if ( rc == PASS )
{
for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
{
/* loop over each active alarm and maintain its activity state */
if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
{
int rc ;
AlarmFilter alarm_filter ;
SFmAlarmDataT alarm_query ;
memset(&alarm_query, 0, sizeof(alarm_query));
@ -139,34 +147,49 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
{
string entity = alarm_filter.entity_instance_id ;
size_t pos = entity.find("process=");
if ( pos != std::string::npos )
{
string pn = entity.substr(pos+strlen("process="));
ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
rc = PASS ;
/* filter out 'process=pmond' as that alarm is handled by hbsAgent */
if ( pn.compare("pmond") )
string entity = alarm_filter.entity_instance_id ;
size_t pos_hn = entity.find(HOSTNAME_LABEL);
size_t pos_pn = entity.find(PROCNAME_LABEL);
if (( pos_hn != std::string::npos ) &&
( pos_pn != std::string::npos ))
{
string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL));
string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
/* verify hostname */
if ( ( hn.length() == 0 ) || ( hn != hostname ) )
{
if ( !hostname.empty() )
{
pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
}
else
{
active_process_alarms_type this_alarm ;
this_alarm.process = pn ;
this_alarm.severity = alarm_query.severity ;
saved_alarm_list.push_front ( this_alarm );
}
/* ignore alarms not for this host */
dlog ("%s %s %s alarm not for this host",
entity.c_str(),
hn.c_str(),
pn.c_str());
continue ;
}
dlog ("%s alarm is %s (process:%s)\n",
alarm_filter.entity_instance_id,
alarmUtil_getSev_str(alarm_query.severity).c_str(),
pn.c_str());
/* filter out 'process=pmond'
* ... that alarm is handled by hbsAgent */
if ( pn != MTC_SERVICE_PMOND_NAME )
{
active_process_alarms_type this_alarm ;
this_alarm.process = pn ;
this_alarm.severity = alarm_query.severity ;
saved_alarm_list.push_front ( this_alarm );
}
}
}
else
{
ilog ("fm_get_fault failed (rc:%d)\n", rc );
wlog ("fm_get_fault failed (rc:%d)\n", rc );
rc = FAIL_OPERATION ;
break ;
}
}
else
@ -174,10 +197,21 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
dlog2 ("last entry %d\n", i);
break ;
}
}
} /* for loop */
}
else
{
wlog("failed to query alarms from fm ; rc:%d", rc);
rc = FAIL_REQUEST ;
}
free(alarm_list_ptr);
}
else
{
elog ("unable to allocate memory for alarm list");
rc = FAIL_NULL_POINTER ;
}
return (rc);
}
/************************* A L A R M I N G **************************/

View File

@ -37,8 +37,10 @@ typedef struct
EFmAlarmSeverityT severity ;
} active_process_alarms_type ;
/* Clear any pending alarms if the specified hostname is valid */
void manage_queried_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
/* Query FM for a list of Process Monitor (200.006) alarms */
int query_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
void alarmed_process_audit ( void );
void pmonAlarm_init ( void );

View File

@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
std::list<string> config_files ;
std::list<string>::iterator string_iter_ptr ;
/* If there is an alarm in the list that matches one in the process list
* then update that process with its severity and failed state.
* If there is a process in the saved list that is not in the process list
* then clear its alarm as it is no longer valid.
*/
void manage_process_alarms ( list<active_process_alarms_type> & _list,
process_config_type * const ptr,
int const processes );
static process_config_type process_config[MAX_PROCESSES] ;
/* lookup process control by index and return its pointer if found.
@ -216,6 +207,7 @@ void pmon_timer_init ( void )
/* Init the timer for this process */
mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
}
_pmon_ctrl_ptr->last_alarm_query_pass = false ;
}
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
@ -371,7 +363,7 @@ void init_process_config_memory ( void )
* all the process config files from /etc/pmon.d */
void load_processes ( void )
{
list<active_process_alarms_type> saved_alarm_list ;
list<active_process_alarms_type> queried_alarm_list ;
int rc = PASS ;
@ -385,10 +377,6 @@ void load_processes ( void )
close_process_socket ( &process_config[i] );
}
/* Query fm for existing pmon process alarms and
* for each that is found store their 'name' and
* 'severity' in the passed in saved list */
manage_queried_alarms ( saved_alarm_list );
/* init the process config memory */
init_process_config_memory ();
@ -454,13 +442,8 @@ void load_processes ( void )
}
_pmon_ctrl_ptr->reload_config = false ;
/* If there were process alarms that existed over the reload
* then ensure that those processes are updated with that information. */
if ( saved_alarm_list.size () )
{
ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
}
/* use the audit to clear pre-existing alarms at process startup */
alarmed_process_audit ();
}
@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
}
}
/************************************************************************
/***************************************************************************
*
* Name : manage_process_alarms
* Name : alarmed_process_audit
*
* Description: This interface manages process alarms over a process
* configuration reload
* Purpose : Verify the process state matches the queried alarm state
*
* Steps:
* Description: To correct process alarm state mismatches.
*
* 1. Loop over each item in the list and mark the process as failed
* with the specified severity level.
*
* 2. If the process is not found then clear its alarm as it is no
* longer a valid process in the new profile and we don't want a
* lingering stuck alarm.
*
*************************************************************************/
***************************************************************************/
void manage_process_alarms ( list<active_process_alarms_type> & _list,
process_config_type * const ptr,
int const processes )
void alarmed_process_audit ( void )
{
/* get out if the list is empty ; should not have been called if
* empty but ... just in case */
if ( ! _list.empty() )
/* Don't audit FM in service after the last query was successful.
* There is a blocking issue that needs to be dealt with */
if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
return ;
/*
* Query fm for existing pmon process alarms and
* for each that is found store their 'name' and
* 'severity' in the passed in queried_alarm_list.
*/
list<active_process_alarms_type> queried_alarm_list ;
int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
_pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
/* just return if query failed */
if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
return ;
if ( queried_alarm_list.size () )
{
list<active_process_alarms_type>::iterator _iter_ptr ;
alog ("audit found %ld active alarms", queried_alarm_list.size());
/* loop over the list ... */
for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
for ( _iter_ptr=queried_alarm_list.begin();
_iter_ptr!=queried_alarm_list.end();
++_iter_ptr )
{
/* for each item assum it is not found */
bool found = false ;
alog ("%s audit", _iter_ptr->process.c_str());
/* try and find this process in the new process profile */
for ( int i = 0 ; i < processes ; i++ )
/* find this process*/
for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
{
if ( ! _iter_ptr->process.compare((ptr+i)->process) )
{
/* If the process is found then mark it as failed and update its severity.
* At this point we then assume that there is an alarm raised for this process. */
found = true ;
process_config_type * ptr = &process_config[i];
(ptr+i)->failed = false ;
wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
if ( ! _iter_ptr->process.compare(ptr->process) )
{
found = true ;
if ( ptr->failed == false )
{
ilog ("%s stale alarm ; clearing",
_iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
_iter_ptr->process );
}
else if ( _iter_ptr->severity != ptr->alarm_severity )
{
wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
ptr->process,
alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
alarmUtil_getSev_str(ptr->alarm_severity).c_str());
if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
{
pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process, 0);
}
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
{
pmonAlarm_major(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process);
}
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
{
pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process);
}
else
{
wlog ("%s unexpected severity '%s' ; clearing alarm",
ptr->process,
ptr->severity);
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process );
}
}
else
{
alog ("%s is alarmed '%s' ; audit",
ptr->process,
ptr->severity);
}
}
}
/* if not found then just clear the alarm */
if ( found == false)
{
wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
wlog ("%s is not a monitored process ; clearing alarm",
_iter_ptr->process.c_str());
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
_iter_ptr->process );
}
}
}
}
void pmon_service ( pmon_ctrl_type * ctrl_ptr )
{
std::list<int> socks ;
@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
{
_get_events ();
mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
alarmed_process_audit ();
}
/* Run the degrade set/clear by audit */