Add alarmed process audit to Process Monitor
A failure to query process monitor alarms from FM during process startup can lead to a stuck failed process alarm. Rather than hold up the process monitor startup sequence due to an unresponsive fault manager, this update introduces an in-service alarm audit that looks for asserted alarms and compares that readout to the process monitor's runtime view. A difference in view is considered a state mismatch that requires corrective action. The runtime state of the process monitor always takes precedence over what is found in the FM database. A mismatch is declared and corrective action is taken if: - FM has a process failure alarm that pmond does not. Corrective Action: Clear alarm in FM database - FM has a process failure alarm with a severity that differs from the pmond runtime state. Corrective Action: Update severity in FM database - FM has a process failure alarm for a process that pmond does not recognize. Corrective Action: Clear alarm in FM database This update only runs the audit on process startup until the first successful query. A future update may enable the audit in-service. Test Plan: PASS: Verify all mismatch case handling PASS: Verify handling of valid active alarm PASS: Verify handling of severity mismatch ; unsupported PASS: Verify pmond failure handling regression soak PASS: Verify pmond process restart regression soak PASS: Verify alarm handling over pmond process restart PASS: Verify alarmed state audit period and logging PASS: Verify pmond process failure alarm remains ignored by pmond PASS: Verify handling of persistently failed process over pmond restart PASS: Verify audit handling while FM is not running - audit retries every 50 seconds until fm query is successful COND: Verify audit handling while FM is stopped/blocked/stalled - alarm query blocks till fm runs again or is killed - this is the reason the audit is not run in-service.
Change-Id: I697faa804dc7979fbb8b6f6c63811a6dda8c3118 Closes-Bug: 1892884 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
edafeef57a
commit
6cf5e84825
@ -231,6 +231,7 @@ typedef struct
|
||||
recovery_method_type recovery_method ; /**< How processes are recovered */
|
||||
bool reload_config ;
|
||||
bool patching_in_progress ;
|
||||
bool last_alarm_query_pass;
|
||||
|
||||
} pmon_ctrl_type ;
|
||||
void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );
|
||||
|
@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )
|
||||
|
||||
/******************************************************************************
|
||||
*
|
||||
* Name : manage_queried_alarms
|
||||
* Name : query_alarms
|
||||
*
|
||||
* Description: query FM for all the existing process monitor alarms and build
|
||||
* up the callers 'saved_alarm_list' with those process names and
|
||||
* corresponding severity.
|
||||
*
|
||||
* Assumptions: If the hostname is passed in as not empty then assume the clear
|
||||
* is requested.
|
||||
*
|
||||
* Updates : callers saved_alarm_list
|
||||
*
|
||||
* Returns : PASS if FM returns no error
|
||||
* FAIL_REQUEST ... alarmUtil_query_identity failed
|
||||
* FAIL_OPERATION ... fm_get_fault failed
|
||||
* FAIL_NULL_POINTER ... failed to get memory
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
|
||||
int query_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
|
||||
{
|
||||
static const char HOSTNAME_LABEL [] = "host=" ;
|
||||
static const char PROCNAME_LABEL [] = ".process=" ;
|
||||
|
||||
int rc = FAIL ;
|
||||
saved_alarm_list.clear();
|
||||
|
||||
/**
|
||||
* Query all the pmon alarms and if there is an alarm for a
|
||||
* process that is functioing properly then clear the alarm.
|
||||
**/
|
||||
SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
|
||||
if ( alarm_list_ptr )
|
||||
{
|
||||
if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
|
||||
/* Query all the pmon alarms */
|
||||
rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
|
||||
rc = PASS ;
|
||||
}
|
||||
else if ( rc == PASS )
|
||||
{
|
||||
for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
|
||||
{
|
||||
/* loop over each active alarm and maintain its activity state */
|
||||
if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
|
||||
{
|
||||
int rc ;
|
||||
AlarmFilter alarm_filter ;
|
||||
SFmAlarmDataT alarm_query ;
|
||||
memset(&alarm_query, 0, sizeof(alarm_query));
|
||||
@ -139,22 +147,36 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
|
||||
|
||||
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
|
||||
{
|
||||
string entity = alarm_filter.entity_instance_id ;
|
||||
size_t pos = entity.find("process=");
|
||||
if ( pos != std::string::npos )
|
||||
{
|
||||
string pn = entity.substr(pos+strlen("process="));
|
||||
ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
|
||||
rc = PASS ;
|
||||
|
||||
/* filter out 'process=pmond' as that alarm is handled by hbsAgent */
|
||||
if ( pn.compare("pmond") )
|
||||
string entity = alarm_filter.entity_instance_id ;
|
||||
size_t pos_hn = entity.find(HOSTNAME_LABEL);
|
||||
size_t pos_pn = entity.find(PROCNAME_LABEL);
|
||||
|
||||
if (( pos_hn != std::string::npos ) &&
|
||||
( pos_pn != std::string::npos ))
|
||||
{
|
||||
if ( !hostname.empty() )
|
||||
string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL));
|
||||
string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
|
||||
|
||||
/* verify hostname */
|
||||
if ( ( hn.length() == 0 ) || ( hn != hostname ) )
|
||||
{
|
||||
pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
|
||||
/* ignore alarms not for this host */
|
||||
dlog ("%s %s %s alarm not for this host",
|
||||
entity.c_str(),
|
||||
hn.c_str(),
|
||||
pn.c_str());
|
||||
continue ;
|
||||
}
|
||||
else
|
||||
dlog ("%s alarm is %s (process:%s)\n",
|
||||
alarm_filter.entity_instance_id,
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
pn.c_str());
|
||||
|
||||
/* filter out 'process=pmond'
|
||||
* ... that alarm is handled by hbsAgent */
|
||||
if ( pn != MTC_SERVICE_PMOND_NAME )
|
||||
{
|
||||
active_process_alarms_type this_alarm ;
|
||||
this_alarm.process = pn ;
|
||||
@ -163,10 +185,11 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("fm_get_fault failed (rc:%d)\n", rc );
|
||||
wlog ("fm_get_fault failed (rc:%d)\n", rc );
|
||||
rc = FAIL_OPERATION ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -174,10 +197,21 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
|
||||
dlog2 ("last entry %d\n", i);
|
||||
break ;
|
||||
}
|
||||
} /* for loop */
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog("failed to query alarms from fm ; rc:%d", rc);
|
||||
rc = FAIL_REQUEST ;
|
||||
}
|
||||
free(alarm_list_ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("unable to allocate memory for alarm list");
|
||||
rc = FAIL_NULL_POINTER ;
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/************************* A L A R M I N G **************************/
|
||||
|
@ -37,8 +37,10 @@ typedef struct
|
||||
EFmAlarmSeverityT severity ;
|
||||
} active_process_alarms_type ;
|
||||
|
||||
/* Clear any pending alarms if the specified hostname is valid */
|
||||
void manage_queried_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
|
||||
/* Query FM for a list of Process Monitor (200.006) alarms */
|
||||
int query_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
|
||||
|
||||
void alarmed_process_audit ( void );
|
||||
|
||||
void pmonAlarm_init ( void );
|
||||
|
||||
|
@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
|
||||
std::list<string> config_files ;
|
||||
std::list<string>::iterator string_iter_ptr ;
|
||||
|
||||
/* If there is an alarm in the list that matches one in the process list
|
||||
* then update that process with its severity and failed state.
|
||||
* If there is a process in the saved list that is not in the process list
|
||||
* then clear its alarm as it is no longer valid.
|
||||
*/
|
||||
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
||||
process_config_type * const ptr,
|
||||
int const processes );
|
||||
|
||||
static process_config_type process_config[MAX_PROCESSES] ;
|
||||
|
||||
/* lookup process control by index and return its pointer if found.
|
||||
@ -216,6 +207,7 @@ void pmon_timer_init ( void )
|
||||
/* Init the timer for this process */
|
||||
mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
|
||||
}
|
||||
_pmon_ctrl_ptr->last_alarm_query_pass = false ;
|
||||
}
|
||||
|
||||
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
|
||||
@ -371,7 +363,7 @@ void init_process_config_memory ( void )
|
||||
* all the process config files from /etc/pmon.d */
|
||||
void load_processes ( void )
|
||||
{
|
||||
list<active_process_alarms_type> saved_alarm_list ;
|
||||
list<active_process_alarms_type> queried_alarm_list ;
|
||||
|
||||
int rc = PASS ;
|
||||
|
||||
@ -385,10 +377,6 @@ void load_processes ( void )
|
||||
close_process_socket ( &process_config[i] );
|
||||
}
|
||||
|
||||
/* Query fm for existing pmon process alarms and
|
||||
* for each that is found store their 'name' and
|
||||
* 'severity' in the passed in saved list */
|
||||
manage_queried_alarms ( saved_alarm_list );
|
||||
|
||||
/* init the process config memory */
|
||||
init_process_config_memory ();
|
||||
@ -454,13 +442,8 @@ void load_processes ( void )
|
||||
}
|
||||
_pmon_ctrl_ptr->reload_config = false ;
|
||||
|
||||
/* If there were process alarms that existed over the reload
|
||||
* then ensure that those processes are updated with that information. */
|
||||
if ( saved_alarm_list.size () )
|
||||
{
|
||||
ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
|
||||
manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
|
||||
}
|
||||
/* use the audit to clear pre-existing alarms at process startup */
|
||||
alarmed_process_audit ();
|
||||
}
|
||||
|
||||
|
||||
@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
|
||||
}
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
/***************************************************************************
|
||||
*
|
||||
* Name : manage_process_alarms
|
||||
* Name : alarmed_process_audit
|
||||
*
|
||||
* Description: This interface manages process alarms over a process
|
||||
* configuration reload
|
||||
* Purpose : Verify the process state matches the queried alarm state
|
||||
*
|
||||
* Steps:
|
||||
* Description: To correct process alarm state mismatches.
|
||||
*
|
||||
* 1. Loop over each item in the list and mark the process as failed
|
||||
* with the specified severity level.
|
||||
*
|
||||
* 2. If the process is not found then clear its alarm as it is no
|
||||
* longer a valid process in the new profile and we don't want a
|
||||
* lingering stuck alarm.
|
||||
*
|
||||
*************************************************************************/
|
||||
***************************************************************************/
|
||||
|
||||
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
||||
process_config_type * const ptr,
|
||||
int const processes )
|
||||
void alarmed_process_audit ( void )
|
||||
{
|
||||
/* get out if the list is empty ; should not have been called if
|
||||
* empty but ... just in case */
|
||||
if ( ! _list.empty() )
|
||||
/* Don't audit FM in service after the last query was successful.
|
||||
* There is a blocking issue that needs to be dealt with */
|
||||
if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
|
||||
return ;
|
||||
|
||||
/*
|
||||
* Query fm for existing pmon process alarms and
|
||||
* for each that is found store their 'name' and
|
||||
* 'severity' in the passed in queried_alarm_list.
|
||||
*/
|
||||
list<active_process_alarms_type> queried_alarm_list ;
|
||||
int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
|
||||
_pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
|
||||
|
||||
/* just return if query failed */
|
||||
if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
|
||||
return ;
|
||||
|
||||
if ( queried_alarm_list.size () )
|
||||
{
|
||||
list<active_process_alarms_type>::iterator _iter_ptr ;
|
||||
|
||||
alog ("audit found %ld active alarms", queried_alarm_list.size());
|
||||
|
||||
/* loop over the list ... */
|
||||
for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
|
||||
for ( _iter_ptr=queried_alarm_list.begin();
|
||||
_iter_ptr!=queried_alarm_list.end();
|
||||
++_iter_ptr )
|
||||
{
|
||||
/* for each item assum it is not found */
|
||||
bool found = false ;
|
||||
alog ("%s audit", _iter_ptr->process.c_str());
|
||||
|
||||
/* try and find this process in the new process profile */
|
||||
for ( int i = 0 ; i < processes ; i++ )
|
||||
/* find this process*/
|
||||
for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
|
||||
{
|
||||
if ( ! _iter_ptr->process.compare((ptr+i)->process) )
|
||||
process_config_type * ptr = &process_config[i];
|
||||
|
||||
if ( ! _iter_ptr->process.compare(ptr->process) )
|
||||
{
|
||||
/* If the process is found then mark it as failed and update its severity.
|
||||
* At this point we then assume that there is an alarm raised for this process. */
|
||||
found = true ;
|
||||
if ( ptr->failed == false )
|
||||
{
|
||||
ilog ("%s stale alarm ; clearing",
|
||||
_iter_ptr->process.c_str() );
|
||||
|
||||
(ptr+i)->failed = false ;
|
||||
wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
_iter_ptr->process );
|
||||
}
|
||||
else if ( _iter_ptr->severity != ptr->alarm_severity )
|
||||
{
|
||||
wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
|
||||
ptr->process,
|
||||
alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
|
||||
alarmUtil_getSev_str(ptr->alarm_severity).c_str());
|
||||
if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
|
||||
{
|
||||
pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process, 0);
|
||||
}
|
||||
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
|
||||
{
|
||||
pmonAlarm_major(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process);
|
||||
}
|
||||
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process);
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s unexpected severity '%s' ; clearing alarm",
|
||||
ptr->process,
|
||||
ptr->severity);
|
||||
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
alog ("%s is alarmed '%s' ; audit",
|
||||
ptr->process,
|
||||
ptr->severity);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if not found then just clear the alarm */
|
||||
if ( found == false)
|
||||
{
|
||||
wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
||||
wlog ("%s is not a monitored process ; clearing alarm",
|
||||
_iter_ptr->process.c_str());
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
_iter_ptr->process );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
{
|
||||
std::list<int> socks ;
|
||||
@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
{
|
||||
_get_events ();
|
||||
mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
|
||||
|
||||
alarmed_process_audit ();
|
||||
}
|
||||
|
||||
/* Run the degrade set/clear by audit */
|
||||
|
Loading…
Reference in New Issue
Block a user