#ifndef __INCLUDE_NODECLASS_H__ #define __INCLUDE_NODECLASS_H__ /* * Copyright (c) 2013-2016 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * */ /** * @file * Wind River CGTS Platform Node Maintenance "Node Manager" * class, support structs and enums. */ #include #include #include #include #include #include #define WANT_MTC #define WANT_HBS using namespace std; /* Include base class definition header */ #include "nodeBase.h" #include "hostUtil.h" /* for ... server_code and others */ #include "nodeTimers.h" #include "threadUtil.h" /* for ... thread_info_type thread_ctrl_type*/ #include "pingUtil.h" /* for ... ping_info_type */ #include "nodeCmds.h" /* for ... mtcCmd type */ #include "httpUtil.h" /* for ... libevent stuff */ #include "ipmiUtil.h" /* for ... mc_info_type */ #include "mtcHttpUtil.h" /* for ... libevent stuff */ #include "mtcSmgrApi.h" /* */ #include "alarmUtil.h" /* for ... SFmAlarmDataT */ #include "mtcAlarm.h" /* for ... MTC_ALARM_ID__xx and utils */ #include "mtcThreads.h" /* for ... 
mtcThread_ipmitool */ /**Default back-to-back heartbeat failures for disabled-failed condition */ #define HBS_FAILURE_THRESHOLD 10 /** Default back-to-back heartbeat failures for enabled-degraded condition */ #define HBS_DEGRADE_THRESHOLD 6 /** Default back-to-back heartbeat failures for enabled-degraded condition */ #define HBS_MINOR_THRESHOLD 4 /** If Debug, this number of missed heartbeats in a row creates a info log */ #define HBS_DBG_LOG_THRESHOLD 1 /** Clear (reset) heartbeat counter value */ #define HBS_CLEAR_COUNT 0 #ifdef SIMPLEX #undef SIMPLEX #endif #define SIMPLEX \ ( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true ) #define THIS_HOST \ ( node_ptr->hostname == this->my_hostname ) #define NOT_THIS_HOST \ ( node_ptr->hostname != this->my_hostname ) #define LARGE_SYSTEM \ ( this->system_type == SYSTEM_TYPE__NORMAL ) #define CPE_SYSTEM \ ( this->system_type != SYSTEM_TYPE__NORMAL ) #define SIMPLEX_CPE_SYSTEM \ ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) /** * @addtogroup nodeLinkClass * @{ * * This class is used to maintain a linked list of nodes that * represent currently provisioned inventory. Its member * functions and data members along with the support files * in maintenance and heartbeat feature directories blend * to create a Higly Available and Reuseable Maintenance system. */ class nodeLinkClass { private: /** A single node entity within the nodeLinkClass that can * be spliced in or out of a node linked list */ struct node { /** * @addtogroup private_Node_variables * @{ * * A set of variables that make up a node including linking members. 
*/ /** The name of the host node */ std::string uuid ; /** The name of the host node */ std::string hostname ; /** The IP address of the host node */ std::string ip ; /** The Mac address of the host node */ std::string mac ; /** The infrastructure network IP address of the host node */ std::string infra_ip ; /** The Mac address of the host's infra interface */ std::string infra_mac ; /** The type of node 'controller' or 'compute' node */ std::string type ; /** Short text phrase indicating the operation the FSM is * taking on this host */ std::string task ; /** Administrative action from inventory */ std::string action ; /** The Node Type ; compute or control or storage as a mask */ string functions ; /* comma delimited string of host types */ unsigned int nodetype ; /* numeric mask of functions */ string function_str ; /* single host type string representing the main function of the host */ unsigned int function ; /* numeric representing function_str */ string subfunction_str ; /* single host type string ie "compute" */ unsigned int subfunction ; /* numeric representing subfunction_str */ /** set to true if the host specific sub function enable handler passes */ bool subf_enabled ; /** set true if the BMC is provisioned */ bool bm_provisioned ; /** general retry counter */ int retries ; /** number of http rest API retries since last clear */ int http_retries_cur ; /* Command handler retries counter */ int cmd_retries ; /* Retry counter for power actions (on/off)*/ int power_action_retries ; /** Generic toggle switch */ bool toggle ; /** back to back health failure counter */ int health_threshold_counter ; int mtce_flags ; /* true if this node is patching */ bool patching ; /* true if this node is patched but not reset */ bool patched ; /** The node's reported uptime */ unsigned int uptime ; unsigned int uptime_save ; /** Set to true once the host's add FSM is done */ bool add_completed ; int uptime_refresh_counter ; /** Counts the number of times this node was 
unlocked. * NOTE: This value should be stored in the database. * so that it is not reset to 0 on every swact. */ int node_unlocked_counter ; int mtcalive_timeout ; /* start host service retry controls */ int start_services_retries ; bool start_services_running_main ; bool start_services_running_subf ; bool start_services_needed ; bool start_services_needed_subf ; /* for the add handler that defers start to the inservice test handler. this provides a means of telling maintenance that the subfunction start needs to also be run. */ /** Pointer to the previous node in the list */ struct node *prev; /** Pointer to the next node in the list */ struct node *next; /** @} private_Node_variables */ /** @addtogroup private_Maintenance_variables * @{ * * Finite State Machine variables and member functions * for 'this' host/node * * The CGTS Maintenacne syste follows the X.731 maintenance model * which uses the states below; For full list of states please * refer to nodeBase.h * * A brief summary is (host and node are used inter-changably) * * Administrative Action: Actions a user may take on a host at the user * interface ; i.e. Lock, Unlock, Reset, Reinstall * * Administrative State : The state a host enters into when the above * actions are taken ; i.e. Locked or Unlocked. * * Operational State : The operating state of the node based on the * administrative actions ; Enabled or Disabled. * * Availability State : The useability state of a host based on the * two previous states and events that may occur * over time ; i.e. available, failed, degraded, * intest. 
*/ mtc_nodeAdminAction_enum adminAction ; /**< Administrative Action */ list adminAction_todo_list ; /**< Administrative Action */ mtc_nodeAdminState_enum adminState ; /**< Administrative State */ mtc_nodeOperState_enum operState ; /**< Operational State */ mtc_nodeAvailStatus_enum availStatus ; /**< Availability Status */ mtc_nodeConfigAction_enum configAction; /**< Configuration Action */ mtc_nodeOperState_enum operState_subf ; /**< Subfunction Operational State */ mtc_nodeAvailStatus_enum availStatus_subf ; /**< Subfunction Availability Status */ mtc_nodeOperState_enum operState_dport ; /**< Data Port Operational State */ mtc_nodeAvailStatus_enum availStatus_dport; /**< Data Port Availability Status */ /** Maintains the current handler stage. * This is a union of all handler types such as enable, * disable, degrade etc. See nodeBase.h for list of union members */ mtc_stages_union handlerStage; /* Individual FSM handler stages */ mtc_offlineStages_enum offlineStage ; mtc_onlineStages_enum onlineStage ; mtc_swactStages_enum swactStage ; mtc_addStages_enum addStage ; mtc_delStages_enum delStage ; mtc_recoveryStages_enum recoveryStage ; mtc_oosTestStages_enum oosTestStage ; mtc_insvTestStages_enum insvTestStage ; mtc_configStages_enum configStage ; mtc_resetProgStages_enum resetProgStage ; mtc_reinstallStages_enum reinstallStage ; /** Board management specific FSM Stages */ mtc_powerStages_enum powerStage ; mtc_powercycleStages_enum powercycleStage ; mtc_subStages_enum subStage ; mtc_resetStages_enum resetStage ; mtc_sensorStages_enum sensorStage ; /** This gate is used to block mtcAlive messages from reaching * the state machine until its ready to receive them. * * Issue: The mtcClient on a slave host will continuously send the * mtcAlive 'I'm here' messages after a reboot and until that message * is acknowledged. This is done to make the recovery of a host more * robust in a potentially lossy network. 
Without this, a single * dropped mtcAlive message could result in an unlock-enable timeout * which would lead to a disabled-failed state and re-recovery attempt * after a recovery timeout (mtcTimers.h:HOST_MTCALIVE_TIMEOUT) * period. Besides the system administrator seeing a disabled-failed * condition the customer would realize a longer than nessary outage * of that host. * * Fix: By having the mtcClient repeatedly send the mtcAlive message * on reset recovery until it is acknowledged by active mtcAgent * prevents the above issue. However it has a side affect on the * maintenance FSM for that host. This mtcAlive gate prevents * the state machine from seeing mtcAlive messages when it does not * care about them. */ bool mtcAlive_gate ; int mtcAlive_count ; int mtcAlive_misses ; int mtcAlive_hits ; int mtcAlive_purge ; bool mtcAlive_mgmnt ; /* set true when mtcAlive is rx'd from mgmnt network */ bool mtcAlive_infra ; /* set true when mtcAlive is rx'd from infra network */ /* Both of these booleans are set true upon receipt of a mtcAlive message. */ bool mtcAlive_online ; /* this is consumed by online and offline handler */ bool mtcAlive_offline ; /* this is consumed by reset progression handler */ int offline_search_count ; /* count back-2-back mtcAlive request misses */ bool offline_log_reported ; /* prevents offline/online log flooding when */ bool online_log_reported ; /* availStatus switches between these states */ /* and failed */ /** Host's mtc timer struct. Use to time handler stages. 
* * reset -> reset command response * reboot -> then wait for mtcalive message * mtcalive -> then wait for go enabled message */ struct mtc_timer mtcAlive_timer ; /* the fault handling offline handler timer */ struct mtc_timer offline_timer ; /* Host level DOR recovery mode time and bools */ int dor_recovery_time ; bool dor_recovery_mode ; bool was_dor_recovery_mode ; /** Integer code representing the host health */ int health ; /** Flag indicating that the unknown health state * has already been reported */ bool unknown_health_reported ; /* Booleans indicating the main or subfunction has config failure */ bool config_failed ; bool config_failed_subf ; /* Booleans indicating the main or subfunction has passed the OOS test */ bool goEnabled ; bool goEnabled_subf ; /* Booleans indicating the main or subfunction has failed the OOS test */ bool goEnabled_failed ; bool goEnabled_failed_subf ; /* Boolean indicating the main or subfunction has start host services * failure. */ bool hostservices_failed ; bool hostservices_failed_subf ; /* Boolean indicating the main or subfunction has inservice failure */ bool inservice_failed ; bool inservice_failed_subf ; /** node has reached enabled state this number of times */ bool enabled_count ; /** Number of OOS tests run so far */ int oos_test_count ; /** Number of INSV tests run so far */ int insv_test_count ; /** Used to throttle inservice recovery actions */ int insv_recovery_counter ; /** when true requests the task for this host be cleared at first opportunity */ bool clear_task ; /** Host's mtc timer struct. Use to time handler stages. 
* * reset -> reset command response * reboot -> then wait for mtcalive message * mtcalive -> then wait for go enabled message */ struct mtc_timer mtcTimer ; struct mtc_timer http_timer ; struct mtc_timer mtcCmd_timer ; struct mtc_timer oosTestTimer ; struct mtc_timer insvTestTimer ; struct mtc_timer mtcSwact_timer ; struct mtc_timer mtcConfig_timer ; struct mtc_timer power_timer ; struct mtc_timer host_services_timer ; mtcCmd host_services_req ; mtcCmd mtcAlive_req ; mtcCmd reboot_req ; mtcCmd general_req ; /* String that is used in the command handling logs which represents * the specific command handling that is in progress */ string cmdName ; /** Indicates presence of a command request */ unsigned int cmdReq ; /** Indicates presence of a command response */ unsigned int cmdRsp; /** Indicates acknowledgement of the initial host * services command in execution monitoroing mode */ unsigned int cmdAck; /** Command Response Status - Execution Status */ unsigned int cmdRsp_status ; /** Command Response Data - typically an error details string */ string cmdRsp_status_string ; bool reboot_cmd_ack_mgmnt ; bool reboot_cmd_ack_infra ; /** Tracks back to back Fast Fault Recovery counts */ int graceful_recovery_counter; /** Reboot acknowledge */ mtc_client_enum activeClient ; /** @} private_Maintenance_variables */ /** * @addtogroup private_libEvent_structs * @{ * * libEvent structures used to issue libEvent * HTTP REST API Requests to control this host * based on each service */ libEvent sysinvEvent; /**< Sysinv REST API Handling for host */ libEvent cfgEvent; /**< Sysinv REST API Handling for config changes */ libEvent vimEvent ; /**< VIM Event REST API Handling */ libEvent httpReq ; /**< Http libEvent Request Handling */ libEvent thisReq ; /**< Http libEvent Request Handling */ list libEvent_work_fifo ; list::iterator libEvent_work_fifo_ptr; list libEvent_done_fifo ; list::iterator libEvent_done_fifo_ptr; // bool work_ready ; int oper_sequence ; int oper_failures ; int 
no_work_log_throttle ; int log_throttle ; /* List of queue'ed mtce commands for this host */ mtcCmd cmd; list mtcCmd_work_fifo ; list::iterator mtcCmd_work_fifo_ptr; list mtcCmd_done_fifo ; list::iterator mtcCmd_done_fifo_ptr; /** @} private_libEvent_structs and utils */ /** * @addtogroup private_Heartbeat_variables * @{ * * A grouping a of private variables at the node level used to * control if a node is to be monitored, the monitoring failure * counts and next / previous pointers used to create the * monitored node pulse linked list */ /** Set 'true' when node minor threshold has exceeded */ bool hbs_minor[MAX_IFACES] ; /** Set 'true' when node is degraded due to back to back heartbeat pulse * misses tha exceed the major threshold */ bool hbs_degrade[MAX_IFACES] ; /** Set 'true' when node is failed due to back to back heartbeat pulse * misses that exceed the critical threshold */ bool hbs_failure[MAX_IFACES] ; /** log throttle controls for heartbeat service */ int stall_recovery_log_throttle ; int stall_monitor_log_throttle ; int lookup_mismatch_log_throttle ; int unexpected_pulse_log_throttle ; /** Pulse Next and Previous Link pointers for creating * a per-interface pulse link list */ struct { /** previous pulse pointer used to create the pulse linked list for one interface */ struct node * prev_ptr ; /** next pulse pointer used to create the pulse linked list for one interface */ struct node * next_ptr ; } pulse_link [MAX_IFACES] ; /** The link index number for this node is while in an interface pulse linked list */ int linknum [MAX_IFACES] ; /** true if this host is to be monitored for this indexed interface */ bool monitor [MAX_IFACES] ; /** Ongoing heartbeat count cleared on HBS_START reset */ int hbs_count [MAX_IFACES] ; /** Immediate running count of consecutive heartbeat misses */ int b2b_misses_count [MAX_IFACES]; /** Maximum heartbeat misses since node was last brought into service */ int max_count [MAX_IFACES]; /** total times minor count was exceeded 
*/ int hbs_minor_count [MAX_IFACES]; /** total times this host degraded due to heartbeat misses */ int hbs_degrade_count [MAX_IFACES]; /** total times this host failed due to heartbeat loss */ int hbs_failure_count [MAX_IFACES]; /** current state of heartbeat failure per interface for mtcAgent */ bool heartbeat_failed [MAX_IFACES]; /** Resource reference identifier, aka resource reference array index */ int rri ; /** @} private_Heartbeat_variables */ /** * @addtogroup private_boad_management_variables * @{ * * Various host specific board management variables. */ /** The IP address of the host's board management controller */ string bm_ip ; /** The password of the host's board management controller */ string bm_pw ; /** A string label that represents the board management * controller type for this host */ string bm_type ; /** The operator provisioned board management hostname */ string bm_un ; /* Indicates there is a board management test * for this host in progress */ bool bm_test_in_progress ; /* Indicates there is a board management operation * in progress on this host */ bool bm_oper_in_progress ; /** * The BMC is 'accessible' once provisioning data is available * and bmc is verified pingable. **/ bool bm_accessible; /** @} private_boad_management_variables */ /** * @addtogroup private_monitoring_services_variables * @{ * * A grouping a of flags, mask and degrade resource lists * used to manage the degrade state of a host for process * and resource monitoring services. 
*/ /* Bit mask of degrade reasons */ unsigned int degrade_mask ; /** Process Monitor Daemon Flag Missing count */ int pmon_missing_count ; /** Host degraded due to loss of Process Monitor running flag */ bool pmon_degraded ; /** Process Monitor Ready flag and degrade list */ bool pmond_ready ; /** Hardware Monitor Ready flag and degrade list */ bool hwmond_ready ; bool hwmond_monitor ; /** Heartbeat Client process ready to heartbeat flag */ bool hbsClient_ready ; /** hwmon reset and powercycle recovery control structure */ recovery_ctrl_type hwmon_reset ; recovery_ctrl_type hwmon_powercycle ; /** Resource Monitor Daemon Flag Missing count */ int rmond_missing_count ; /** Host degraded due to loss of Resource Monitor running flag */ bool rmond_degraded ; /** Resource Monitor Ready flag and degrade list */ bool rmond_ready ; std::list degraded_resources_list ; /** process or resource list string iterator */ std::list::iterator string_iter_ptr ; /** @} private_monitoring_services_variables */ /* List of alarms and current severity */ EFmAlarmSeverityT alarms[MAX_ALARMS]; /* tracks whether the alarms for this host have been loaded already or not */ bool alarms_loaded ; /** true if this host has recovered before the mnfa timeout period. * This bool flags the graceful recovery handler that this node * is recovering from mnfa and should manage graceful recovery * and uptime accordingly */ bool mnfa_graceful_recovery ; int stress_iteration ; /* for bmc ping access monitor */ ping_info_type bm_ping_info ; /* the bmc info struct filled in and log printed by a * call to ipmiUtil_mc_info_load. 
*/ mc_info_type mc_info ; bool mc_info_query_active ; bool mc_info_query_done ; bool reset_cause_query_active ; bool reset_cause_query_done ; bool power_status_query_active ; bool power_status_query_done ; bool power_on = false ; /* a timer used in the bm_handler to query * the mc_info and reset cause */ struct mtc_timer bm_timer ; /* timer used to manage the bmc access alarm */ struct mtc_timer bmc_access_timer ; /***************************************************** * Maintenance Thread Structs *****************************************************/ /* control data the parent uses to manage the thread */ thread_ctrl_type ipmitool_thread_ctrl ; /*info the thread uses to execute and post results */ thread_info_type ipmitool_thread_info ; /* extra thread info for board management control thread */ thread_extra_info_type thread_extra_info ; }; struct node * head ; /**< Node Linked List Head pointer */ struct node * tail ; /**< Node Linked List Tail pointer */ /** Allocate memory for a new node. * * Preserves the node address in the node_ptr list and increments * the memory_allocs counter used by the inservice test audit. * * @return * a pointer to the memory of the newly allocated node */ struct nodeLinkClass::node * newNode ( void ); /** Build the Resource Reference Array */ void build_rra ( void ); /** Free the memory used by a node. * * The memory to be removed is found in the node_ptr list, cleared and * the memory_allocs counter is decremented. * If the memory cannot be found then an error is returned. * * @param node_ptr * is a pointer to the node to be freed * @return * a signed integer of PASS or -EINVAL */ int delNode ( struct nodeLinkClass::node * node_ptr ); /** Start heartbeating a new node. * * Node is added to the end of the node linked list. 
*
     * @param hostname
     * the hostname that identifies the node to add
     * @return
     * a pointer to the newly added node */
    struct nodeLinkClass::node* addNode ( string hostname );

    /** Add-by-uuid companion of addNode.
     *
     * NOTE(review): undocumented in this header; presumably adds or looks up
     * a node keyed by uuid - confirm against the implementation.
     *
     * @param uuid
     * the uuid that identifies the node
     * @return
     * a node pointer */
    struct nodeLinkClass::node* addUuid ( string uuid );

    /** Stop heartbeating a node.
     *
     * Node is spliced out of the node linked list.
     *
     * @param hostname
     * the hostname used to find the node in the node list
     * @return
     * an integer of PASS or -EINVAL */
    int remNode ( string hostname );

    /** Get pointer to "hostname" node.
     *
     * Node list lookup by pointer from hostname.
     *
     * @param hostname
     * the hostname used to find the node in the node list
     * @return
     * a pointer to the hostname's node */
    struct nodeLinkClass::node* getNode ( string hostname );

    /** Get the node pointer based on the service and libevent base pointer.
     *
     * Node list lookup by pointer service and libevent base pointer.
     *
     * @param service
     * service type
     * @param base_ptr
     * pointer to the libEvent base
     *
     * @return
     * a pointer to the hostname's node */
    struct nodeLinkClass::node* getEventBaseNode ( libEvent_enum service, struct event_base * base_ptr);

    /** Get a reference to the libEvent containing the supplied
     * libEvent.base pointer.
*
     * @param base_ptr
     * pointer to the libEvent base
     *
     * @return
     * reference to valid or null libEvent */
    libEvent & getEvent ( struct event_base * base_ptr);

    /* NOTE(review): undocumented in this header; the name suggests it
     * maintains dnsmasq host entries for the node's BMC - confirm against
     * the implementation. */
    int manage_dnsmasq_bmc_hosts ( struct nodeLinkClass::node * node_ptr );

    /* run the maintenance fsm against a host */
    int fsm ( struct nodeLinkClass::node * node_ptr );

    /* specific handlers called within the fsm.
     * Each takes the node to operate on and returns an integer status;
     * uptime_handler is the only one that takes no node pointer. */
    int enable_handler ( struct nodeLinkClass::node * node_ptr );
    int recovery_handler ( struct nodeLinkClass::node * node_ptr );
    int disable_handler ( struct nodeLinkClass::node * node_ptr );
    int add_handler ( struct nodeLinkClass::node * node_ptr );
    int delete_handler ( struct nodeLinkClass::node * node_ptr );
    int cfg_handler ( struct nodeLinkClass::node * node_ptr );
    int cmd_handler ( struct nodeLinkClass::node * node_ptr );
    int swact_handler ( struct nodeLinkClass::node * node_ptr );
    int reset_handler ( struct nodeLinkClass::node * node_ptr );
    int reboot_handler ( struct nodeLinkClass::node * node_ptr );
    int reinstall_handler ( struct nodeLinkClass::node * node_ptr );
    int power_handler ( struct nodeLinkClass::node * node_ptr );
    int powercycle_handler ( struct nodeLinkClass::node * node_ptr );
    int offline_handler ( struct nodeLinkClass::node * node_ptr );
    int online_handler ( struct nodeLinkClass::node * node_ptr );
    int oos_test_handler ( struct nodeLinkClass::node * node_ptr );
    int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
    int stress_handler ( struct nodeLinkClass::node * node_ptr );
    int bm_handler ( struct nodeLinkClass::node * node_ptr );
    int uptime_handler ( void );
    int host_services_handler ( struct nodeLinkClass::node * node_ptr );

    /* Starts the specified 'reset or powercycle' recovery monitor */
    int hwmon_recovery_monitor ( struct nodeLinkClass::node * node_ptr, int hwmon_event );

    /* server specific power state query handler.
     * This is a function pointer member rather than a member function. */
    bool (*is_poweron_handler) (string hostname, string query_response );

    /* Calculate the overall reset progression timeout */
    int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );

    /* These interfaces will start and stop the offline FSM if not already active */
    void start_offline_handler ( struct nodeLinkClass::node * node_ptr );
    void stop_offline_handler ( struct nodeLinkClass::node * node_ptr );

    /*****************************************************************************
     *
     * Name       : ipmi_command_send
     *
     * Description: This utility starts the ipmitool command handling thread
     *              with the specified command.
     *
     * Returns    : PASS if all the pre-start semantic checks pass and the
     *              thread was started.
     *
     *              Otherwise the thread was not started and some non zero
     *              FAIL_xxxx code is returned after a representative design
     *              log is generated.
     *
     *****************************************************************************/
    int ipmi_command_send ( struct nodeLinkClass::node * node_ptr, int command ) ;

    /*****************************************************************************
     *
     * Name       : ipmi_command_recv
     *
     * Description: This utility will check for ipmitool command thread completion.
     *
     * Returns    : PASS is returned if the thread reports done.
     *              RETRY is returned if the thread has not completed.
     *              FAIL_RETRY is returned after 10 back-to-back calls return RETRY.
     *
     * Assumptions: The caller is expected to call ipmi_command_done once it has
     *              consumed the results of the thread
     *
     *****************************************************************************/
    int ipmi_command_recv ( struct nodeLinkClass::node * node_ptr );

    /*****************************************************************************
     *
     * Name       : ipmi_command_done
     *
     * Description: This utility frees the ipmitool command thread for next execution.
*
     *****************************************************************************/
    void ipmi_command_done ( struct nodeLinkClass::node * node_ptr );

    /* default all the BMC access variables to the "no access" state */
    void bmc_access_data_init ( struct nodeLinkClass::node * node_ptr );

    /* Combo Host enable handler */
    int enable_subf_handler ( struct nodeLinkClass::node * node_ptr );

    /** set all service readies to false so that when the first one comes in
     *  it will be logged */
    void clear_service_readies ( struct nodeLinkClass::node * node_ptr );

    /* NOTE(review): undocumented in this header; presumably updates the
     * node's data port states for the given event - confirm against the
     * implementation. */
    int update_dport_states ( struct nodeLinkClass::node * node_ptr, int event );

    /* manage deciding to return or issue an immediate reboot if the
     * auto recovery threshold is exceeded. */
    void manage_autorecovery ( struct nodeLinkClass::node * node_ptr );

    /** ***********************************************************************
     *
     * Name       : nodeLinkClass::workQueue_process
     *
     * Description: This is a Per Host Finite State Machine (FSM) that
     *              processes the work queue for the supplied host's
     *              node pointer.
     *
     * Constructs:
     *
     * node_ptr->libEvent_work_fifo - the current work queue/fifo
     * node_ptr->libEvent_done_fifo - queue/fifo of completed requests
     *
     * Operations:
     *
     * requests are added to the libEvent_work_fifo with workQueue_enqueue.
     * requests are removed from the libEvent_done_fifo with workQueue_dequeue.
     *
     * Behavior:
     *
     * In process libEvents are copied from the callers work queue to
     * its thisReq.
     *
     * Completed events including execution status are copied to the host's
     * done fifo.
     *
     * Failed events may be retried up to max_retries as specified by
     * the callers libEvent.
     *
     * @param node_ptr is a pointer to the host whose work queue is processed.
     *
     * @return an integer with values of PASS, FAIL, RETRY
     *
     * Implementation: in maintenance/mtcWorkQueue.cpp
     *
     * ************************************************************************/
    int workQueue_process ( struct nodeLinkClass::node * node_ptr );

    /** ***********************************************************************
     *
     * Name       : nodeLinkClass::workQueue_del_cmd
     *
     * Description: To handle the pathological case where an event seems to
     *              have timed out at the callers level then this interface
     *              can be called to delete it from the work queue.
     *
     * @param node_ptr so that the hosts work queue can be found
     * @param sequence to specify the specific sequence number to remove
     * @return always PASS since there is nothing the caller can or needs
     *         to do if the command is not present.
     *
     * Implementation: in maintenance/mtcWorkQueue.cpp
     *
     */
    int workQueue_del_cmd ( struct nodeLinkClass::node * node_ptr, int sequence );

    /* Per-host libEvent work/done queue utilities.
     * NOTE(review): undocumented in this header; behavior inferred from the
     * names only (purge / done / dump) - confirm in mtcWorkQueue.cpp. */
    int doneQueue_purge ( struct nodeLinkClass::node * node_ptr );
    int workQueue_purge ( struct nodeLinkClass::node * node_ptr );
    int workQueue_done ( struct nodeLinkClass::node * node_ptr );
    void workQueue_dump ( struct nodeLinkClass::node * node_ptr );
    void doneQueue_dump ( struct nodeLinkClass::node * node_ptr );

    /* Per-host mtcCmd work/done queue utilities.
     * NOTE(review): undocumented in this header; behavior inferred from the
     * names only (purge / dump) - confirm against the implementation. */
    int mtcCmd_workQ_purge( struct nodeLinkClass::node * node_ptr );
    int mtcCmd_doneQ_purge( struct nodeLinkClass::node * node_ptr );
    void mtcCmd_workQ_dump ( struct nodeLinkClass::node * node_ptr );
    void mtcCmd_doneQ_dump ( struct nodeLinkClass::node * node_ptr );

    /* NOTE(review): undocumented in this header - confirm intent against
     * the implementation. */
    void force_full_enable ( struct nodeLinkClass::node * node_ptr );

    /** Host Administrative Action Change member function */
    int adminActionChange ( struct nodeLinkClass::node * node_ptr,
                            mtc_nodeAdminAction_enum newActionState );

    /** Host Administrative State Change member function */
    int adminStateChange ( struct nodeLinkClass::node * node_ptr,
                           mtc_nodeAdminState_enum newAdminState );

    /** Host Operational State Change member function */
    int operStateChange ( struct nodeLinkClass::node * node_ptr,
                          mtc_nodeOperState_enum newOperState );

    /** Host
Availability Status Change member function */ int availStatusChange ( struct nodeLinkClass::node * node_ptr, mtc_nodeAvailStatus_enum newAvailStatus ); int allStateChange ( struct nodeLinkClass::node * node_ptr, mtc_nodeAdminState_enum adminState, mtc_nodeOperState_enum operState, mtc_nodeAvailStatus_enum availStatus ); int subfStateChange ( struct nodeLinkClass::node * node_ptr, mtc_nodeOperState_enum operState_subf, mtc_nodeAvailStatus_enum availStatus_subf ); /** Host Enable Handler Stage Change member function */ int enableStageChange ( struct nodeLinkClass::node * node_ptr, mtc_enableStages_enum newHdlrStage ); /** Host Disable Handler Stage Change member function */ int disableStageChange ( struct nodeLinkClass::node * node_ptr, mtc_disableStages_enum newHdlrStage ); /** Host configuration stage Change member function */ int configStageChange ( struct nodeLinkClass::node * node_ptr, mtc_configStages_enum newHdlrStage ); /** Host Reset Handler Stage Change member function */ int resetStageChange ( struct nodeLinkClass::node * node_ptr, mtc_resetStages_enum newHdlrStage ); /** Host Reinstall Handler Stage Change member function */ int reinstallStageChange ( struct nodeLinkClass::node * node_ptr, mtc_reinstallStages_enum newHdlrStage ); /** Host Fast graceful Recovery Handler Stage Change member function */ int recoveryStageChange ( struct nodeLinkClass::node * node_ptr, mtc_recoveryStages_enum newHdlrStage ); /** Host Power control Handler Stage Change member function */ int powerStageChange ( struct nodeLinkClass::node * node_ptr, mtc_powerStages_enum newHdlrStage ); /** Host Powercycle control Handler Stage Change member function */ int powercycleStageChange ( struct nodeLinkClass::node * node_ptr, mtc_powercycleStages_enum newHdlrStage ); /** Out-Of-Service Test Stage Change member function */ int oosTestStageChange ( struct nodeLinkClass::node * node_ptr, mtc_oosTestStages_enum newHdlrStage ); /** Inservice Test Stage Change member function */ int 
insvTestStageChange ( struct nodeLinkClass::node * node_ptr, mtc_insvTestStages_enum newHdlrStage ); /** Host Sensor Handler Stage Change member function */ int sensorStageChange ( struct nodeLinkClass::node * node_ptr, mtc_sensorStages_enum newHdlrStage ); /** Generic Substage Stage change member function */ int subStageChange ( struct nodeLinkClass::node * node_ptr, mtc_subStages_enum newHdlrStage ); int failed_state_change ( struct nodeLinkClass::node * node_ptr ); /* issue a * - one way lazy reboot with * - graceful SM services shutdown and * - failsafe backup sysreq reset */ int lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_ptr ); int alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr ); int alarm_insv_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_insv_failure ( struct nodeLinkClass::node * node_ptr ); int alarm_config_clear ( struct nodeLinkClass::node * node_ptr ); int alarm_config_failure ( struct nodeLinkClass::node * node_ptr ); int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev ); void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ); /** Find the node that has this timerID in its general mtc timer */ struct nodeLinkClass::node * get_mtcTimer_timer ( timer_t tid ); struct nodeLinkClass::node * get_mtcConfig_timer ( timer_t tid ); struct nodeLinkClass::node * get_mtcAlive_timer ( timer_t tid ); struct nodeLinkClass::node * get_offline_timer ( timer_t tid ); struct nodeLinkClass::node * get_mtcSwact_timer ( timer_t tid ); struct nodeLinkClass::node * get_mtcCmd_timer ( timer_t tid ); struct nodeLinkClass::node * get_oosTestTimer ( timer_t tid ); struct 
nodeLinkClass::node * get_insvTestTimer ( timer_t tid ); struct nodeLinkClass::node * get_power_timer ( timer_t tid ); struct nodeLinkClass::node * get_http_timer ( timer_t tid ); struct nodeLinkClass::node * get_thread_timer ( timer_t tid ); struct nodeLinkClass::node * get_ping_timer ( timer_t tid ); struct nodeLinkClass::node * get_bm_timer ( timer_t tid ); struct nodeLinkClass::node * get_bmc_access_timer ( timer_t tid ); struct nodeLinkClass::node * get_host_services_timer ( timer_t tid ); struct nodeLinkClass::node * get_powercycle_control_timer ( timer_t tid ); struct nodeLinkClass::node * get_powercycle_recovery_timer ( timer_t tid ); struct nodeLinkClass::node * get_reset_control_timer ( timer_t tid ); struct nodeLinkClass::node * get_reset_recovery_timer ( timer_t tid ); /* Launch the specified host services command (start or stop) for any host * type into the cmd_handler. In support of AIO a subf bool is optional * (defaults to false) and forces the command to be COMPUTE (subfunction). * - requires cmd_handler fsm */ int launch_host_services_cmd ( struct nodeLinkClass::node * node_ptr, bool start , bool subf=false ); /* Private SYSINV API ; node pointer based variants of the * public hostname based mtcInvApi_xxx interfaces of this class. * NOTE(review): the 'const char * task_str_ptr' overloads of * mtcInvApi_update_task presumably treat the string as a format taking * 'one' (and 'two') - confirm in the implementation */ int mtcInvApi_update_task ( struct nodeLinkClass::node * node_ptr, string task ); int mtcInvApi_update_task_now ( struct nodeLinkClass::node * node_ptr, string task ); int mtcInvApi_force_task ( struct nodeLinkClass::node * node_ptr, string task ); int mtcInvApi_update_task ( struct nodeLinkClass::node * node_ptr, const char * task_str_ptr, int one ); int mtcInvApi_update_task ( struct nodeLinkClass::node * node_ptr, const char * task_str_ptr, int one, int two ); int mtcInvApi_update_value ( struct nodeLinkClass::node * node_ptr, string key, string value ); int mtcInvApi_update_uptime ( struct nodeLinkClass::node * node_ptr, unsigned int uptime ); int mtcInvApi_subf_states ( struct nodeLinkClass::node * node_ptr, string oper_subf, string avail_subf ); int mtcInvApi_force_states ( struct nodeLinkClass::node * node_ptr, string admin, string
oper, string avail ); int mtcInvApi_update_states ( struct nodeLinkClass::node * node_ptr, string admin, string oper, string avail ); int mtcInvApi_update_states_now ( struct nodeLinkClass::node * node_ptr, string admin, string oper, string avail, string oper_subf, string avail_subf); int mtcInvApi_update_state ( struct nodeLinkClass::node * node_ptr, string state, string value ); /* Private SM API */ int mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, mtc_cmd_enum operation, int retries ); /* Private VIM API */ int mtcVimApi_state_change ( struct nodeLinkClass::node * node_ptr, libEvent_enum operation, int retries ); int set_bm_prov ( struct nodeLinkClass::node * node_ptr, bool state ); void set_uptime ( struct nodeLinkClass::node * node_ptr, unsigned int uptime, bool force ); // #endif /* WANT_MTC */ /** Interface to assert or clear severity specific heartbeat alarms */ void manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface ); /** Returns the heartbeat monitoring state for the specified interface */ bool get_hbs_monitor_state ( string & hostname, int iface ); /** List of allocated node memory. * * An array of node pointers. */ nodeLinkClass::node * node_ptrs[MAX_NODES] ; /** A memory allocation counter. * * Should represent the number of nodes in the linked list. */ int memory_allocs ; /** A memory used counter * * A variable storing the accumulated node memory */ int memory_used ; /** Inservice memory management audit. * * Verifies that the node_ptr list and memory_allocs agree as well * as all the node pointers point to a node in the linked list. * * @return * an integer representing a PASS or TODO: list other error codes. */ int memory_audit ( void ); /* Simplex mode auto recovery bools * * Set to true when the autorecovery threshold is reached * and we want to avoid taking further autorecovery action * even though it may be requested.
*/ bool autorecovery_disabled ; /* Set to true by fault detection methods that are * autorecoverable when in simplex mode. */ bool autorecovery_enabled ; /** Tracks the number of hosts that 'are currently' in service trouble * wrt heartbeat (above minor threshold). * This is used in multi-host failure avoidance. **/ int mnfa_host_count[MAX_IFACES] ; /** Tracks the number of times multi node failure avoidance was exited */ int mnfa_occurances ; /** true when the multi node failure count exceeds the multi * node failure avoidance threshold and until there are no more * in service trouble hosts */ bool mnfa_active ; /** Recover or exit from the multi-node failure avoidance state * This involves restarting the heartbeat on all the nodes * that remain hbs_minor and clearing any heartbeat degrade * states that remain. */ void mnfa_exit ( bool force ); void mnfa_enter ( void ); void mnfa_add_host ( struct nodeLinkClass::node * node_ptr, iface_enum iface ); void mnfa_recover_host ( struct nodeLinkClass::node * node_ptr ); void hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, iface_enum iface ); /* Dead Office Recovery - system level controls */ void manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT severity ); void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix ); struct { struct node * head_ptr ; /**< Pulse Linked List Head pointer */ struct node * tail_ptr ; /**< Pulse Linked List Tail pointer */ struct node * last_ptr ; /**< Pulse Linked List running last pointer */ } pulse_list [MAX_IFACES] ; /** General Pulse Pointer used to build pulse linked list */ struct node * pulse_ptr ; /** Number of monitored hosts (nodes) for a specified interface */ int pulses[MAX_IFACES] ; /** Resource reference Array: An array used to store * resource references for the purpose of fast resource * lookup making the heartbeat service more scalable.
* * In this case it is an array of node link pointers * that are in the current active pulse list. */ struct node * hbs_rra[MAX_NODES]; /** Pulse list node lookup pointer by hostname. * * Get pointer to "hostname" node located in the pulse list. * * @param hostname - a string containing the name of the host * to be searched for in the pulse list. * @param iface - iface_enum specifying which interface linked * list to search. * * @return pointer to the node's control struct */ struct nodeLinkClass::node* getPulseNode ( string & hostname, iface_enum iface ); /** Manage the heartbeat pulse flags by node pointer * * These flags contain service information sent by the replying host. * One example of this is the pmond flag which indicates whether the process * monitor is running on that host. * * Flags that are not set are thresholded for degrade or alarm assertion * or cleared when found to be set again. * * @param pulse_ptr - node's control struct pointer * @param flags - integer containing a bit field set of flags * * */ void manage_pulse_flags ( struct nodeLinkClass::node* pulse_ptr, unsigned int flags ); /** Remove a node from the pulse list by name, index or node pointer * * Deal with all the removal cases ; head, tail, full splice * * @param iface - iface_enum specifying which interface pulse list * to remove the node from. * @param clear_b2b_misses_count - NOTE(review): presumably requests that * the node's back-to-back heartbeat miss count be cleared on * removal - confirm in the implementation. * @param flags - unsigned integer ; NOTE(review): presumably the same * bit field of flags handled by manage_pulse_flags - confirm. * * @return * an integer of PASS or -FAULT, -ENXIO */ int remPulse_by_name ( string & hostname, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags ); int remPulse_by_index ( string hostname, int index, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags ); int remPulse ( struct node * node_ptr, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags ); /** Debug Dump Log Interfaces */ void mem_log_general ( void ); void mem_log_general_mtce_hosts ( void ); void mem_log_mnfa ( void ); void mem_log_dor ( struct nodeLinkClass::node * node_ptr ); void mem_log_identity ( struct nodeLinkClass::node * node_ptr ); void mem_log_network ( struct nodeLinkClass::node * node_ptr ); void mem_log_state1 ( struct
nodeLinkClass::node * node_ptr ); void mem_log_state2 ( struct nodeLinkClass::node * node_ptr ); void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ); void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ); void mem_log_stage ( struct nodeLinkClass::node * node_ptr ); void mem_log_test_info ( struct nodeLinkClass::node * node_ptr ); void mem_log_bm ( struct nodeLinkClass::node * node_ptr ); void mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr ); void mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr ); void mem_log_type_info ( struct nodeLinkClass::node * node_ptr ); void mem_log_reset_info( struct nodeLinkClass::node * node_ptr ); void mem_log_power_info( struct nodeLinkClass::node * node_ptr ); void mem_log_thread_info ( struct nodeLinkClass::node * node_ptr ); void print_node_info ( struct nodeLinkClass::node * node_ptr ); // #endif /** Public Interfaces that allow hosts to be * added or removed from maintenance. */ public: nodeLinkClass(); /**< constructor */ ~nodeLinkClass(); /**< destructor */ system_type_enum system_type ; string functions ; /**< comma delimited string list of functions supported */ bool maintenance ; bool heartbeat ; /* offline_handler tuning controls */ int offline_threshold ; /* number of back to back mtcAlive misses before offline */ int offline_period ; /* offline handler mtcAlive request period */ /* dor mode data ; state and start time * - start time is used to compare how long slave hosts take to come up * after the active controller has entered dor mode */ bool dor_mode_active ; unsigned int dor_start_time ; int dor_mode_active_log_throttle ; bool hbs_disabled ; /**< Control heartbeat service state */ bool hbs_state_change ; /**< Flag service state change */ int hbs_pulse_period ; /**< The current pulse period in msec */ int hbs_pulse_period_save ; /**< preserved copy of hbs_pulse_period */ /** a loop counter used to detect when the heartbeat service is silently failing */ int
hbs_silent_fault_detector ; /* prevents flooding FM with the silent_fault detected log */ int hbs_silent_fault_logged ; /* tracks the number of pulse requests set on each interface */ int pulse_requests[MAX_IFACES] ; /** The number of heartbeat misses that result in a * minor notification to maintenance */ int hbs_minor_threshold ; /** The number of heartbeat misses that result in a degraded state */ int hbs_degrade_threshold ; /** The number of heartbeat misses that result in a failed state */ int hbs_failure_threshold ; /** Running Resource Reference Identifier */ int rrri ; bool active ; bool is_active ( void ) { return (active); } void set_activity_state ( bool state ) { active = state ; } /** Store the hostname of this controller */ string my_hostname ; /**< */ string my_local_ip ; /**< Primary IP address */ string my_float_ip ; /**< Secondary (floating) IP address */ /********* New Public Constructs for IPMI Comamnd Handling ***********/ /* the main fsm entrypoint to service all hosts */ void fsm ( void ) ; /** This controller's hostname set'er */ void set_my_hostname ( string hostname ); /** This controller's hostname get'er */ string get_my_hostname ( void ); /** This controller's local ip addr set'er */ void set_my_local_ip ( string & hostname ); /** This controller's local ip addr get'er */ string get_my_local_ip ( void ); /** This controller's local ip addr set'er */ void set_my_float_ip ( string & hostname ); /** This controller's local ip addr get'er */ string get_my_float_ip ( void ); /** get ip address for any hostname */ string get_hostaddr ( string & hostname ); /** get mac address for any hostname and specified interface */ string get_hostIfaceMac ( string & hostname, int iface ); /** get infrastructure network ip address for any hostname */ string get_infra_hostaddr ( string & hostname ); /** set a node's ip address */ int set_hostaddr ( string & hostname, string & ip ); /** set a node's infrastructure ip address */ int set_infra_hostaddr ( 
string & hostname, string & ip ); /** get hostname for any host address */ string get_hostname ( string & hostaddr ); /******************************/ /* NODE TYPE Member Functions */ /******************************/ /** Fetch the node type (compute or controller) by hostname */ int get_nodetype ( string & hostname ); /** Check if a node is a controller */ bool is_controller ( struct nodeLinkClass::node * node_ptr ); /** Check if a node is a compute */ bool is_compute ( struct nodeLinkClass::node * node_ptr ); /** Check if a node has a compute subfunction */ bool is_compute_subfunction ( struct nodeLinkClass::node * node_ptr ); /** Get a node's function / subfunction as a string, by hostname */ string get_node_function_str ( string hostname ); string get_node_subfunction_str ( string hostname ); /** Check if a node is a storage */ bool is_storage ( struct nodeLinkClass::node * node_ptr ); /** Check if a node is a controller by hostname */ bool is_controller ( string & hostname ); /** Check if a node is a compute by hostname */ bool is_compute ( string & hostname ); /** Check if a node has a compute subfunction by hostname */ bool is_compute_subfunction ( string & hostname ); /** Check if a node is a storage by hostname */ bool is_storage ( string & hostname ); /** Sets a host's function and subfunction members */ int update_host_functions ( string hostname , string functions ); /***********************************************************/ /** Number of provisioned hosts (nodes) */ int hosts ; /** Host has been deleted */ bool host_deleted ; /** Host Administrative State Change public member function */ int admin_state_change ( string hostname, string newAdminState ); /** Host Operational State Change public member function */ int oper_state_change ( string hostname, string newOperState ); /** Host Availability Status Change public member function */ int avail_status_change ( string hostname, string newAvailStatus ); /** Host Subfunction Operational State Change public member function */ int oper_subf_state_change ( string hostname, string newOperState ); /** Host Subfunction Availability Status Change public member function */ int
avail_subf_status_change ( string hostname, string newAvailStatus ); /** Update mtce Key with Value */ int update_key_value ( string hostname, string key , string value ); /** This is the list of inventory by hostname. * The Maintenance FSM loops over this list * to provide maintenance service * * NOTE(review): no element type is visible on the std::list * declarations below - confirm against the original header. */ std::list hostname_inventory ; std::list::iterator host ; std::list mnfa_awol_list ; void mnfa_timeout_handler ( void ); /** Return the number of inventoried hosts */ int num_hosts ( void ); /** ********************************************************************** * * Name : nodeLinkClass::workQueue_enqueue * * Description: Adds the next sequence number to the supplied event * reference, creates a log prefix based on the event's * hostname, service, operation and sequence number * (to avoid repeated recreation) and then copies that * event to the work queue. * * @param event is a reference to the caller's libEvent. * @return an integer with value of PASS. * * Implementation: in maintenance/mtcWorkQueue.cpp * * *********************************************************************/ int workQueue_enqueue ( libEvent & event ); /** ********************************************************************** * * Name : nodeLinkClass::doneQueue_dequeue * * Description: Searches the done queue for the event matching the supplied * event reference , specifically the sequence number. If found * it pulls the execution status information and then proceeds * to remove it from the done queue. * * If the event is found then the event status is returned. * If not found then a RETRY is returned. * If the done event status is RETRY then a FAIL is returned since * it should not be on the done queue with a retry status.
* * @param event is a reference to the caller's libEvent * @return an integer with values of PASS, FAIL, RETRY * * Implementation: in maintenance/mtcWorkQueue.cpp * * ************************************************************************/ int doneQueue_dequeue ( libEvent & event ); bool workQueue_present ( libEvent & event ); void workQueue_dump_all ( void ); void doneQueue_dump_all ( void ); void mtcCmd_workQ_dump_all ( void ); void mtcCmd_doneQ_dump_all ( void ); /** Add a host to the Node list */ int add_host ( node_inv_type & inv ); int mod_host ( node_inv_type & inv ); int set_host_failed ( node_inv_type & inv ); /** Check to see if the node list already contains any of the following * information and reject the add or modify if it does * * uuid * hostname * ip address * mac address * **/ int add_host_precheck ( node_inv_type & inv ); int del_host ( string uuid ); /** Returns empty string if not provisioned or the name of the host if it is */ string get_host ( string uuid ); string get_uuid ( string hostname ); void set_uuid ( string hostname, string uuid ); void set_task ( string hostname, string task ); /** Updates the hostname and resource reference identifier * based on the next one in the cycle */ void get_rris ( string & hostname, int & rri ); /** Performs a service affecting semantic check on whether * the specified uuid can be locked. * In the case of a compute node it asks Nova. * In the case of a controller it verifies that there is * another controller active and inservice. * * @param uuid string * @param reason int reference ; NOTE(review): presumably updated with * the reason code when the lock is refused - confirm. * * @return true if locked and false otherwise * NOTE(review): presumably means 'true if the host can be * locked' - confirm in the implementation. * */ bool can_uuid_be_locked ( string uuid , int & reason ); //#ifdef WANT_HBS /** Add a host to the heartbeat Node list */ int add_heartbeat_host ( const node_inv_type &inv ); // #endif void host_print ( struct nodeLinkClass::node * node_ptr ); /** Remove a host from Node list */ int rem_host ( string & hostname ); /* Returns the active client.
*/ mtc_client_enum get_activeClient ( string hostname ); /* Sets the active client for this particular host. The first use of this * is for reset/reboot acknowledge to the VIM over an evacuate reset request * from within the reboot handler. */ int set_activeClient ( string hostname, mtc_client_enum client ); /** Get the number of compute hosts that are operationally 'enabled' */ int enabled_compute_nodes ( void ); /** Get the number of storage hosts that are operationally 'enabled' */ int enabled_storage_nodes ( void ); /** get the number of hosts that are enabled excluding the active controller */ int enabled_nodes ( void ); /** Get the system's storage backend type */ int get_storage_backend ( void ); /** Returns true if the storage pool has a monitor running on * an unlocked-enabled storage host */ bool is_storage_mon_enabled ( void ) ; /** true if the management link's operational state is up and running */ bool mgmnt_link_up_and_running ; /** NOTE(review): presumably the infrastructure link equivalent of * mgmnt_link_up_and_running - confirm */ bool infra_link_up_and_running ; /** A boolean that is used to quickly determine if the infrastructure * network is provisioned and configured for this daemon to use */ bool infra_network_provisioned ; /** A debug bool that allows infrastructure heartbeat failures to only * cause host degrade rather than failure */ bool infra_degrade_only ; int service_netlink_events ( int nl_socket , int ioctl_socket ); void manage_heartbeat_minor ( string hostname, iface_enum iface, bool clear_event ); void manage_heartbeat_degrade ( string hostname, iface_enum iface, bool clear_event ); void manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event ); /* Clear heartbeat failed flag for all interfaces * NOTE(review): the declaration takes a single iface argument ; * confirm how 'all interfaces' is handled in the implementation */ void manage_heartbeat_clear ( string hostname, iface_enum iface ); /** Test and Debug Members and Variables */ /** Print node info banner */ void print_node_info ( void ); int testhead ( int test ); int testmode ; // #ifdef WANT_MTC /** Hostname of the Active Controller */ std::string active_controller_hostname ; /**
Hostname of the Inactive Controller */ std::string inactive_controller_hostname ; bool inactive_controller_is_patched ( void ); bool inactive_controller_is_patching ( void ); /** Inactive / Active controller hostname get'ers and set'ers */ string get_inactive_controller_hostname ( void ); void set_inactive_controller_hostname ( string hostname ); string get_active_controller_hostname ( void ); void set_active_controller_hostname ( string hostname ); /** Returns 'true' if inactive controller main/subfunction is in-service * * In-Service means "unlocked-enabled-available" or * "unlocked-enabled-degraded" */ bool is_inactive_controller_main_insv ( void ); bool is_inactive_controller_subf_insv ( void ); /** Returns true if the specified hostname is the active controller */ bool is_active_controller ( string hostname ); /** Returns number of enabled controllers */ int num_controllers_enabled ( void ); /** Run the FSM against the specified host */ int run_fsm ( string hostname ); /** Post a specific enable handler stage */ int set_enableStage ( string & hostname, mtc_enableStages_enum stage ); /** Get a posted enable handler stage */ mtc_enableStages_enum get_enableStage ( string & hostname ); /** Set the reboot stage */ int set_rebootStage ( string & hostname, mtc_resetProgStages_enum stage ); /** handle an expired timer.
Find the node with this * timer ID and set its ringer */ void timer_handler ( int sig, siginfo_t *si, void *uc); struct mtc_timer mtcTimer ; struct mtc_timer mtcTimer_mnfa ; struct mtc_timer mtcTimer_token ; struct mtc_timer mtcTimer_uptime ; /* System Level DOR recovery timer * Note: tid != NULL represents DOR Mode Active */ struct mtc_timer mtcTimer_dor ; unsigned int get_cmd_resp ( string & hostname ); void set_cmd_resp ( string & hostname, mtc_message_type & msg ); void set_uptime ( string & hostname, unsigned int uptime, bool force ); unsigned int get_uptime ( string & hostname ); void set_uptime_refresh_ctr ( string & hostname, int value ); int get_uptime_refresh_ctr ( string & hostname ); /** Per-host 'maintenance alive' (mtcAlive) message utilities for the * hostnamed node. Note: set_mtcAlive and ctl_mtcAlive_gate return * nothing ; only get_mtcAlive_gate returns a bool. */ void set_mtcAlive ( string & hostname, int iface ); bool get_mtcAlive_gate ( string & hostname ); void ctl_mtcAlive_gate ( string & hostname, bool gated ); /** Store the latest mtce flags for the specified host * current flags are defined in nodeBase.h #define MTC_FLAG__I_AM_CONFIGURED (0x00000001) #define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002) #define MTC_FLAG__I_AM_HEALTHY (0x00000004) #define MTC_FLAG__I_AM_LOCKED (0x00000008) */ void set_mtce_flags ( string hostname, int flags ); /** Updates the node's health code * Codes are found in nodeBase.h * * - NODE_HEALTH_UNKNOWN (0) * - NODE_HEALTHY (1) * - NODE_UNHEALTHY (2) * * */ void set_health ( string & hostname, int health ); /** Per-host 'go enabled' message set'ers and get'ers for the * hostnamed node ; the '_subf' variants are the subfunction * equivalents. Only the get_goEnabled variants return a bool. */ void set_goEnabled_failed ( string & hostname ); void set_goEnabled ( string & hostname ); bool get_goEnabled ( string & hostname ); void set_goEnabled_failed_subf ( string & hostname ); void set_goEnabled_subf ( string & hostname ); bool get_goEnabled_subf ( string & hostname ); int set_subf_info ( string hostname, string functions, string operState_subf, string availState_subf ); /** Board management variable setter and
getter utilities * Only the bm_ip is propagated through to the database */ int set_bm_ip ( string hostname , string bm_ip ); int set_bm_type ( string hostname , string bm_type ); int set_bm_un ( string hostname , string bm_un ); bool is_bm_ip_already_used ( string bm_ip ); int manage_bmc_provisioning ( struct node * node_ptr ); string get_bm_ip ( string hostname ); string get_bm_un ( string hostname ); string get_bm_type ( string hostname ); string get_hostname_from_bm_ip ( string bm_ip ); string get_hwmon_info ( string hostname ); int get_server_code ( string hostname ); void set_hwmond_monitor_state ( string & hostname, bool state ); bool get_hwmond_monitor_state ( string & hostname ); int manage_shadow_change ( string hostname ); int inotify_shadow_file_fd ; int inotify_shadow_file_wd ; /** The multi node failure avoidance type ; MNFA_NUMBER or MNFA_PERCENT */ #define MNFA_NUMBER 0 #define MNFA_PERCENT 1 int mnfa_threshold_type ; /** % of hosts that need to simultaneously fail before 'mnfa' kicks in */ int mnfa_threshold_percent ; /** # of hosts that need to simultaneously fail before 'mnfa' kicks in */ int mnfa_threshold_number ; /** the calculated threshold */ int mnfa_threshold ; /** Calculates and returns the mnfa threshold based on enabled hosts */ int mnfa_calculate_threshold ( string hostname ); /***************************************** ** Process Monitor Event Utilities API ** *****************************************/ /** Interface to declare that a key service on the * specified host is up, running and ready */ int declare_service_ready ( string & hostname, unsigned int service ); /** Process Monitor 'Clear' Event handler. * * The process specified will be removed from the * 'degraded_processes_list' and 'critical_processes_list' for * the specified host. * if there are no other degraded/critical processes or other * degraded services/reasons against that host then * this handler will clear the degrade state for the * specified host altogether.
*/ int degrade_pmond_clear ( string & hostname ); /** Resource Monitor 'Clear' Event handler. * * The resource specified will be removed from the * 'degraded_resources_list' for specified host. * if there are no other degraded resources or other * degraded services/reasons against that host then * this handler will clear the degrade state for the * specified host altogether. */ int degrade_resource_clear ( string & hostname, string & resource ); /** * If the pmond degrade flag is not set then do so. * if the host is not degraded then set it to degraded. */ int degrade_process_raise ( string & hostname, string & process ); /** if host is unlocked-enabled generate a process failure log */ int log_process_failure ( string & hostname, string & process ); /** if host is unlocked-enabled generate a process failure alarm */ int alarm_process_failure ( string & hostname, string & process ); /** Resource Monitor Raise Event handler. * * The host will enter degrade state due to the specified resource * not running properly. The resource name is recorded in the * 'degraded_resources_list' for specified host. * Clearing degrade against this resource requires that host to * send a clear event against that resource or for that host to * fully re-enable */ int degrade_resource_raise ( string & hostname, string & resource ); /** Generate a resource failure log if the host is unlocked */ int log_resource_failure ( string & hostname, string & resource ); /** Hardware Process Monitor Degrade Event handler. * see implementation for details */ int node_degrade_control ( string & hostname, int state, string service ); /** Hardware Monitor 'Action' Event method * * The hardware monitor daemon is calling out a sensor that * is operating out of spec. The action is the accompanying * action that hwmond requested as a recovery action to this failure. * The sensor is the sensor name that triggered the event.
*/ int invoke_hwmon_action ( string & hostname, int action, string & sensor ); /** Process Monitor Failed Event handler. * * The host will go out of service and be reset and * automatically re-enabled. */ int critical_process_failed( string & hostname, string & process, unsigned int nodetype ); /** Resource Monitor Failed Event handler. * * The host will go out of service and be reset and * automatically re-enabled. */ int critical_resource_failed( string & hostname, string & resource ); /************************************************************/ /** * Node state set'ers and get'ers, by hostname, for the * admin action, admin state, oper state and avail status */ mtc_nodeAdminAction_enum get_adminAction ( string & hostname ); int set_adminAction ( string & hostname, mtc_nodeAdminAction_enum adminAction ); mtc_nodeAdminState_enum get_adminState ( string & hostname ); int set_adminState ( string & hostname, mtc_nodeAdminState_enum adminState ); mtc_nodeOperState_enum get_operState ( string & hostname ); int set_operState ( string & hostname, mtc_nodeOperState_enum operState ); mtc_nodeAvailStatus_enum get_availStatus ( string & hostname ); int set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum availStatus ); /** Convert the supplied string to a valid maintenance Admin State enum */ mtc_nodeAdminState_enum adminState_str_to_enum ( const char * admin_string_ptr ); /** Convert the supplied string to a valid maintenance Oper State enum */ mtc_nodeOperState_enum operState_str_to_enum ( const char * oper_string_ptr ); /** Convert the supplied string to a valid maintenance Avail Status enum */ mtc_nodeAvailStatus_enum availStatus_str_to_enum ( const char * avail_string_ptr ); /** Convert the supplied enum to the corresponding Admin Action string */ string adminAction_enum_to_str ( mtc_nodeAdminAction_enum val ); /** Convert the supplied enum to the corresponding Admin State string */ string adminState_enum_to_str ( mtc_nodeAdminState_enum val ); /** Convert the supplied enum to the corresponding Oper State string */ string
operState_enum_to_str ( mtc_nodeOperState_enum val ); /** Convert the supplied enum to the corresponding Avail Status string */ string availStatus_enum_to_str ( mtc_nodeAvailStatus_enum val ); string get_operState_dport ( string & hostname ); string get_availStatus_dport ( string & hostname ); /******************************************** ** External Services Control Utilities API * ********************************************/ /** number of times mtce will retry an API before it gives up. * Configurable option through mtc.ini */ int api_retries ; /* Inventory APIs ; public, hostname based */ int mtcInvApi_cfg_show ( string hostname ); int mtcInvApi_cfg_modify ( string hostname, bool install ); int mtcInvApi_load_host ( string & hostname , node_inv_type & info ); int mtcInvApi_update_task ( string hostname, string task ); int mtcInvApi_force_task ( string hostname, string task ); int mtcInvApi_update_state ( string hostname, string state, string value ); int mtcInvApi_update_states ( string hostname, string admin, string oper, string avail ); int mtcInvApi_force_states ( string hostname, string admin, string oper, string avail ); int mtcInvApi_subf_states ( string hostname, string oper_subf, string avail_subf ); int mtcInvApi_update_states_now ( string hostname, string admin, string oper, string avail, string oper_subf, string avail_subf ); int mtcInvApi_update_task_now ( string hostname, string task ); int mtcInvApi_update_value ( string hostname, string key, string value ); int mtcInvApi_update_uptime ( string hostname, unsigned int uptime ); /* Handlers that take an evhttp_request pointer and an opaque arg pointer */ void mtcInvApi_add_handler ( struct evhttp_request *req, void *arg ); void mtcInvApi_qry_handler ( struct evhttp_request *req, void *arg ); void mtcInvApi_get_handler ( struct evhttp_request *req, void *arg ); string mtcVimApi_state_get ( string hostname, int & http_status_code ); int mtcVimApi_system_info ( string & response ); void mtcSmgrApi_handler ( struct evhttp_request *req, void *arg ); void mtcHttpUtil_handler ( struct evhttp_request *req,
void *arg ); /* Update the authentication token as a work queue'd command */ int mtcKeyApi_refresh_token ( string hostname ); /* Update the authentication token now ; as a blocking request */ int mtcKeyApi_get_token ( string hostname ); /*********************** Public Heartbeat Interfaces *********************/ /** Creates a linked list of nodes to heartbeat for the specified port * * Based on unlocked enabled hosts and provisioned ports * * @param * iface_enum specifying the port to create the pulse list for * @return * a pointer to the head of the burndown checkin list for the specified port */ int create_pulse_list ( iface_enum iface ); /** Clear the pulse list */ void clear_pulse_list ( iface_enum iface ); /** Remove a host from an interface's pulse list */ int remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags ); /** Manage the heartbeat pulse flags by hostname * * These flags contain service information sent by the replying host. * One example of this is the pmond flag which indicates whether the process * monitor is running on that host. * * Flags that are not set are thresholded for degrade or alarm assertion * or cleared when found to be set again. * * @param hostname - a string containing the name of the host * that sent the flags. * @param flags - integer containing a bit field set of flags * **/ void manage_pulse_flags ( string & hostname, unsigned int flags ); /** Control the heartbeat monitoring state of a host */ int mon_host ( const string & hostname, iface_enum iface, bool true_false ); /** Return true if the pulse list is empty */ bool pulse_list_empty ( iface_enum iface ); void recalibrate_thresholds ( void ); /** Handle heartbeat losses * * Any hosts that remain in the pulse list at the end * of the heartbeat period have not responded with a * pulse message suggesting a health issue with that host * This interface manages thresholding and acting on hosts * that exceed preset thresholds. 
* */ int lost_pulses ( iface_enum iface ); bool monitored_pulse ( string hostname , iface_enum iface ); /** Print the pulse list */ void print_pulse_list ( iface_enum iface ); /*********************** Public Heartbeat Pulse Data *********************/ /** How many pulses in the list */ int hbs_expected_pulses[MAX_IFACES]; /** How many pulses have come in */ int hbs_detected_pulses[MAX_IFACES]; /** Flag indicating the hbs service is ready to start monitoring hosts */ bool hbs_ready ; /*************************************************************************/ void memDumpAllState ( void ); void memDumpNodeState ( string hostname ); // #endif /** Common REST API Structs */ /* System Management REST API Control Struct */ libEvent sysinvEvent ; /* System Management REST API Control Struct */ libEvent smgrEvent ; /* Keystone Authentication Token Control Struct */ libEvent tokenEvent ; /** /etc/mtc.ini configurable timeouts */ int compute_mtcalive_timeout; int controller_mtcalive_timeout ; int goenabled_timeout ; int swact_timeout ; int sysinv_timeout ; int sysinv_noncrit_timeout ; int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */ int mnfa_recovery_timeout; /**< Multi-Node-Failure Avoidance Recovery Timeout */ int work_queue_timeout ; int node_reinstall_timeout ; /** /etc/mtc.ini configurable audit intervals */ int insv_test_period ; int oos_test_period ; int uptime_period ; int online_period ; int token_refresh_rate; int unknown_host_throttle ; int invalid_arg_throttle ; }; /** * @} nodeLinkClass */ /* allocates nodeLinkClass node_ptr */ #define GET_NODE_PTR(hostname) \ nodeLinkClass::node * node_ptr = this->getNode ( hostname ) ; \ if ( node_ptr == NULL ) \ { \ elog ("%s hostname unknown\n", hostname.c_str()); \ return (FAIL_HOSTNAME_LOOKUP); \ } #define CHK_NODE_PTR(node_ptr) \ if ( node_ptr == NULL ) \ { \ slog ("null node_ptr\n"); \ return (FAIL_NULL_POINTER); \ } nodeLinkClass * inv_init ( void ); nodeLinkClass * get_mtcInv_ptr ( void ); int 
module_init ( void ); const char * get_adminAction_str ( mtc_nodeAdminAction_enum action ); string bmc_get_ip ( string hostname, string mac , string & current_bm_ip ); void clear_host_degrade_causes ( unsigned int & degrade_mask ); bool sensor_monitoring_supported ( string hostname ); void autorecovery_clear ( string hostname ); void log_mnfa_pool ( std::list & mnfa_awol_list ); #endif /* __INCLUDE_NODECLASS_H__ */