Maintenance does not auto-start worker host services in AIO

The mtcClient is required to 'start host services' autonomously
following a node reboot. This is to handle the use case where
the administrator disables maintenance heartbeat loss auto recovery.
If that node then reboots on its own, for whatever reason, maintenance
needs to ensure that it auto starts 'host services'.

A fairly recent update delivered support for that use case:

    https://opendev.org/starlingx/metal/commit/
    1335bc484d

However, the current mechanism the mtcClient used to manage auto-
starting host services did not handle the worker subfunction case.
Moreover, the current implementation is not handling the potential
concurrency between the mtcClient process startup case and mtcAgent
requests during unlock recovery.

This change also fixes an issue where the mtcClient sometimes gets
into a mode where it floods the mtcAgent with start host services
result messages, at 20 unnecessary messages per second. The
aforementioned update modified the mtcAgent to log receipt of this
message, which then floods the mtcAgent log, leading to unnecessary
message handling and log rotations.

Test Plan:

Success Path:

PASS: Verify mtcClient success path handling of start and stop host
      services function for the various node types in a ...
      - standard system with worker and storage nodes
      - all-in-one system with worker node
PASS: Verify appropriate start host services are run on each node
      type following a Dead Office Recovery (DOR).
      - standard system with worker and storage nodes
      - all-in-one system with worker node
PASS: Verify the mtcClient does not unnecessarily send host services
      result messages.
PASS: Verify handling of periodic start host services message while
      a node is in service.

Failure Path:

PASS: Verify mtcClient failure path handling of start and stop host
      services function for the various node types in a ...
      - standard system with worker and storage nodes
      - all-in-one system with worker node

PASS: Verify mtcClient start host services command handling when
      message requests interleave with auto start handling
      during unlock recovery.

Closes-Bug: 2073802
Change-Id: I0da7a16c1f600cc60364f6bcec7587e2ff71c624
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2024-08-08 15:03:17 +00:00
parent 50204147ff
commit dab9c4774b
4 changed files with 84 additions and 47 deletions

View File

@ -413,29 +413,28 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
else if ( is_host_services_cmd ( msg.cmd ) == true )
{
ctrl_type * ctrl_ptr = get_ctrl_ptr () ;
/* Check to see if this command is already running.
* hostservices.posted is set to command on launch
* hostservices.monitor is set to command while monitoring */
if (( ctrl_ptr->hostservices.posted == msg.cmd ) ||
( ctrl_ptr->hostservices.monitor == msg.cmd ))
{
wlog ("%s already in progress (%d:%d)\n",
ilog ("%s request from %s network",
command_name.c_str(),
ctrl_ptr->hostservices.posted,
ctrl_ptr->hostservices.monitor );
iface_name_ptr);
if ( msg.cmd == MTC_CMD_START_CONTROL_SVCS )
{
ctrl_ptr->start_controller_hostservices = true ;
rc = PASS ;
}
else if ( msg.cmd == MTC_CMD_START_WORKER_SVCS )
{
ctrl_ptr->start_worker_hostservices = true ;
rc = PASS ;
}
else if ( msg.cmd == MTC_CMD_START_STORAGE_SVCS )
{
ctrl_ptr->start_storage_hostservices = true ;
rc = PASS ;
}
else
{
ctrl_ptr->posted_script_set.push_back ( HOSTSERVICES_SCRIPTS );
ctrl_ptr->posted_script_set.unique ();
ilog ("%s request posted from %s network",
command_name.c_str(),
iface_name_ptr);
ctrl_ptr->hostservices.posted = msg.cmd ;
ctrl_ptr->hostservices.monitor = MTC_CMD_NONE ;
rc = PASS ;

View File

@ -297,7 +297,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
/* log if not locked message */
if ( msg.cmd != MTC_MSG_LOCKED )
{
ilog ("%s '%s' ACK (rc:%d) (%s)",
ilog ("%s %s request ACK (rc:%d) (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
msg.parm[0],
@ -305,7 +305,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
}
else
{
mlog ("%s '%s' ACK (rc:%d) (%s)",
mlog ("%s %s request ACK (rc:%d) (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
msg.parm[0],

View File

@ -784,7 +784,7 @@ void _scripts_cleanup ( script_set_enum script_set )
script_ptr = &ctrl.hostservices ;
break ;
default:
slog ("invalid script set (%d)\n", script_set );
dlog ("invalid script set (%d)\n", script_set );
return ;
}
@ -838,7 +838,7 @@ void _manage_services_scripts ( void )
if ( ! ctrl.hostservices.scripts )
{
/* send a PASS result */
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, NULL );
ctrl.active_script_set = NO_SCRIPTS ;
return ;
}
@ -870,9 +870,10 @@ void _manage_services_scripts ( void )
}
else
{
ilog ("Host Services Complete ; all passed\n");
ilog ("Host Services Complete ; all passed ; %s", get_mtcNodeCommand_str(ctrl.current_hostservices_command));
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, NULL );
}
ctrl.active_script_set = NO_SCRIPTS ;
}
/* do if have we timed out ? */
@ -1937,18 +1938,25 @@ void daemon_service_run ( void )
}
}
if (( start_host_services_needs_to_be_run == true) &&
( ctrl.posted_script_set.size() == 0 ))
/* This is a process startup handling case.
* Need to ensure that the appropriate host
* services are started for the system/node
* type. */
if ( start_host_services_needs_to_be_run == true )
{
bool run_start_host_services = false ;
dlog1 ("Start Host Services needs to be run");
if ( ctrl.system_type == SYSTEM_TYPE__NORMAL )
{
/* Any node on a standard system */
if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) )
{
ilog ("start host services on standard system accepted");
run_start_host_services = true ;
if ( ctrl.nodetype & CONTROLLER_TYPE )
ctrl.start_controller_hostservices = true ;
else if ( ctrl.nodetype & WORKER_TYPE )
ctrl.start_worker_hostservices = true ;
else if ( ctrl.nodetype & STORAGE_TYPE )
ctrl.start_storage_hostservices = true ;
start_host_services_needs_to_be_run = false ;
}
else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) )
{
@ -1963,7 +1971,11 @@ void daemon_service_run ( void )
if ( daemon_is_file_present ( GOENABLED_SUBF_PASS ) )
{
ilog ("start host services on all-in-one controller accepted");
run_start_host_services = true ;
if ( ctrl.nodetype & CONTROLLER_TYPE)
ctrl.start_controller_hostservices = true ;
if ( ctrl.nodetype & WORKER_TYPE )
ctrl.start_worker_hostservices = true ;
start_host_services_needs_to_be_run = false ;
}
else if (( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ||
( daemon_is_file_present ( GOENABLED_SUBF_FAIL ))))
@ -1979,7 +1991,11 @@ void daemon_service_run ( void )
if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) )
{
ilog ("start host services on all-in-one plus node accepted");
run_start_host_services = true ;
if ( ctrl.nodetype & WORKER_TYPE )
ctrl.start_worker_hostservices = true ;
else if ( ctrl.nodetype & STORAGE_TYPE )
ctrl.start_storage_hostservices = true ;
start_host_services_needs_to_be_run = false ;
}
else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) )
{
@ -1989,24 +2005,43 @@ void daemon_service_run ( void )
}
}
if ( run_start_host_services )
{
ctrl.posted_script_set.push_back ( HOSTSERVICES_SCRIPTS );
int cmd = MTC_CMD_NONE ;
if ( ctrl.nodetype & CONTROLLER_TYPE)
cmd = MTC_CMD_START_CONTROL_SVCS ;
else if ( ctrl.nodetype & WORKER_TYPE )
cmd = MTC_CMD_START_WORKER_SVCS ;
else if ( ctrl.nodetype & STORAGE_TYPE )
cmd = MTC_CMD_START_STORAGE_SVCS ;
ctrl.hostservices.posted = cmd ;
ctrl.hostservices.monitor = MTC_CMD_NONE ;
ilog ("posted start host services ; from process startup ; cmd:%s", get_mtcNodeCommand_str(cmd));
start_host_services_needs_to_be_run = false ;
}
}
// Handle auto start of node personality services.
// - prioritize controller first
// - prevent more than one being posted at once
if (( ctrl.start_controller_hostservices ) &&
( ctrl.posted_script_set.empty()) &&
( ctrl.active_script_set == NO_SCRIPTS ))
{
ctrl.posted_script_set.push_front ( HOSTSERVICES_SCRIPTS );
ctrl.current_hostservices_command =
ctrl.hostservices.posted = MTC_CMD_START_CONTROL_SVCS ;
ctrl.hostservices.monitor = MTC_CMD_NONE ;
ilog ("scheduling %s", get_mtcNodeCommand_str(ctrl.hostservices.posted));
ctrl.start_controller_hostservices = false ;
}
if (( ctrl.start_worker_hostservices ) &&
( ctrl.posted_script_set.empty()) &&
( ctrl.active_script_set == NO_SCRIPTS ))
{
ctrl.posted_script_set.push_front ( HOSTSERVICES_SCRIPTS );
ctrl.current_hostservices_command =
ctrl.hostservices.posted = MTC_CMD_START_WORKER_SVCS ;
ctrl.hostservices.monitor = MTC_CMD_NONE ;
ilog ("scheduling %s", get_mtcNodeCommand_str(ctrl.hostservices.posted));
ctrl.start_worker_hostservices = false ;
}
if (( ctrl.start_storage_hostservices ) &&
( ctrl.posted_script_set.empty()) &&
( ctrl.active_script_set == NO_SCRIPTS ))
{
ctrl.posted_script_set.push_front ( HOSTSERVICES_SCRIPTS );
ctrl.current_hostservices_command =
ctrl.hostservices.posted = MTC_CMD_START_STORAGE_SVCS ;
ctrl.hostservices.monitor = MTC_CMD_NONE ;
ilog ("scheduling %s", get_mtcNodeCommand_str(ctrl.hostservices.posted));
ctrl.start_storage_hostservices = false ;
}
daemon_signal_hdlr ();
}
@ -2246,7 +2281,6 @@ int run_hostservices_scripts ( unsigned int cmd )
{
ilog ("no service scripts\n");
ctrl.hostservices.scripts = 0 ;
_manage_services_scripts ();
ctrl.active_script_set = NO_SCRIPTS ;
return (PASS);
}

View File

@ -134,6 +134,10 @@ typedef struct
/* Start/Stop Hosts Services execution control timing and completion status */
script_ctrl_type hostservices ;
int current_hostservices_command = MTC_CMD_NONE ;
bool start_controller_hostservices = false ;
bool start_worker_hostservices = false ;
bool start_storage_hostservices = false ;
/* The script set that is executing */
script_set_enum active_script_set ;