Stop the process monitor (pmond) on controlled self-reboot

There are still cases where an AIO SX unlock operation fails to reboot because pmond recovers the mtcClient after the mtcClient has initiated a self-reboot and launched the fail-safe sysreq reset thread. Following a self-reboot, the Process Monitor (pmond) detects an active monitoring failure of the mtcClient. However, at that same time systemctl reports that the system is running degraded, not stopping. As a result, the previous fix to pmon does not detect that the system is stopping and so it restarts the mtcClient, just as before, even though the systemctl state readout itself is valid. This update is a further enhancement for the issue reported by https://bugs.launchpad.net/starlingx/+bug/1883519 with update https://review.opendev.org/#/c/735609 ; it commands the mtcClient to stop pmond, with verification and retries, immediately before a self-reboot. Change-Id: I17fde797803c537f4f448b4764585f1f1acc4e2a Closes-Bug: 1883519 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2020-08-11 11:30:53 -04:00 · 2020-08-11 11:30:53 -04:00 · 7f6cd7ae3a
commit 7f6cd7ae3a
parent 7a0a2dac1a
1 changed files with 52 additions and 0 deletions
--- a/mtce/src/maintenance/mtcCompMsg.cpp
+++ b/mtce/src/maintenance/mtcCompMsg.cpp
@ -55,6 +55,51 @@ extern char *program_invocation_short_name;
 int mtcAlive_mgmnt_sequence = 0 ;
 int mtcAlive_clstr_sequence = 0 ;

+
+/************************************************************************
+ *
+ * Name        : stop_pmon
+ *
+ * Purpose     : Used before issuing the self reboot so that pmond
+ *               does not try and recover any processes,
+ *               most importantly this one which would
+ *               cancel the sysreq failsafe thread.
+ *
+ * Description : Issues 'systemctl stop pmon', sleeps 2 seconds and then
+ *               queries 'systemctl is-active pmon'. This is repeated,
+ *               up to 5 attempts, until the query output contains
+ *               "inactive".
+ *
+ * Returns     : Nothing. This is a best effort operation ; if pmon is
+ *               never confirmed inactive an error is logged and the
+ *               function returns so the caller can carry on.
+ *
+ ************************************************************************/
+void stop_pmon( void )
+{
+    /* max pipe command response length */
+    #define PIPE_COMMAND_RESPON_LEN (100)
+
+    /* max number of stop/verify attempts */
+    const int max_attempts = 5 ;
+
+    /* set once the status query reports pmon as inactive */
+    bool confirmed_inactive = false ;
+
+    ilog("Stopping pmon to prevent process recovery during shutdown");
+    for ( int retry = 0 ; retry < max_attempts ; retry++ )
+    {
+        /* Zero the response buffer on every attempt so that the length
+         * check below never reads indeterminate or stale data if the
+         * status query does not write a response. */
+        char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] = {0} ;
+
+        /* rc is only reported in the logs ; success is decided by the
+         * is-active query below, not by the stop command's status. */
+        int rc = system("/usr/bin/systemctl stop pmon");
+
+        /* give the stop request time to take effect before querying */
+        sleep(2);
+
+        /* confirm pmon is no longer active */
+        execute_pipe_cmd ( "/usr/bin/systemctl is-active pmon", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
+        if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 )
+        {
+            string temp = pipe_cmd_output ;
+
+            /* must match "inactive" ; the shorter "active" would also
+             * match the inactive response */
+            if ( temp.find ("inactive") != string::npos )
+            {
+                ilog("pmon is now inactive (%d:%d)", retry, rc);
+                confirmed_inactive = true ;
+                break ;
+            }
+            else
+            {
+                ilog("pmon is not inactive (%s) ; retrying (%d:%d)",
+                      temp.c_str(), retry, rc);
+            }
+        }
+        else
+        {
+            elog("pmon status query failed ; retrying (%d:%d)", retry, rc);
+        }
+    }
+
+    /* Make it visible in the logs that the caller is proceeding
+     * without confirmation that pmon was stopped. */
+    if ( confirmed_inactive == false )
+    {
+        elog("pmon not confirmed inactive after %d attempts ; proceeding", max_attempts);
+    }
+}
+
 /* Receive and process commands from controller maintenance */
 int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
 {
@ -470,6 +515,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
                ilog ("Reboot - fit bypass (%s)\n", interface_name.c_str());
                return (PASS);
            }
+            stop_pmon();
            ilog ("Reboot (%s)\n", interface_name.c_str());
            daemon_log ( NODE_RESET_FILE, "reboot command" );
            fork_sysreq_reboot ( delay );
@ -500,6 +546,10 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
            {
                ilog ("Lazy Reboot (%s) ; now\n", interface_name.c_str() );
            }
+
+            /* stop pmon before issuing the self reboot so that it does not
+             * try and recover any processes, most importantly this one */
+            stop_pmon();
            fork_sysreq_reboot ( delay );
            rc = system("/usr/bin/systemctl reboot");
        }
@ -510,6 +560,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
                ilog ("Reset - fit bypass (%s)\n", interface_name.c_str());
                return (PASS);
            }
+            stop_pmon();
            ilog ("Reset 'reboot -f' (%s)\n", interface_name.c_str());
            daemon_log ( NODE_RESET_FILE, "reset command" );
            fork_sysreq_reboot ( delay/2 );
@ -527,6 +578,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
            /* We fork a reboot as a fail safe.
             * If something goes wrong we should reboot anyway
             */
+            stop_pmon();
            fork_sysreq_reboot ( delay/2 );

            /* We fork the wipedisk command as it may take upwards of 30s