Fix node exclusive lock not released on console start/restart.
This patch forces kill of console process with SIGKILL if it did not terminate on SIGTERM within the CONF.console.kill_timeout and reads the shellinabox subprocess stdout/stderr after CONF.console.subprocess_timeout or if subprocess exited with non zero code. Change-Id: I55a112d877d94f31d27487846ff59fe27f602f8b Closes-Bug: 1587313 Story: 1587313 Task: 9654
This commit is contained in:
parent
37f272436f
commit
0256e1e69f
@ -45,6 +45,10 @@ opts = [
|
||||
default=10,
|
||||
help=_('Time (in seconds) to wait for the console subprocess '
|
||||
'to start.')),
|
||||
cfg.IntOpt('kill_timeout',
|
||||
default=1,
|
||||
help=_('Time (in seconds) to wait for the shellinabox console '
|
||||
'subprocess to exit before sending SIGKILL signal.')),
|
||||
cfg.IPOpt('socat_address',
|
||||
default='$my_ip',
|
||||
help=_('IP address of Socat service running on the host of '
|
||||
|
@ -101,6 +101,23 @@ def _stop_console(node_uuid):
|
||||
console_pid = _get_console_pid(node_uuid)
|
||||
|
||||
os.kill(console_pid, signal.SIGTERM)
|
||||
|
||||
# make sure the process gets killed hard if required
|
||||
attempt = 0
|
||||
max_attempts = CONF.console.kill_timeout // 0.2
|
||||
|
||||
while attempt < max_attempts:
|
||||
if psutil.pid_exists(console_pid):
|
||||
if attempt == max_attempts - 1:
|
||||
os.kill(console_pid, signal.SIGKILL)
|
||||
LOG.debug("Waiting for the console process with PID %(pid)s "
|
||||
"to exit. Node: %(node)s.",
|
||||
{'pid': console_pid, 'node': node_uuid})
|
||||
time.sleep(0.2)
|
||||
attempt += 1
|
||||
else:
|
||||
break
|
||||
|
||||
except OSError as exc:
|
||||
if exc.errno != errno.ESRCH:
|
||||
msg = (_("Could not stop the console for node '%(node)s'. "
|
||||
@ -198,17 +215,20 @@ def start_shellinabox_console(node_uuid, port, console_cmd):
|
||||
# check if the console pid is created and the process is running.
|
||||
# if it is, then the shellinaboxd is invoked successfully as a daemon.
|
||||
# otherwise check the error.
|
||||
if locals['returncode'] is not None:
|
||||
if (locals['returncode'] == 0 and os.path.exists(pid_file)
|
||||
and psutil.pid_exists(_get_console_pid(node_uuid))):
|
||||
raise loopingcall.LoopingCallDone()
|
||||
else:
|
||||
|
||||
if (time.time() > expiration or locals['returncode'] is not None):
|
||||
(stdout, stderr) = popen_obj.communicate()
|
||||
locals['errstr'] = _(
|
||||
"Timeout or error while waiting for console "
|
||||
"subprocess to start for node: %(node)s.\n"
|
||||
"Command: %(command)s.\n"
|
||||
"Exit code: %(return_code)s.\n"
|
||||
"Stdout: %(stdout)r\n"
|
||||
"Stderr: %(stderr)r") % {
|
||||
'node': node_uuid,
|
||||
'command': ' '.join(args),
|
||||
'return_code': locals['returncode'],
|
||||
'stdout': stdout,
|
||||
@ -216,13 +236,6 @@ def start_shellinabox_console(node_uuid, port, console_cmd):
|
||||
LOG.warning(locals['errstr'])
|
||||
raise loopingcall.LoopingCallDone()
|
||||
|
||||
if (time.time() > expiration):
|
||||
locals['errstr'] = (_("Timeout while waiting for console "
|
||||
"subprocess to start for node %s.") %
|
||||
node_uuid)
|
||||
LOG.warning(locals['errstr'])
|
||||
raise loopingcall.LoopingCallDone()
|
||||
|
||||
locals = {'returncode': None, 'errstr': ''}
|
||||
expiration = time.time() + CONF.console.subprocess_timeout
|
||||
timer = loopingcall.FixedIntervalLoopingCall(_wait, node_uuid, obj)
|
||||
|
@ -145,8 +145,28 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
||||
console_utils._stop_console(self.info['uuid'])
|
||||
|
||||
mock_pid.assert_called_once_with(self.info['uuid'])
|
||||
mock_kill.assert_called_once_with(mock_pid.return_value,
|
||||
signal.SIGTERM)
|
||||
|
||||
# a check if process still exist (signal 0) in a loop
|
||||
mock_kill.assert_any_call(mock_pid.return_value, signal.SIG_DFL)
|
||||
# and that it receives the SIGTERM
|
||||
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGTERM)
|
||||
mock_unlink.assert_called_once_with(pid_file)
|
||||
|
||||
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
|
||||
@mock.patch.object(os, 'kill', autospec=True)
|
||||
@mock.patch.object(psutil, 'pid_exists', autospec=True, return_value=True)
|
||||
@mock.patch.object(console_utils, '_get_console_pid', autospec=True)
|
||||
def test__stop_console_forced_kill(self, mock_pid, mock_psutil, mock_kill,
|
||||
mock_unlink):
|
||||
pid_file = console_utils._get_console_pid_file(self.info['uuid'])
|
||||
mock_pid.return_value = 12345
|
||||
|
||||
console_utils._stop_console(self.info['uuid'])
|
||||
|
||||
mock_pid.assert_called_once_with(self.info['uuid'])
|
||||
|
||||
# Make sure console process receives hard SIGKILL
|
||||
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGKILL)
|
||||
mock_unlink.assert_called_once_with(pid_file)
|
||||
|
||||
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
|
||||
@ -322,7 +342,7 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
||||
mock_popen.assert_called_once_with(mock.ANY,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
mock_popen.return_value.poll.assert_called_once_with()
|
||||
mock_popen.return_value.poll.assert_called_with()
|
||||
|
||||
@mock.patch.object(console_utils, 'open',
|
||||
mock.mock_open(read_data='12345\n'))
|
||||
@ -350,11 +370,11 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
||||
|
||||
mock_stop.assert_called_once_with(self.info['uuid'])
|
||||
mock_dir_exists.assert_called_once_with()
|
||||
mock_pid_exists.assert_called_once_with(12345)
|
||||
mock_pid_exists.assert_called_with(12345)
|
||||
mock_popen.assert_called_once_with(mock.ANY,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
mock_popen.return_value.poll.assert_called_once_with()
|
||||
mock_popen.return_value.poll.assert_called_with()
|
||||
|
||||
@mock.patch.object(subprocess, 'Popen', autospec=True)
|
||||
@mock.patch.object(console_utils, '_ensure_console_pid_dir_exists',
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes a locking issue where ``ipmitool-shellinabox`` console
|
||||
interface users may encounter a situation where the bare metal node is
|
||||
locked until the conductor is restarted.
|
||||
See story `1587313 <https://storyboard.openstack.org/#!/story/1587313>`_
|
||||
for additional information.
|
Loading…
Reference in New Issue
Block a user