Kill misbehaving ipmitool
process
We can't trust ipmitool to terminate in time. We may have to kill the process if it's running for longer than we asked it to. On the other hand, abrupt IPMI exchange termination is said to be dangerous to the state of the BMC being managed. Therefore this patch only kills timed out IPMI "power status" call. For the purpose of killing hung `ipmitool` we inject the time-capped `popen.wait` call before the uncapped `popen.communicate` is called internally. Then just kill stuck `ipmitool` process and go on. Story: 2004449 Task: 28127 Change-Id: I7e1eafb334fe3a3337926aca27c14fe559ce0e39
This commit is contained in:
parent
1cdc13c61e
commit
9efb9e313d
@ -35,6 +35,12 @@ opts = [
|
|||||||
'sent to a server. There is a risk with some hardware '
|
'sent to a server. There is a risk with some hardware '
|
||||||
'that setting this too low may cause the BMC to crash. '
|
'that setting this too low may cause the BMC to crash. '
|
||||||
'Recommended setting is 5 seconds.')),
|
'Recommended setting is 5 seconds.')),
|
||||||
|
cfg.BoolOpt('kill_on_timeout',
|
||||||
|
default=True,
|
||||||
|
help=_('Kill `ipmitool` process invoked by ironic to read '
|
||||||
|
'node power state if `ipmitool` process does not exit '
|
||||||
|
'after `command_retry_timeout` timeout expires. '
|
||||||
|
'Recommended setting is True')),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,6 +30,7 @@ DRIVER.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import functools
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -370,12 +371,41 @@ def _parse_driver_info(node):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
def _exec_ipmitool_wait(timeout, driver_info, popen_obj):
|
||||||
|
wait_interval = min(timeout, 0.5)
|
||||||
|
|
||||||
|
while timeout >= 0:
|
||||||
|
if not popen_obj.poll():
|
||||||
|
return
|
||||||
|
|
||||||
|
time.sleep(wait_interval)
|
||||||
|
timeout -= wait_interval
|
||||||
|
|
||||||
|
LOG.warning('Killing timed out IPMI process "%(cmd)s" for node %(node)s.',
|
||||||
|
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||||
|
|
||||||
|
popen_obj.terminate()
|
||||||
|
time.sleep(0.5)
|
||||||
|
if popen_obj.poll():
|
||||||
|
popen_obj.kill()
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if popen_obj.poll():
|
||||||
|
LOG.warning('Could not kill IPMI process "%(cmd)s" for node %(node)s.',
|
||||||
|
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||||
|
|
||||||
|
|
||||||
|
def _exec_ipmitool(driver_info, command, check_exit_code=None,
|
||||||
|
kill_on_timeout=False):
|
||||||
"""Execute the ipmitool command.
|
"""Execute the ipmitool command.
|
||||||
|
|
||||||
:param driver_info: the ipmitool parameters for accessing a node.
|
:param driver_info: the ipmitool parameters for accessing a node.
|
||||||
:param command: the ipmitool command to be executed.
|
:param command: the ipmitool command to be executed.
|
||||||
:param check_exit_code: Single bool, int, or list of allowed exit codes.
|
:param check_exit_code: Single bool, int, or list of allowed exit codes.
|
||||||
|
:param kill_on_timeout: if `True`, kill unresponsive ipmitool on
|
||||||
|
`min_command_interval` timeout. Default is `False`. Makes no
|
||||||
|
effect on Windows.
|
||||||
:returns: (stdout, stderr) from executing the command.
|
:returns: (stdout, stderr) from executing the command.
|
||||||
:raises: PasswordFileFailedToCreate from creating or writing to the
|
:raises: PasswordFileFailedToCreate from creating or writing to the
|
||||||
temporary file.
|
temporary file.
|
||||||
@ -418,6 +448,21 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
|||||||
args.append('-N')
|
args.append('-N')
|
||||||
args.append(str(CONF.ipmi.min_command_interval))
|
args.append(str(CONF.ipmi.min_command_interval))
|
||||||
|
|
||||||
|
extra_args = {}
|
||||||
|
|
||||||
|
if kill_on_timeout:
|
||||||
|
# NOTE(etingof): We can't trust ipmitool to terminate in time.
|
||||||
|
# Therefore we have to kill it if it is running for longer than
|
||||||
|
# we asked it to.
|
||||||
|
# For that purpose we inject the time-capped `popen.wait` call
|
||||||
|
# before the uncapped `popen.communicate` is called internally.
|
||||||
|
# That gives us a chance to kill misbehaving `ipmitool` child.
|
||||||
|
extra_args['on_execute'] = functools.partial(
|
||||||
|
_exec_ipmitool_wait, timeout, driver_info)
|
||||||
|
|
||||||
|
if check_exit_code is not None:
|
||||||
|
extra_args['check_exit_code'] = check_exit_code
|
||||||
|
|
||||||
end_time = (time.time() + timeout)
|
end_time = (time.time() + timeout)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@ -431,9 +476,6 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
|||||||
# Resetting the list that will be utilized so the password arguments
|
# Resetting the list that will be utilized so the password arguments
|
||||||
# from any previous execution are preserved.
|
# from any previous execution are preserved.
|
||||||
cmd_args = args[:]
|
cmd_args = args[:]
|
||||||
extra_args = {}
|
|
||||||
if check_exit_code is not None:
|
|
||||||
extra_args['check_exit_code'] = check_exit_code
|
|
||||||
# 'ipmitool' command will prompt password if there is no '-f'
|
# 'ipmitool' command will prompt password if there is no '-f'
|
||||||
# option, we set it to '\0' to write a password file to support
|
# option, we set it to '\0' to write a password file to support
|
||||||
# empty password
|
# empty password
|
||||||
@ -559,7 +601,8 @@ def _power_status(driver_info):
|
|||||||
"""
|
"""
|
||||||
cmd = "power status"
|
cmd = "power status"
|
||||||
try:
|
try:
|
||||||
out_err = _exec_ipmitool(driver_info, cmd)
|
out_err = _exec_ipmitool(
|
||||||
|
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||||
except (exception.PasswordFileFailedToCreate,
|
except (exception.PasswordFileFailedToCreate,
|
||||||
processutils.ProcessExecutionError) as e:
|
processutils.ProcessExecutionError) as e:
|
||||||
LOG.warning("IPMI power status failed for node %(node_id)s with "
|
LOG.warning("IPMI power status failed for node %(node_id)s with "
|
||||||
@ -1044,7 +1087,8 @@ class IPMIManagement(base.ManagementInterface):
|
|||||||
# extended sensor informations
|
# extended sensor informations
|
||||||
cmd = "sdr -v"
|
cmd = "sdr -v"
|
||||||
try:
|
try:
|
||||||
out, err = _exec_ipmitool(driver_info, cmd)
|
out, err = _exec_ipmitool(
|
||||||
|
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||||
except (exception.PasswordFileFailedToCreate,
|
except (exception.PasswordFileFailedToCreate,
|
||||||
processutils.ProcessExecutionError) as e:
|
processutils.ProcessExecutionError) as e:
|
||||||
raise exception.FailedToGetSensorData(node=task.node.uuid,
|
raise exception.FailedToGetSensorData(node=task.node.uuid,
|
||||||
|
@ -996,6 +996,14 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
mock_support.assert_called_once_with('timing')
|
mock_support.assert_called_once_with('timing')
|
||||||
mock_exec.assert_called_once_with(*args)
|
mock_exec.assert_called_once_with(*args)
|
||||||
|
|
||||||
|
def test__exec_ipmitool_wait(self):
|
||||||
|
mock_popen = mock.MagicMock()
|
||||||
|
mock_popen.poll.side_effect = [1, 1, 1, 1, 1]
|
||||||
|
ipmi._exec_ipmitool_wait(1, {'uuid': ''}, mock_popen)
|
||||||
|
|
||||||
|
self.assertTrue(mock_popen.terminate.called)
|
||||||
|
self.assertTrue(mock_popen.kill.called)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_is_option_supported', autospec=True)
|
@mock.patch.object(ipmi, '_is_option_supported', autospec=True)
|
||||||
@mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub)
|
@mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub)
|
||||||
@mock.patch.object(utils, 'execute', autospec=True)
|
@mock.patch.object(utils, 'execute', autospec=True)
|
||||||
@ -1267,7 +1275,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.POWER_ON, state)
|
self.assertEqual(states.POWER_ON, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
@ -1276,7 +1285,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.POWER_OFF, state)
|
self.assertEqual(states.POWER_OFF, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
@ -1285,7 +1295,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.ERROR, state)
|
self.assertEqual(states.ERROR, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
@ -1294,14 +1305,15 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
self.assertRaises(exception.IPMIFailure,
|
self.assertRaises(exception.IPMIFailure,
|
||||||
ipmi._power_status,
|
ipmi._power_status,
|
||||||
self.info)
|
self.info)
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__power_on_max_retries(self, sleep_mock, mock_exec):
|
def test__power_on_max_retries(self, sleep_mock, mock_exec):
|
||||||
self.config(command_retry_timeout=2, group='ipmi')
|
self.config(command_retry_timeout=2, group='ipmi')
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||||
"power on": [None, None]}
|
"power on": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
@ -1309,8 +1321,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power on"),
|
expected = [mock.call(self.info, "power on"),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
self.assertRaises(exception.PowerStateFailure,
|
self.assertRaises(exception.PowerStateFailure,
|
||||||
@ -1322,7 +1334,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__soft_power_off(self, sleep_mock, mock_exec):
|
def test__soft_power_off(self, sleep_mock, mock_exec):
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||||
"power soft": [None, None]}
|
"power soft": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
@ -1330,7 +1342,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power soft"),
|
expected = [mock.call(self.info, "power soft"),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
state = ipmi._soft_power_off(task, self.info)
|
state = ipmi._soft_power_off(task, self.info)
|
||||||
@ -1342,7 +1354,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__soft_power_off_max_retries(self, sleep_mock, mock_exec):
|
def test__soft_power_off_max_retries(self, sleep_mock, mock_exec):
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is on\n", None],
|
resp_dict = {"power status": ["Chassis Power is on\n", None],
|
||||||
"power soft": [None, None]}
|
"power soft": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
@ -1350,8 +1362,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power soft"),
|
expected = [mock.call(self.info, "power soft"),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
self.assertRaises(exception.PowerStateFailure,
|
self.assertRaises(exception.PowerStateFailure,
|
||||||
@ -1402,9 +1414,9 @@ class IPMIToolDriverTestCase(Base):
|
|||||||
returns = iter([["Chassis Power is off\n", None],
|
returns = iter([["Chassis Power is off\n", None],
|
||||||
["Chassis Power is on\n", None],
|
["Chassis Power is on\n", None],
|
||||||
["\n", None]])
|
["\n", None]])
|
||||||
expected = [mock.call(self.info, "power status"),
|
expected = [mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
mock_exec.side_effect = returns
|
mock_exec.side_effect = returns
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
@ -1426,7 +1438,8 @@ class IPMIToolDriverTestCase(Base):
|
|||||||
self.assertRaises(exception.IPMIFailure,
|
self.assertRaises(exception.IPMIFailure,
|
||||||
self.power.get_power_state,
|
self.power.get_power_state,
|
||||||
task)
|
task)
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_power_on', autospec=True)
|
@mock.patch.object(ipmi, '_power_on', autospec=True)
|
||||||
@mock.patch.object(ipmi, '_power_off', autospec=True)
|
@mock.patch.object(ipmi, '_power_off', autospec=True)
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Kill ``ipmitool`` process invoked by ironic to read node's power state if
|
||||||
|
``ipmitool`` process does not exit after configured timeout expires. It
|
||||||
|
appears pretty common for ``ipmitool`` to run for five minutes (with
|
||||||
|
current ironic defauls) once it hits a non-responsive bare metal node.
|
||||||
|
This could slow down the management of other nodes due periodic tasks
|
||||||
|
slots exhaustion. The new behaviour could is enabled by default, but
|
||||||
|
could be disabled via the ``[ipmi]kill_on_timeout`` ironic configuration
|
||||||
|
option.
|
Loading…
Reference in New Issue
Block a user