Implement a --timeout for killing stalled scripts
This implements the feature required for bug #1595722, but the tripleo bug will not be closed until os-refresh-config is invoked with an appropriate --timeout value.

Change-Id: Ibcbb2090aed126abec8dac49efa53ecbdb2b9b2c
DependsOn: If31f0d5d60e8585720c4c9c95cffa202f059f6f7
Partial-Bug: #1595722
This commit is contained in:
parent
d0cf563c2d
commit
1d828fa0dd
@ -18,10 +18,13 @@ import argparse
|
|||||||
import fcntl
|
import fcntl
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
OLD_BASE_DIR = '/opt/stack/os-config-refresh'
|
OLD_BASE_DIR = '/opt/stack/os-config-refresh'
|
||||||
DEFAULT_BASE_DIR = '/usr/libexec/os-refresh-config'
|
DEFAULT_BASE_DIR = '/usr/libexec/os-refresh-config'
|
||||||
|
|
||||||
@ -55,6 +58,21 @@ PHASES = ['pre-configure',
|
|||||||
'migration']
|
'migration']
|
||||||
|
|
||||||
|
|
||||||
|
def timeout():
    """Kill every descendant process of the current process.

    Called from the SIGALRM handler that main() installs when --timeout is
    given.  Descendants are enumerated recursively, so processes spawned by
    the phase scripts themselves are killed too, not just direct children.

    A child that has already exited by the time it is killed is ignored:
    the process tree can change between enumeration and kill(), and an
    exception raised here would propagate out of the signal handler.
    """
    current = psutil.Process()
    for child in current.get_children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            # Child went away on its own between the snapshot and kill();
            # there is nothing left to terminate.
            pass
|
||||||
|
|
||||||
|
|
||||||
|
def exit(lock, statuscode=0):
    """Disarm the timeout alarm, release the lock file and return a status.

    :param lock: open lock-file object, or a falsy value when there is no
                 lock to release.
    :param statuscode: value handed back to the caller unchanged.
    :returns: ``statuscode``.
    """
    # Cancel any pending SIGALRM so it cannot fire once the run is over.
    signal.alarm(0)

    if not lock:
        return statuscode

    # Empty the lock file before closing it.
    lock.truncate(0)
    lock.close()
    return statuscode
|
||||||
|
|
||||||
|
|
||||||
def main(argv=sys.argv):
|
def main(argv=sys.argv):
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""Runs through all of the phases to ensure
|
description="""Runs through all of the phases to ensure
|
||||||
@ -72,6 +90,10 @@ def main(argv=sys.argv):
|
|||||||
parser.add_argument('--lockfile',
|
parser.add_argument('--lockfile',
|
||||||
default='/var/run/os-refresh-config.lock',
|
default='/var/run/os-refresh-config.lock',
|
||||||
help='Lock file to prevent multiple running copies.')
|
help='Lock file to prevent multiple running copies.')
|
||||||
|
parser.add_argument('--timeout',
|
||||||
|
type=int,
|
||||||
|
help='Seconds until the current run will be '
|
||||||
|
'terminated.')
|
||||||
options = parser.parse_args(argv[1:])
|
options = parser.parse_args(argv[1:])
|
||||||
|
|
||||||
if options.print_base:
|
if options.print_base:
|
||||||
@ -101,6 +123,15 @@ def main(argv=sys.argv):
|
|||||||
lock.truncate(0)
|
lock.truncate(0)
|
||||||
lock.write("Locked by pid==%d at %s\n" % (os.getpid(), time.localtime()))
|
lock.write("Locked by pid==%d at %s\n" % (os.getpid(), time.localtime()))
|
||||||
|
|
||||||
|
def timeout_handler(signum, frame):
|
||||||
|
log.error('Timeout reached: %ss. Sending SIGKILL to all children' %
|
||||||
|
options.timeout)
|
||||||
|
timeout()
|
||||||
|
|
||||||
|
if options.timeout:
|
||||||
|
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
|
signal.alarm(options.timeout)
|
||||||
|
|
||||||
for phase in PHASES:
|
for phase in PHASES:
|
||||||
phase_dir = os.path.join(BASE_DIR, '%s.d' % phase)
|
phase_dir = os.path.join(BASE_DIR, '%s.d' % phase)
|
||||||
log.debug('Checking %s' % phase_dir)
|
log.debug('Checking %s' % phase_dir)
|
||||||
@ -124,13 +155,11 @@ def main(argv=sys.argv):
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
log.error("Aborting...")
|
log.error("Aborting...")
|
||||||
return 1
|
return exit(lock, 1)
|
||||||
else:
|
else:
|
||||||
log.debug('No dir for phase %s' % phase)
|
log.debug('No dir for phase %s' % phase)
|
||||||
|
|
||||||
lock.truncate(0)
|
return exit(lock)
|
||||||
lock.close()
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -116,7 +116,6 @@ exit %(returncode)s
|
|||||||
self._write_script('pre-configure', '20-pre-second', 99)
|
self._write_script('pre-configure', '20-pre-second', 99)
|
||||||
self._write_script('configure', '10-conf-first', 0)
|
self._write_script('configure', '10-conf-first', 0)
|
||||||
returncode, stdout, stderr = self._run_orc()
|
returncode, stdout, stderr = self._run_orc()
|
||||||
print(stderr)
|
|
||||||
self.assertEqual('\n'.join([
|
self.assertEqual('\n'.join([
|
||||||
'10-pre-first starting',
|
'10-pre-first starting',
|
||||||
'10-pre-first done',
|
'10-pre-first done',
|
||||||
@ -126,6 +125,22 @@ exit %(returncode)s
|
|||||||
]), stdout)
|
]), stdout)
|
||||||
self.assertEqual(1, returncode)
|
self.assertEqual(1, returncode)
|
||||||
|
|
||||||
|
def test_cmd_with_timeout(self):
    """Run with ``--timeout 2`` and expect the run to be cut short.

    Only the first script's "starting" line should reach stdout and the
    command should exit with status 1.
    """
    # NOTE(review): the trailing 5 is presumably a per-script sleep in
    # seconds -- confirm against _write_script.
    scripts = (
        ('pre-configure', '10-pre-first'),
        ('pre-configure', '20-pre-second'),
        ('configure', '10-conf-first'),
    )
    for phase, script_name in scripts:
        self._write_script(phase, script_name, 0, 5)

    started = time.time()
    returncode, stdout, _stderr = self._run_orc(
        '--timeout', '2', '--log-level', 'DEBUG')
    elapsed = time.time() - started

    # check run time accounts for the 2 seconds timeout
    self.assertTrue(elapsed >= 2.0)

    expected_stdout = '\n'.join(['10-pre-first starting', ''])
    self.assertEqual(expected_stdout, stdout)
    self.assertEqual(1, returncode)
|
||||||
|
|
||||||
def test_debug(self):
|
def test_debug(self):
|
||||||
returncode, stdout, stderr = self._run_orc('--log-level', 'DEBUG')
|
returncode, stdout, stderr = self._run_orc('--log-level', 'DEBUG')
|
||||||
self.assertEqual('', stdout)
|
self.assertEqual('', stdout)
|
||||||
|
@ -3,3 +3,4 @@
|
|||||||
# process, which may cause wedges in the gate later.
|
# process, which may cause wedges in the gate later.
|
||||||
pbr>=1.6 # Apache-2.0
|
pbr>=1.6 # Apache-2.0
|
||||||
dib-utils # Apache-2.0
|
dib-utils # Apache-2.0
|
||||||
|
psutil>=1.1.1,<2.0.0 # BSD
|
||||||
|
Loading…
Reference in New Issue
Block a user