Fix signal handling for daemons with InternalClient

The intentional use of "bare except" handling in catch_errors and some
daemons to prevent propagation on unexpected errors that do not
inherit from Exception (like eventlet.Timeout) or even BaseException
(like old-style classes) has the side effect of spuriously "handling"
*expected* errors like when a signal handler raises SystemExit.

The signal handler installed in our Daemon is intended to ensure first
that the entire process group and any forked processes (like rsync's)
receive the SIGTERM signal and also that the process itself
terminates.

The use of sys.exit was not a concious grandiose plans for graceful
shutdown (like the running[0] = False trick that wsgi server parent
process do) - the desired behavior for SIGTERM is to stop - hard.

This change ensures the original goals and intentions of our signal
handler are fulfilled without the undesirable side effect that can
cause our daemons to confusingly log an expected message to stop as an
unexpected error, and start ignoring additional SIGTERM messages;
forcing our kind operators to resort to brutal process murder.

Closes-Bug: #1489209
Change-Id: I9d2886611f6db2498cd6a8f81a58f2a611f40905
This commit is contained in:
Clay Gerrard 2016-10-11 13:23:11 -07:00
parent be1cd1ba40
commit c2ce92acd6
3 changed files with 76 additions and 9 deletions

View File

@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
import sys
import time import time
import signal import signal
from re import sub from re import sub
@ -46,9 +45,10 @@ class Daemon(object):
utils.capture_stdio(self.logger, **kwargs) utils.capture_stdio(self.logger, **kwargs)
def kill_children(*args): def kill_children(*args):
self.logger.info('SIGTERM received')
signal.signal(signal.SIGTERM, signal.SIG_IGN) signal.signal(signal.SIGTERM, signal.SIG_IGN)
os.killpg(0, signal.SIGTERM) os.killpg(0, signal.SIGTERM)
sys.exit() os._exit(0)
signal.signal(signal.SIGTERM, kill_children) signal.signal(signal.SIGTERM, kill_children)
if once: if once:

View File

@ -17,6 +17,9 @@
import unittest import unittest
import random import random
from contextlib import contextmanager
import eventlet
from six.moves import http_client as httplib from six.moves import http_client as httplib
@ -101,5 +104,52 @@ class TestWSGIServerProcessHandling(unittest.TestCase):
self._check_reload(server, node['ip'], node['port']) self._check_reload(server, node['ip'], node['port'])
@contextmanager
def spawn_services(ip_ports, timeout=10):
q = eventlet.Queue()
def service(sock):
try:
conn, address = sock.accept()
q.put(address)
eventlet.sleep(timeout)
conn.close()
finally:
sock.close()
pool = eventlet.GreenPool()
for ip, port in ip_ports:
sock = eventlet.listen((ip, port))
pool.spawn(service, sock)
try:
yield q
finally:
for gt in list(pool.coroutines_running):
gt.kill()
class TestHungDaemon(unittest.TestCase):
def setUp(self):
resetswift()
self.ip_ports = [
(dev['ip'], dev['port'])
for dev in Ring('/etc/swift', ring_name='account').devs
if dev
]
def test_main(self):
reconciler = Manager(['container-reconciler'])
with spawn_services(self.ip_ports) as q:
reconciler.start()
# wait for the reconciler to connect
q.get()
# once it's hung in our connection - send it sig term
print('Attempting to stop reconciler!')
reconciler.stop()
self.assertEqual(1, reconciler.status())
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -22,7 +22,8 @@ import unittest
from getpass import getuser from getpass import getuser
import logging import logging
from test.unit import tmpfile from test.unit import tmpfile
from mock import patch import mock
import signal
from swift.common import daemon, utils from swift.common import daemon, utils
@ -83,10 +84,26 @@ class TestRunDaemon(unittest.TestCase):
d.run(once=True) d.run(once=True)
self.assertEqual(d.once_called, True) self.assertEqual(d.once_called, True)
def test_signal(self):
d = MyDaemon({})
with mock.patch('swift.common.daemon.signal') as mock_signal:
mock_signal.SIGTERM = signal.SIGTERM
d.run()
signal_args, kwargs = mock_signal.signal.call_args
sig, func = signal_args
self.assertEqual(sig, signal.SIGTERM)
with mock.patch('swift.common.daemon.os') as mock_os:
func()
self.assertEqual(mock_os.method_calls, [
mock.call.killpg(0, signal.SIGTERM),
# hard exit because bare except handlers can trap SystemExit
mock.call._exit(0)
])
def test_run_daemon(self): def test_run_daemon(self):
sample_conf = "[my-daemon]\nuser = %s\n" % getuser() sample_conf = "[my-daemon]\nuser = %s\n" % getuser()
with tmpfile(sample_conf) as conf_file: with tmpfile(sample_conf) as conf_file:
with patch.dict('os.environ', {'TZ': ''}): with mock.patch.dict('os.environ', {'TZ': ''}):
daemon.run_daemon(MyDaemon, conf_file) daemon.run_daemon(MyDaemon, conf_file)
self.assertEqual(MyDaemon.forever_called, True) self.assertEqual(MyDaemon.forever_called, True)
self.assertTrue(os.environ['TZ'] is not '') self.assertTrue(os.environ['TZ'] is not '')
@ -94,17 +111,17 @@ class TestRunDaemon(unittest.TestCase):
self.assertEqual(MyDaemon.once_called, True) self.assertEqual(MyDaemon.once_called, True)
# test raise in daemon code # test raise in daemon code
MyDaemon.run_once = MyDaemon.run_raise with mock.patch.object(MyDaemon, 'run_once', MyDaemon.run_raise):
self.assertRaises(OSError, daemon.run_daemon, MyDaemon, self.assertRaises(OSError, daemon.run_daemon, MyDaemon,
conf_file, once=True) conf_file, once=True)
# test user quit # test user quit
MyDaemon.run_forever = MyDaemon.run_quit
sio = StringIO() sio = StringIO()
logger = logging.getLogger('server') logger = logging.getLogger('server')
logger.addHandler(logging.StreamHandler(sio)) logger.addHandler(logging.StreamHandler(sio))
logger = utils.get_logger(None, 'server', log_route='server') logger = utils.get_logger(None, 'server', log_route='server')
daemon.run_daemon(MyDaemon, conf_file, logger=logger) with mock.patch.object(MyDaemon, 'run_forever', MyDaemon.run_quit):
daemon.run_daemon(MyDaemon, conf_file, logger=logger)
self.assertTrue('user quit' in sio.getvalue().lower()) self.assertTrue('user quit' in sio.getvalue().lower())