add watchdog module

This provides the watchdog.watch function which is a contextmanager
that can be used before doing operations that have a high risk of
deadlocking or aberrantly taking huge amounts of time to complete
(which could be the root cause for other issues).

Unit tests require the use of sleep to make the timers pop either
before or after the event in question. There is 1 second buffer that's
encoded into each scenario which should be sufficient for unit tests
(even in busy systems). If that turns into an issue later, the buffer
can be increased (I was only seeing real skew of 1-2 ms locally).

Change-Id: I79e39f8a4aa702627ca39bea3e6fc672134b8a72
This commit is contained in:
Sean Dague 2015-01-09 11:27:55 -05:00
parent 18bcbe2ef8
commit df35680b67
4 changed files with 150 additions and 0 deletions

View File

@ -5,3 +5,4 @@
lockutils
opts
processutils
watchdog

View File

@ -0,0 +1,8 @@
==================================
:mod:`oslo_concurrency.watchdog`
==================================
.. automodule:: oslo_concurrency.watchdog
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,66 @@
# Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import contextlib
import logging
import threading
import time
@contextlib.contextmanager
def watch(logger, action, level=logging.DEBUG, after=5.0):
"""Log a message if an operation exceeds a time threshold.
This context manager is expected to be used when you are going to
do an operation in code which might either deadlock or take an
extraordinary amount of time, and you'd like to emit a status
message back to the user that the operation is still ongoing but
has not completed in an expected amount of time. This is more user
friendly than logging 'start' and 'end' events and making users
correlate the events to figure out they ended up in a deadlock.
:param logger: an object that complies to the logger definition
(has a .log method).
:param action: a meaningful string that describes the thing you
are about to do.
:param level: the logging level the message should be emitted
at. Defaults to logging.DEBUG.
:param after: the duration in seconds before the message is
emitted. Defaults to 5.0 seconds.
Example usage::
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
LOG = logging.getLogger('mylogger')
with watchdog.watch(LOG, "subprocess call", logging.ERROR):
subprocess.call("sleep 10", shell=True)
print "done"
"""
start = time.time()
def log():
msg = "%s not completed after %0.3fs" % (action, time.time() - start)
logger.log(level, msg)
timer = threading.Timer(after, log)
timer.start()
try:
yield
finally:
timer.cancel()

75
tests/test_watchdog.py Normal file
View File

@ -0,0 +1,75 @@
# Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import subprocess
import time
import fixtures
from oslotest import base as test_base
from oslo_concurrency import watchdog
LOG_FORMAT = '%(levelname)s %(message)s'
class WatchdogTest(test_base.BaseTestCase):
def setUp(self):
super(WatchdogTest, self).setUp()
# capture the log bits where we can interrogate them
self.logger = logging.getLogger()
self.logger.setLevel(logging.DEBUG)
self.log = self.useFixture(
fixtures.FakeLogger(format=LOG_FORMAT, level=None)
)
def test_in_process_delay(self):
with watchdog.watch(self.logger, "in process", after=1.0):
time.sleep(2)
self.assertIn("DEBUG in process not completed after 1",
self.log.output)
loglines = self.log.output.rstrip().split("\n")
self.assertEqual(1, len(loglines), loglines)
def test_level_setting(self):
with watchdog.watch(self.logger, "in process",
level=logging.ERROR, after=1.0):
time.sleep(2)
self.assertIn("ERROR in process not completed after 1",
self.log.output)
loglines = self.log.output.rstrip().split("\n")
self.assertEqual(1, len(loglines), loglines)
def test_in_process_delay_no_message(self):
with watchdog.watch(self.logger, "in process", after=1.0):
pass
# wait long enough to know there won't be a message emitted
time.sleep(2)
self.assertEqual('', self.log.output)
def test_in_process_exploding(self):
try:
with watchdog.watch(self.logger, "ungraceful exit", after=1.0):
raise Exception()
except Exception:
pass
# wait long enough to know there won't be a message emitted
time.sleep(2)
self.assertEqual('', self.log.output)
def test_subprocess_delay(self):
with watchdog.watch(self.logger, "subprocess", after=0.1):
subprocess.call("sleep 2", shell=True)
self.assertIn("DEBUG subprocess not completed after 0",
self.log.output)