From df35680b673eabe0b664f17663f2272711b2e5b4 Mon Sep 17 00:00:00 2001 From: Sean Dague Date: Fri, 9 Jan 2015 11:27:55 -0500 Subject: [PATCH] add watchdog module This provides the watchdog.watch function which is a contextmanager that can be used before doing operations that have a high risk of deadlocking or aberrantly taking huge amounts of time to complete (which could be the root cause for other issues). Unit tests require the use of sleep to make the timers pop either before or after the event in question. There is 1 second buffer that's encoded into each scenario which should be sufficient for unit tests (even in busy systems). If that turns into an issue later, the buffer can be increased (I was only seeing real skew of 1-2 ms locally). Change-Id: I79e39f8a4aa702627ca39bea3e6fc672134b8a72 --- doc/source/api/index.rst | 1 + doc/source/api/watchdog.rst | 8 ++++ oslo_concurrency/watchdog.py | 66 +++++++++++++++++++++++++++++++ tests/test_watchdog.py | 75 ++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+) create mode 100644 doc/source/api/watchdog.rst create mode 100644 oslo_concurrency/watchdog.py create mode 100644 tests/test_watchdog.py diff --git a/doc/source/api/index.rst b/doc/source/api/index.rst index 33aadc4..2ee39d8 100644 --- a/doc/source/api/index.rst +++ b/doc/source/api/index.rst @@ -5,3 +5,4 @@ lockutils opts processutils + watchdog diff --git a/doc/source/api/watchdog.rst b/doc/source/api/watchdog.rst new file mode 100644 index 0000000..feccaeb --- /dev/null +++ b/doc/source/api/watchdog.rst @@ -0,0 +1,8 @@ +================================== + :mod:`oslo_concurrency.watchdog` +================================== + +.. automodule:: oslo_concurrency.watchdog + :members: + :undoc-members: + :show-inheritance: diff --git a/oslo_concurrency/watchdog.py b/oslo_concurrency/watchdog.py new file mode 100644 index 0000000..972aaa9 --- /dev/null +++ b/oslo_concurrency/watchdog.py @@ -0,0 +1,66 @@ +# Copyright (c) 2015 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import contextlib +import logging +import threading +import time + + +@contextlib.contextmanager +def watch(logger, action, level=logging.DEBUG, after=5.0): + """Log a message if an operation exceeds a time threshold. + + This context manager is expected to be used when you are going to + do an operation in code which might either deadlock or take an + extraordinary amount of time, and you'd like to emit a status + message back to the user that the operation is still ongoing but + has not completed in an expected amount of time. This is more user + friendly than logging 'start' and 'end' events and making users + correlate the events to figure out they ended up in a deadlock. + + :param logger: an object that complies to the logger definition + (has a .log method). + + :param action: a meaningful string that describes the thing you + are about to do. + + :param level: the logging level the message should be emitted + at. Defaults to logging.DEBUG. + + :param after: the duration in seconds before the message is + emitted. Defaults to 5.0 seconds. + + Example usage:: + + FORMAT = '%(asctime)-15s %(message)s' + logging.basicConfig(format=FORMAT) + LOG = logging.getLogger('mylogger') + + with watchdog.watch(LOG, "subprocess call", logging.ERROR): + subprocess.call("sleep 10", shell=True) + print "done" + + """ + start = time.time() + + def log(): + msg = "%s not completed after %0.3fs" % (action, time.time() - start) + logger.log(level, msg) + timer = threading.Timer(after, log) + timer.start() + try: + yield + finally: + timer.cancel() diff --git a/tests/test_watchdog.py b/tests/test_watchdog.py new file mode 100644 index 0000000..8bbacc9 --- /dev/null +++ b/tests/test_watchdog.py @@ -0,0 +1,75 @@ +# Copyright (c) 2015 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import logging +import subprocess +import time + +import fixtures +from oslotest import base as test_base + +from oslo_concurrency import watchdog + +LOG_FORMAT = '%(levelname)s %(message)s' + + +class WatchdogTest(test_base.BaseTestCase): + def setUp(self): + super(WatchdogTest, self).setUp() + # capture the log bits where we can interrogate them + self.logger = logging.getLogger() + self.logger.setLevel(logging.DEBUG) + self.log = self.useFixture( + fixtures.FakeLogger(format=LOG_FORMAT, level=None) + ) + + def test_in_process_delay(self): + with watchdog.watch(self.logger, "in process", after=1.0): + time.sleep(2) + self.assertIn("DEBUG in process not completed after 1", + self.log.output) + loglines = self.log.output.rstrip().split("\n") + self.assertEqual(1, len(loglines), loglines) + + def test_level_setting(self): + with watchdog.watch(self.logger, "in process", + level=logging.ERROR, after=1.0): + time.sleep(2) + self.assertIn("ERROR in process not completed after 1", + self.log.output) + loglines = self.log.output.rstrip().split("\n") + self.assertEqual(1, len(loglines), loglines) + + def test_in_process_delay_no_message(self): + with watchdog.watch(self.logger, "in process", after=1.0): + pass + # wait long enough to know there won't be a message emitted + time.sleep(2) + self.assertEqual('', self.log.output) + + def test_in_process_exploding(self): + try: + with watchdog.watch(self.logger, "ungraceful exit", after=1.0): + raise Exception() + except Exception: + pass + # wait long enough to know there won't be a message emitted + time.sleep(2) + self.assertEqual('', self.log.output) + + def test_subprocess_delay(self): + with watchdog.watch(self.logger, "subprocess", after=0.1): + subprocess.call("sleep 2", shell=True) + self.assertIn("DEBUG subprocess not completed after 0", + self.log.output)