add watchdog module

This provides the watchdog.watch function which is a contextmanager that can be used before doing operations that have a high risk of deadlocking or aberrantly taking huge amounts of time to complete (which could be the root cause for other issues). Unit tests require the use of sleep to make the timers pop either before or after the event in question. There is 1 second buffer that's encoded into each scenario which should be sufficient for unit tests (even in busy systems). If that turns into an issue later, the buffer can be increased (I was only seeing real skew of 1-2 ms locally). Change-Id: I79e39f8a4aa702627ca39bea3e6fc672134b8a72
2015-01-09 11:27:55 -05:00 · 2015-01-09 11:27:55 -05:00 · df35680b67
commit df35680b67
parent 18bcbe2ef8
4 changed files with 150 additions and 0 deletions
--- a/doc/source/api/index.rst
+++ b/doc/source/api/index.rst
@ -5,3 +5,4 @@
   lockutils
   opts
   processutils
+   watchdog
--- a/doc/source/api/watchdog.rst
+++ b/doc/source/api/watchdog.rst
@ -0,0 +1,8 @@
+==================================
+ :mod:`oslo_concurrency.watchdog`
+==================================
+
+.. automodule:: oslo_concurrency.watchdog
+  :members:
+  :undoc-members:
+  :show-inheritance:
--- a/oslo_concurrency/watchdog.py
+++ b/oslo_concurrency/watchdog.py
@ -0,0 +1,66 @@
+# Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import contextlib
+import logging
+import threading
+import time
+
+
+@contextlib.contextmanager
+def watch(logger, action, level=logging.DEBUG, after=5.0):
+    """Log a message if an operation exceeds a time threshold.
+
+    This context manager is expected to be used when you are going to
+    do an operation in code which might either deadlock or take an
+    extraordinary amount of time, and you'd like to emit a status
+    message back to the user that the operation is still ongoing but
+    has not completed in an expected amount of time. This is more user
+    friendly than logging 'start' and 'end' events and making users
+    correlate the events to figure out they ended up in a deadlock.
+
+    :param logger: an object that complies to the logger definition
+      (has a .log method).
+
+    :param action: a meaningful string that describes the thing you
+      are about to do.
+
+    :param level: the logging level the message should be emitted
+      at. Defaults to logging.DEBUG.
+
+    :param after: the duration in seconds before the message is
+      emitted. Defaults to 5.0 seconds.
+
+    Example usage::
+
+        FORMAT = '%(asctime)-15s %(message)s'
+        logging.basicConfig(format=FORMAT)
+        LOG = logging.getLogger('mylogger')
+
+        with watchdog.watch(LOG, "subprocess call", logging.ERROR):
+            subprocess.call("sleep 10", shell=True)
+            print "done"
+
+    """
+    start = time.time()
+
+    def log():
+        msg = "%s not completed after %0.3fs" % (action, time.time() - start)
+        logger.log(level, msg)
+    timer = threading.Timer(after, log)
+    timer.start()
+    try:
+        yield
+    finally:
+        timer.cancel()
--- a/tests/test_watchdog.py
+++ b/tests/test_watchdog.py
@ -0,0 +1,75 @@
+# Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import logging
+import subprocess
+import time
+
+import fixtures
+from oslotest import base as test_base
+
+from oslo_concurrency import watchdog
+
+LOG_FORMAT = '%(levelname)s %(message)s'
+
+
+class WatchdogTest(test_base.BaseTestCase):
+    def setUp(self):
+        super(WatchdogTest, self).setUp()
+        # capture the log bits where we can interrogate them
+        self.logger = logging.getLogger()
+        self.logger.setLevel(logging.DEBUG)
+        self.log = self.useFixture(
+            fixtures.FakeLogger(format=LOG_FORMAT, level=None)
+        )
+
+    def test_in_process_delay(self):
+        with watchdog.watch(self.logger, "in process", after=1.0):
+            time.sleep(2)
+        self.assertIn("DEBUG in process not completed after 1",
+                      self.log.output)
+        loglines = self.log.output.rstrip().split("\n")
+        self.assertEqual(1, len(loglines), loglines)
+
+    def test_level_setting(self):
+        with watchdog.watch(self.logger, "in process",
+                            level=logging.ERROR, after=1.0):
+            time.sleep(2)
+        self.assertIn("ERROR in process not completed after 1",
+                      self.log.output)
+        loglines = self.log.output.rstrip().split("\n")
+        self.assertEqual(1, len(loglines), loglines)
+
+    def test_in_process_delay_no_message(self):
+        with watchdog.watch(self.logger, "in process", after=1.0):
+            pass
+        # wait long enough to know there won't be a message emitted
+        time.sleep(2)
+        self.assertEqual('', self.log.output)
+
+    def test_in_process_exploding(self):
+        try:
+            with watchdog.watch(self.logger, "ungraceful exit", after=1.0):
+                raise Exception()
+        except Exception:
+            pass
+        # wait long enough to know there won't be a message emitted
+        time.sleep(2)
+        self.assertEqual('', self.log.output)
+
+    def test_subprocess_delay(self):
+        with watchdog.watch(self.logger, "subprocess", after=0.1):
+            subprocess.call("sleep 2", shell=True)
+        self.assertIn("DEBUG subprocess not completed after 0",
+                      self.log.output)