Add ready endpoint to webapp

When running nodepool launchers in kubernetes a common method to
update nodepool or its config is doing rolling restarts. The process
for this is to start a new nodepool, wait for it to be ready and then
tear down the old instance. Currently this is not possible without
risking node_failures when there is only one instance serving a
label. The reason for this is that there is no reliable way to
determine when the new instance is fully started, which could lead to
tearing down the old instance too early. This would result in
node_failures for all in-flight node requests that are only valid for
this provider.

Adding a /ready endpoint to the webapp can make this deterministic
using readiness checks of kubernetes.

Change-Id: I53e77f3d8aaa4742ce2a89c1179e8563f850270e
This commit is contained in:
Tobias Henkel 2019-11-19 14:23:43 +01:00
parent 2a3d4f842b
commit f7f0821e98
5 changed files with 52 additions and 0 deletions

View File

@ -394,6 +394,12 @@ launchers, all will provide the same information.
:resheader Content-Type: ``application/json`` or ``text/plain``
depending on the :http:header:`Accept` header
.. http:get:: /ready
Responds with status code 200 as soon as all configured providers are fully
started. During startup it returns 503 (Service Unavailable). This can be
used as a readiness probe in a kubernetes based deployment.
Monitoring
----------

View File

@ -881,6 +881,7 @@ class NodePool(threading.Thread):
self._delete_thread = None
self._stats_thread = None
self._submittedRequests = {}
self.ready = False
def stop(self):
self._stopped = True
@ -1155,4 +1156,8 @@ class NodePool(threading.Thread):
except Exception:
self.log.exception("Exception in main loop:")
# At this point all providers are registered and fully functional
# so we can mark nodepool as ready.
self.ready = True
self._stop_event.wait(self.watermark_sleep)

View File

@ -17,9 +17,11 @@ import json
import logging
import yaml
from urllib import request
from urllib.error import HTTPError
from nodepool import tests
from nodepool import zk
from nodepool.nodeutils import iterate_timeout
class TestWebApp(tests.DBTestCase):
@ -248,3 +250,30 @@ class TestWebApp(tests.DBTestCase):
config = yaml.safe_load(open(configfile))
self.assertEqual(config['webapp']['port'], 8080)
self.assertEqual(config['webapp']['listen_address'], '127.0.0.1')
def test_webapp_ready(self):
    """Check that /ready fails before the pool runs and returns OK after.

    The webapp is started first while the pool is still stopped, so the
    first request must be rejected with an HTTP error. Once the pool has
    been started the endpoint is polled until it answers successfully.
    """
    configfile = self.setup_config('node.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    webapp = self.useWebApp(pool, port=0)
    webapp.start()
    # port=0 was requested above, so read back the port actually bound.
    port = webapp.server.socket.getsockname()[1]

    # Query ready endpoint before the pool has been started. We expect
    # an error in this case.
    req = request.Request("http://localhost:%s/ready" % port)
    # Use the plain callable form: combining callable arguments with a
    # ``with`` statement relies on the return value of assertRaises being
    # a context manager, which the stdlib implementation does not provide.
    self.assertRaises(HTTPError, request.urlopen, req)

    pool.start()

    # Now wait until we get a valid response.
    data = None
    for _ in iterate_timeout(30, Exception, 'ready succeeds'):
        try:
            resp = request.urlopen(req)
        except HTTPError:
            # Not ready yet; keep polling until the timeout expires.
            continue
        # Close the response deterministically instead of leaking it.
        with resp:
            data = resp.read()
        break

    self.assertEqual(data, b"OK")

View File

@ -75,6 +75,13 @@ class WebApp(threading.Thread):
self.server.server_close()
def get_cache(self, path, params, request_type):
# At first process ready request as this doesn't need caching.
if path == '/ready':
if not self.nodepool.ready:
raise webob.exc.HTTPServiceUnavailable()
else:
return time.time(), 'OK'
# TODO quick and dirty way to take query parameters
# into account when caching data
if params:

View File

@ -0,0 +1,5 @@
---
features:
- |
There is a new :http:get:`/ready` endpoint that can be used as a readiness
probe.