proxy-server: fix node error limiting

If the proxy-server gets an error (e.g. timeout) while reading data
from a backend node during an object GET then it will increment the
node's error limiting counter. If the node's error counter reaches a
threshold then the node is 'error-limited' i.e. not used for a period
of time.

When a read from a node fails the proxy will also attempt to resume
the read by searching for another useful node to replace the failed
node. Previously, the outcome of this search for a replacement node
erroneously determined whether the failed node's error counter was
incremented or not: the error counter was only incremented if a useful
replacement node was found.

For example, if a read from the last useful primary node failed, and
all handoff nodes then returned 404, the last primary node would not
have its error counter incremented (for no good reason). However, if a
useful replacement handoff node was found, then the last primary
node's error counter would be incremented.

This patch fixes the inconsistent and confusing handling of node
errors by ensuring that the node error limiting counter is *always*
incremented when reading from a node fails, *before* the search for a
replacement node.

Note: The Related-Change fixed a bug whereby the replacement node's
error counter was incremented rather than the failed node's error
counter. This patch addresses the inconsistent incrementing of the
failed node's error counter.

Related-Change: Ibd405b79820cedaf9cb02906b8c13c8d34f4d7ec
Change-Id: I938ff7ed7bcbfb363f55821cccd92763dfce449d
This commit is contained in:
Alistair Coles 2023-08-04 14:06:25 +01:00
parent dce403f0fb
commit 01c7ade798
2 changed files with 35 additions and 25 deletions

View File

@ -1122,18 +1122,15 @@ class GetterBase(object):
:return: ``True`` if ``self.source`` has been updated, ``False``
otherwise.
"""
# Subclasses must implement this method
# Subclasses must implement this method, but _replace_source should be
# called to get a source installed
raise NotImplementedError()
def _replace_source(self, err_msg):
# _find_source can modify self.source so stash current source
old_source = self.source
if not self._find_source():
return False
self.app.error_occurred(old_source.node, err_msg)
old_source.close()
return True
def _replace_source(self, err_msg=''):
if self.source:
self.app.error_occurred(self.source.node, err_msg)
self.source.close()
return self._find_source()
def fast_forward(self, num_bytes):
"""
@ -1609,7 +1606,7 @@ class GetOrHeadHandler(GetterBase):
def get_working_response(self, req):
res = None
if self._find_source():
if self._replace_source():
res = Response(request=req)
res.status = self.source.resp.status
update_headers(res, self.source.resp.getheaders())

View File

@ -4538,12 +4538,14 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
self.assertEqual(resp.status_int, 500)
self.assertEqual(len(log), self.policy.ec_n_unique_fragments * 2)
log_lines = self.app.logger.get_lines_for_level('error')
self.assertEqual(2, len(log_lines), log_lines)
self.assertIn('Trying to read during GET: ChunkReadTimeout',
self.assertEqual(3, len(log_lines), log_lines)
self.assertIn('Trying to read next part of EC multi-part GET',
log_lines[0])
self.assertIn('Trying to read during GET: ChunkReadTimeout',
log_lines[1])
# not the most graceful ending
self.assertIn('Unhandled exception in request: ChunkReadTimeout',
log_lines[1])
log_lines[2])
def test_GET_with_multirange_short_resume_body(self):
self.app.object_chunk_size = 256
@ -5082,12 +5084,17 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
md5(resp.body, usedforsecurity=False).hexdigest(),
etag)
error_lines = self.logger.get_lines_for_level('error')
nparity = self.policy.ec_nparity
self.assertGreater(len(error_lines), nparity)
for line in error_lines[:nparity]:
self.assertIn('retrying', line)
for line in error_lines[nparity:]:
self.assertIn('ChunkReadTimeout (0.01s', line)
# all primaries timeout and get error limited
error_limit_lines = [
line for line in error_lines
if 'Trying to read EC fragment during GET (retrying)' in line]
self.assertEqual(self.policy.ec_n_unique_fragments,
len(error_limit_lines))
# all ec_ndata frag getters eventually get a read timeout
read_timeout_lines = [
line for line in error_lines if 'ChunkReadTimeout (0.01s' in line]
self.assertEqual(self.policy.ec_ndata,
len(read_timeout_lines))
for line in self.logger.logger.records['ERROR']:
self.assertIn(req.headers['x-trans-id'], line)
@ -5170,11 +5177,17 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
# resume but won't be able to give us all the right bytes
self.assertNotEqual(md5(resp.body).hexdigest(), etag)
error_lines = self.logger.get_lines_for_level('error')
self.assertEqual(ndata, len(error_lines))
for line in error_lines:
self.assertIn('ChunkReadTimeout (0.01s', line)
for line in self.logger.logger.records['ERROR']:
self.assertIn(req.headers['x-trans-id'], line)
# only ec_ndata primaries that timeout get error limited (404 or
# different etag primaries do not get error limited)
error_limit_lines = [
line for line in error_lines
if 'Trying to read EC fragment during GET (retrying)' in line]
self.assertEqual(self.policy.ec_ndata, len(error_limit_lines))
# all ec_ndata frag getters eventually get a read timeout
read_timeout_lines = [
line for line in error_lines if 'ChunkReadTimeout (0.01s' in line]
self.assertEqual(self.policy.ec_ndata,
len(read_timeout_lines))
debug_lines = self.logger.get_lines_for_level('debug')
nparity = self.policy.ec_nparity