proxy-server: fix node error limiting
If the proxy-server gets an error (e.g. a timeout) while reading data from a
backend node during an object GET, it increments that node's error-limiting
counter. If the node's error counter reaches a threshold, the node is
'error-limited', i.e. not used for a period of time. When a read from a node
fails, the proxy also attempts to resume the read by searching for another
useful node to replace the failed node.

Previously, the outcome of this search for a replacement node erroneously
determined whether the failed node's error counter was incremented or not:
the counter was only incremented if a useful replacement node was found. For
example, if a read from the last useful primary node failed and all handoff
nodes then returned 404, the last primary node would not have its error
counter incremented (for no good reason). However, if a useful replacement
handoff node was found, the last primary node's error counter would be
incremented.

This patch fixes the inconsistent and confusing handling of node errors by
ensuring that the node's error-limiting counter is *always* incremented when
a read from the node fails, *before* the search for a replacement node.

Note: the Related-Change fixed a bug whereby the replacement node's error
counter was incremented rather than the failed node's error counter. This
patch addresses the inconsistent incrementing of the failed node's error
counter.

Related-Change: Ibd405b79820cedaf9cb02906b8c13c8d34f4d7ec
Change-Id: I938ff7ed7bcbfb363f55821cccd92763dfce449d
parent dce403f0fb
commit 01c7ade798
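The behaviour described in the commit message can be summarised with a minimal,
self-contained sketch of an error-limiting counter. The names below
(NodeErrorLimiter, ERROR_THRESHOLD, ERROR_WINDOW) are illustrative only, not
Swift's actual implementation; they just show the increment-then-suppress
pattern the proxy relies on:

import time

ERROR_THRESHOLD = 10   # errors tolerated before a node is suppressed
ERROR_WINDOW = 60      # seconds a node stays error-limited

class NodeErrorLimiter:
    """Illustrative model: count errors per node and suppress noisy nodes."""

    def __init__(self):
        self.errors = {}   # node id -> (error count, time of last error)

    def increment(self, node_id):
        count, _ = self.errors.get(node_id, (0, 0))
        self.errors[node_id] = (count + 1, time.time())

    def is_limited(self, node_id):
        count, last = self.errors.get(node_id, (0, 0))
        if time.time() - last > ERROR_WINDOW:
            return False               # the suppression window has expired
        return count >= ERROR_THRESHOLD

The point of the patch is that increment() must run for the node whose read
failed, every time a read fails, and must not depend on whether a usable
replacement node is subsequently found.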
@@ -1122,18 +1122,15 @@ class GetterBase(object):
         :return: ``True`` if ``self.source`` has been updated, ``False``
             otherwise.
         """
-        # Subclasses must implement this method
+        # Subclasses must implement this method, but _replace_source should be
+        # called to get a source installed
         raise NotImplementedError()
 
-    def _replace_source(self, err_msg):
-        # _find_source can modify self.source so stash current source
-        old_source = self.source
-        if not self._find_source():
-            return False
-
-        self.app.error_occurred(old_source.node, err_msg)
-        old_source.close()
-        return True
+    def _replace_source(self, err_msg=''):
+        if self.source:
+            self.app.error_occurred(self.source.node, err_msg)
+            self.source.close()
+        return self._find_source()
 
     def fast_forward(self, num_bytes):
         """
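The effect of the hunk above can be demonstrated with a stripped-down model of
the getter. FakeApp, FakeSource, OldGetter and NewGetter are stand-ins invented
for this sketch, not Swift classes:

class FakeApp:
    def __init__(self):
        self.errors = []

    def error_occurred(self, node, msg):
        # in the real proxy this logs the error and bumps the node's counter
        self.errors.append((node, msg))


class FakeSource:
    def __init__(self, node):
        self.node = node

    def close(self):
        pass


class OldGetter:
    def __init__(self, app, source, finds_replacement):
        self.app = app
        self.source = source
        self._finds_replacement = finds_replacement

    def _find_source(self):
        return self._finds_replacement

    def _replace_source(self, err_msg):
        # old behaviour: the failed node is only charged with the error if a
        # replacement source is found
        old_source = self.source
        if not self._find_source():
            return False
        self.app.error_occurred(old_source.node, err_msg)
        old_source.close()
        return True


class NewGetter(OldGetter):
    def _replace_source(self, err_msg=''):
        # new behaviour: the failed node is always charged, before the search
        if self.source:
            self.app.error_occurred(self.source.node, err_msg)
            self.source.close()
        return self._find_source()


app = FakeApp()
OldGetter(app, FakeSource('d1'), finds_replacement=False)._replace_source('timeout')
assert app.errors == []                        # failed node escapes the counter

app = FakeApp()
NewGetter(app, FakeSource('d1'), finds_replacement=False)._replace_source('timeout')
assert app.errors == [('d1', 'timeout')]       # failed node is always charged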
@@ -1609,7 +1606,7 @@ class GetOrHeadHandler(GetterBase):
 
     def get_working_response(self, req):
         res = None
-        if self._find_source():
+        if self._replace_source():
             res = Response(request=req)
             res.status = self.source.resp.status
             update_headers(res, self.source.resp.getheaders())
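The one-line change above works because the new _replace_source() degenerates
to _find_source() when there is no current source yet: the "if self.source:"
guard skips the error accounting, so the initial fetch and every later resume
share one code path. Continuing the sketch from the previous hunk (same
hypothetical classes):

getter = NewGetter(FakeApp(), source=None, finds_replacement=True)
assert getter._replace_source() is True     # behaves like _find_source()
assert getter.app.errors == []              # no node is charged on first fetch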
@@ -4538,12 +4538,14 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
         self.assertEqual(resp.status_int, 500)
         self.assertEqual(len(log), self.policy.ec_n_unique_fragments * 2)
         log_lines = self.app.logger.get_lines_for_level('error')
-        self.assertEqual(2, len(log_lines), log_lines)
-        self.assertIn('Trying to read during GET: ChunkReadTimeout',
+        self.assertEqual(3, len(log_lines), log_lines)
+        self.assertIn('Trying to read next part of EC multi-part GET',
                       log_lines[0])
+        self.assertIn('Trying to read during GET: ChunkReadTimeout',
+                      log_lines[1])
         # not the most graceful ending
         self.assertIn('Unhandled exception in request: ChunkReadTimeout',
-                      log_lines[1])
+                      log_lines[2])
 
     def test_GET_with_multirange_short_resume_body(self):
         self.app.object_chunk_size = 256
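The expected error-line count rises from 2 to 3 because the node whose read
failed while fetching the next part is now always reported via error_occurred
before a replacement is sought, so 'Trying to read next part of EC multi-part
GET' appears as an extra error line ahead of the existing ChunkReadTimeout and
unhandled-exception lines. A continuation of the earlier sketch shows the
per-failure accounting (the message strings are taken from the test above; the
flow itself is illustrative):

app = FakeApp()
getter = NewGetter(app, FakeSource('d1'), finds_replacement=True)
getter._replace_source('Trying to read next part of EC multi-part GET')
getter.source = FakeSource('d2')   # pretend the replacement source is in use
getter._replace_source('Trying to read during GET: ChunkReadTimeout')
assert [msg for _, msg in app.errors] == [
    'Trying to read next part of EC multi-part GET',
    'Trying to read during GET: ChunkReadTimeout']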
@@ -5082,12 +5084,17 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
             md5(resp.body, usedforsecurity=False).hexdigest(),
             etag)
         error_lines = self.logger.get_lines_for_level('error')
-        nparity = self.policy.ec_nparity
-        self.assertGreater(len(error_lines), nparity)
-        for line in error_lines[:nparity]:
-            self.assertIn('retrying', line)
-        for line in error_lines[nparity:]:
-            self.assertIn('ChunkReadTimeout (0.01s', line)
+        # all primaries timeout and get error limited
+        error_limit_lines = [
+            line for line in error_lines
+            if 'Trying to read EC fragment during GET (retrying)' in line]
+        self.assertEqual(self.policy.ec_n_unique_fragments,
+                         len(error_limit_lines))
+        # all ec_ndata frag getters eventually get a read timeout
+        read_timeout_lines = [
+            line for line in error_lines if 'ChunkReadTimeout (0.01s' in line]
+        self.assertEqual(self.policy.ec_ndata,
+                         len(read_timeout_lines))
         for line in self.logger.logger.records['ERROR']:
             self.assertIn(req.headers['x-trans-id'], line)
 
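The two filters above separate error-limiting messages from read-timeout
messages: every unique fragment primary ends up error limited, while only the
ec_ndata fragment getters that were actually streaming data hit the
ChunkReadTimeout. With hypothetical 10+4 policy numbers (the test uses whatever
EC policy the mixin configures), the arithmetic looks like this:

ec_ndata, ec_nparity = 10, 4                    # hypothetical EC policy
ec_n_unique_fragments = ec_ndata + ec_nparity   # 14 primaries error limited
assert ec_n_unique_fragments == 14              # expected error-limit lines
assert ec_ndata == 10                           # expected read-timeout lines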
@@ -5170,11 +5177,17 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
         # resume but won't be able to give us all the right bytes
         self.assertNotEqual(md5(resp.body).hexdigest(), etag)
         error_lines = self.logger.get_lines_for_level('error')
-        self.assertEqual(ndata, len(error_lines))
-        for line in error_lines:
-            self.assertIn('ChunkReadTimeout (0.01s', line)
-        for line in self.logger.logger.records['ERROR']:
-            self.assertIn(req.headers['x-trans-id'], line)
+        # only ec_ndata primaries that timeout get error limited (404 or
+        # different etag primaries do not get error limited)
+        error_limit_lines = [
+            line for line in error_lines
+            if 'Trying to read EC fragment during GET (retrying)' in line]
+        self.assertEqual(self.policy.ec_ndata, len(error_limit_lines))
+        # all ec_ndata frag getters eventually get a read timeout
+        read_timeout_lines = [
+            line for line in error_lines if 'ChunkReadTimeout (0.01s' in line]
+        self.assertEqual(self.policy.ec_ndata,
+                         len(read_timeout_lines))
 
         debug_lines = self.logger.get_lines_for_level('debug')
         nparity = self.policy.ec_nparity