Be more specific in logging timeout exceptions
At the moment, grepping through logs to determine what's happening with timeouts on a provider is difficult because for some errors the cause of the timeout is on a different line than the provider in question. Give each timeout a specific named exception, and then when we catch the exceptions, log them specifically with node id, provider and then the additional descriptive text from the timeout exception. This should allow for easy grepping through logs to find specific instances of types of timeouts - or of all timeouts. Also add a corresponding success debug log so that comparitive greps/counts are also easy. Change-Id: I889bd9b5d92f77ce9ff86415c775fe1cd9545bbc
This commit is contained in:
parent
536f7feab0
commit
eed395d637
@ -23,3 +23,19 @@ class BuilderInvalidCommandError(BuilderError):
|
||||
|
||||
class DibFailedError(BuilderError):
|
||||
pass
|
||||
|
||||
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class SSHTimeoutException(TimeoutException):
|
||||
statsd_key = 'error.ssh'
|
||||
|
||||
|
||||
class IPAddTimeoutException(TimeoutException):
|
||||
statsd_key = 'error.ipadd'
|
||||
|
||||
|
||||
class ServerDeleteException(TimeoutException):
|
||||
statsd_key = 'error.serverdelete'
|
||||
|
@ -34,6 +34,7 @@ import zmq
|
||||
import allocation
|
||||
import jenkins_manager
|
||||
import nodedb
|
||||
import exceptions
|
||||
import nodeutils as utils
|
||||
import provider_manager
|
||||
import stats
|
||||
@ -407,6 +408,19 @@ class NodeLauncher(threading.Thread):
|
||||
dt = self.launchNode(session)
|
||||
failed = False
|
||||
statsd_key = 'ready'
|
||||
self.log.debug('Node %s ready in provider: %s' %
|
||||
(self.node_id, self.provider.name))
|
||||
except exceptions.TimeoutException as e:
|
||||
# Don't log exception for timeouts. Each one has
|
||||
# a specific Exception, and we know it's a timeout, so
|
||||
# the traceback in the log is just noise
|
||||
self.log.error("Timeout launching node id: %s "
|
||||
"in provider: %s error: %s" %
|
||||
(self.node_id, self.provider.name,
|
||||
str(e)))
|
||||
dt = int((time.time() - start_time) * 1000)
|
||||
failed = True
|
||||
statsd_key = e.statsd_key
|
||||
except Exception as e:
|
||||
self.log.exception("%s launching node id: %s "
|
||||
"in provider: %s error:" %
|
||||
|
@ -25,6 +25,8 @@ from sshclient import SSHClient
|
||||
import fakeprovider
|
||||
import paramiko
|
||||
|
||||
import exceptions
|
||||
|
||||
log = logging.getLogger("nodepool.utils")
|
||||
|
||||
|
||||
@ -32,14 +34,14 @@ ITERATE_INTERVAL = 2 # How long to sleep while waiting for something
|
||||
# in a loop
|
||||
|
||||
|
||||
def iterate_timeout(max_seconds, purpose):
|
||||
def iterate_timeout(max_seconds, exc, purpose):
|
||||
start = time.time()
|
||||
count = 0
|
||||
while (time.time() < start + max_seconds):
|
||||
count += 1
|
||||
yield count
|
||||
time.sleep(ITERATE_INTERVAL)
|
||||
raise Exception("Timeout waiting for %s" % purpose)
|
||||
raise exc("Timeout waiting for %s" % purpose)
|
||||
|
||||
|
||||
def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
|
||||
@ -47,7 +49,8 @@ def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
|
||||
return fakeprovider.FakeSSHClient()
|
||||
# HPcloud may return ECONNREFUSED or EHOSTUNREACH
|
||||
# for about 30 seconds after adding the IP
|
||||
for count in iterate_timeout(timeout, "ssh access"):
|
||||
for count in iterate_timeout(
|
||||
timeout, exceptions.SSHTimeoutException, "ssh access"):
|
||||
try:
|
||||
client = SSHClient(ip, username, **connect_kwargs)
|
||||
break
|
||||
|
@ -28,6 +28,7 @@ import sys
|
||||
import shade
|
||||
import novaclient
|
||||
|
||||
import exceptions
|
||||
from nodeutils import iterate_timeout
|
||||
from task_manager import Task, TaskManager, ManagerStoppedException
|
||||
|
||||
@ -36,6 +37,14 @@ SERVER_LIST_AGE = 5 # How long to keep a cached copy of the server list
|
||||
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
|
||||
|
||||
|
||||
class ServerCreateException(exceptions.TimeoutException):
|
||||
statsd_key = 'error.servertimeout'
|
||||
|
||||
|
||||
class ImageCreateException(exceptions.TimeoutException):
|
||||
statsd_key = 'error.imagetimeout'
|
||||
|
||||
|
||||
def get_public_ip(server, provider, version=4):
|
||||
for addr in server.addresses.get('public', []):
|
||||
if type(addr) == type(u''): # Rackspace/openstack 1.0
|
||||
@ -456,10 +465,12 @@ class ProviderManager(TaskManager):
|
||||
|
||||
def _waitForResource(self, resource_type, resource_id, timeout):
|
||||
last_status = None
|
||||
for count in iterate_timeout(timeout,
|
||||
"%s %s in %s" % (resource_type,
|
||||
resource_id,
|
||||
self.provider.name)):
|
||||
if resource_type == 'server':
|
||||
exc = ServerCreateException
|
||||
elif resource_type == 'image':
|
||||
exc = ImageCreateException
|
||||
for count in iterate_timeout(timeout, exc,
|
||||
"%s creation" % resource_type):
|
||||
try:
|
||||
if resource_type == 'server':
|
||||
resource = self.getServerFromList(resource_id)
|
||||
@ -497,8 +508,9 @@ class ProviderManager(TaskManager):
|
||||
return self._waitForResource('server', server_id, timeout)
|
||||
|
||||
def waitForServerDeletion(self, server_id, timeout=600):
|
||||
for count in iterate_timeout(600, "server %s deletion in %s" %
|
||||
(server_id, self.provider.name)):
|
||||
for count in iterate_timeout(600,
|
||||
exceptions.ServerDeleteException,
|
||||
"server %s deletion " % server_id):
|
||||
try:
|
||||
self.getServerFromList(server_id)
|
||||
except NotFound:
|
||||
@ -528,8 +540,9 @@ class ProviderManager(TaskManager):
|
||||
# from removing this IP.
|
||||
self.deleteFloatingIP(ip['id'])
|
||||
raise
|
||||
for count in iterate_timeout(600, "ip to be added to %s in %s" %
|
||||
(server_id, self.provider.name)):
|
||||
for count in iterate_timeout(600,
|
||||
exceptions.IPAddTimeoutException,
|
||||
"ip to be added to %s" % server_id):
|
||||
try:
|
||||
newip = self.getFloatingIP(ip['id'])
|
||||
except ManagerStoppedException:
|
||||
|
Loading…
x
Reference in New Issue
Block a user