Revert "Revert "Cleanup down ports""
This reverts commit 7e1b8a7261.
openstacksdk >=0.19.0 fixes the filtering problems that led to all
ports being deleted. However, openstacksdk <0.21.0 has problems with
dogpile.cache, so require 0.21.0 as the minimum.
Change-Id: Id642d074cbb645ced5342dda4a1c89987c91a8fc
parent e459ffa0fd
commit 0cf8144e8c
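
For context: the port cleanup in this change depends on openstacksdk honouring
the server-side status filter when listing ports. Below is an illustrative
sketch of that call, not code from this change; the cloud name is a placeholder
for an entry in clouds.yaml.

    # Illustrative sketch only. 'example' is a placeholder cloud name
    # taken from clouds.yaml, not something defined by this change.
    import openstack

    conn = openstack.connect(cloud='example')

    # With openstacksdk >= 0.19.0 the status filter is applied correctly and
    # only DOWN ports come back; with the earlier filtering bug the call could
    # effectively return every port, which made automatic deletion unsafe.
    down_ports = conn.list_ports(filters={'status': 'DOWN'})
    for port in down_ports:
        print(port.id, port.status, port.device_owner)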
@@ -329,6 +329,12 @@ Nodepool launcher
   * ready
   * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge
 
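As a rough sketch of how an increment of this documented counter reaches
statsd: the host, port, and provider name below are assumed placeholder
values, and the real emission happens inside the driver's cleanup routine
shown later in this diff.

    # Rough sketch only; host, port and provider name are assumed values.
    import statsd

    client = statsd.StatsClient('localhost', 8125)
    provider_name = 'some-provider'
    client.incr('nodepool.provider.%s.downPorts' % provider_name, 2)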
@@ -32,6 +32,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
            _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")
@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
 
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -428,6 +433,21 @@ class OpenStackProvider(Provider):
             **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
 
@@ -455,7 +475,7 @@ class OpenStackProvider(Provider):
             self.log.debug('Deleting server %s' % server_id)
             self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
 
@@ -503,6 +523,63 @@ class OpenStackProvider(Provider):
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed between cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
         if self.provider.clean_floating_ips:
             self._client.delete_unattached_floating_ips()
 
@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
 
         while self.zk.client.exists(path):
             time.sleep(.1)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')
releasenotes/notes/port-cleanup-667d476437f03358.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.
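The "seen as DOWN for at least three minutes" rule amounts to intersecting two
snapshots of DOWN port IDs taken one cleanup interval apart. The helper below
is a condensed sketch of that idea with assumed names, not code from this
change; the real implementation is cleanupLeakedPorts in the diff above.

    # Condensed sketch (assumed names): a port counts as leaked only if its
    # ID is DOWN in two listings taken more than `interval` seconds apart.
    def leaked_down_ports(previous_down_ids, current_down_ids,
                          elapsed_secs, interval=180):
        if elapsed_secs <= interval:
            return set()  # not enough time between snapshots yet
        return set(previous_down_ids) & set(current_down_ids)

    # Example: '2b' was DOWN three minutes ago and is still DOWN now.
    print(leaked_down_ports({'1a', '2b'}, {'2b', '3c'}, elapsed_secs=200))
    # -> {'2b'}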
@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
 extras
 statsd>=3.0
 PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
 diskimage-builder>=2.0.0
 voluptuous
 kazoo