Use crm first node to reschedule agents.

This commit is contained in:
Hui Xiang 2014-12-22 17:24:32 +08:00
parent d4648152f1
commit 1b187c661b

View File

@ -12,6 +12,7 @@ cleaned resources on failed nodes.
import os import os
import signal import signal
import sys import sys
import socket
import subprocess import subprocess
import time import time
@ -113,13 +114,29 @@ class MonitorNeutronAgentsDaemon(Daemon):
def get_hostname(self):
    """Return this machine's hostname, looking it up once and caching it.

    The result is stored on ``self.hostname``; the lookup only runs while
    that attribute is falsy.
    """
    if not self.hostname:
        # The stdlib function is socket.gethostname(); the previous
        # spelling ``socket.get_hostname`` does not exist and raised
        # AttributeError on the first call.
        self.hostname = socket.gethostname()
    return self.hostname
def get_root_helper(self):
    """Return the command prefix used to run commands as root."""
    root_helper = 'sudo'
    return root_helper
def list_nodes(self):
    """Return the node entries printed by ``crm node list``.

    Every non-blank output line contributes the text that precedes its
    first ':'. Failures to run ``crm`` (``OSError`` or
    ``subprocess.CalledProcessError``) are not caught here and propagate
    to the caller.
    """
    cmd = ['crm', 'node', 'list']
    # universal_newlines=True makes check_output return text instead of
    # bytes, so the line splitting below works the same on Python 2 and 3
    # (str() of a bytes object would yield its "b'...'" repr).
    out = subprocess.check_output(cmd, universal_newlines=True)
    nodes = []
    for raw_line in out.splitlines():
        line = raw_line.strip()
        if line:
            nodes.append(line.split(':')[0])
    return nodes
def get_crm_no_1_node(self):
    """Return the name of the first node from ``self.list_nodes()``.

    Anything from the first '(' onwards is dropped from the entry; if
    that leaves an empty string the unmodified entry is returned instead.
    Returns None (after logging an error) when the node list is empty.
    """
    candidates = self.list_nodes()
    if not candidates:
        LOG.error('Failed to get crm node list.')
        return None

    first_entry = candidates[0]
    bare_name = first_entry.split('(')[0]
    if bare_name:
        return bare_name
    return first_entry
def unplug_device(self, device): def unplug_device(self, device):
try: try:
device.link.delete() device.link.delete()
@ -134,6 +151,10 @@ class MonitorNeutronAgentsDaemon(Daemon):
else: else:
LOG.debug('Unable to find bridge for device: %s', device.name) LOG.debug('Unable to find bridge for device: %s', device.name)
def cleanup(self):
    """Run the dhcp cleanup and then the router cleanup, each with None."""
    for purge in (self.cleanup_dhcp, self.cleanup_router):
        purge(None)
def cleanup_dhcp(self, networks): def cleanup_dhcp(self, networks):
namespaces = [] namespaces = []
if networks: if networks:
@ -186,38 +207,46 @@ class MonitorNeutronAgentsDaemon(Daemon):
def is_same_host(self, host):
    """Return True when ``host`` names the same machine as ``get_hostname()``.

    ``host`` is converted to a string and stripped of surrounding
    whitespace before the comparison.
    """
    candidate = str(host).strip()
    local_name = self.get_hostname()
    return candidate == local_name
def validate_reschedule(self):
    """Tell whether this host is allowed to reschedule agents.

    Returns True only when ``get_crm_no_1_node()`` yields a node and
    ``is_same_host()`` accepts it; otherwise logs the reason and returns
    False.
    """
    crm_no_1_node = self.get_crm_no_1_node()
    if not crm_no_1_node:
        LOG.error('No crm first node could be found.')
        return False

    if not self.is_same_host(crm_no_1_node):
        # Previously spelled LOG.warnning, which raised AttributeError
        # on every host that is not the first crm node.
        LOG.warning('Only the first crm node %s could reschedule. ',
                    crm_no_1_node)
        return False

    return True
def l3_agents_reschedule(self, l3_agents, routers, quantum):
    """Move the given routers round-robin onto the given l3 agents.

    :param l3_agents: sequence of target agents; each element is passed
        as-is as the ``l3_agent`` argument of ``add_router_to_l3_agent``.
    :param routers: mapping of router id to the agent the router is
        removed from.
    :param quantum: client providing ``remove_router_from_l3_agent`` and
        ``add_router_to_l3_agent``.

    Does nothing unless ``validate_reschedule()`` returns True.
    """
    if not self.validate_reschedule():
        return

    if not l3_agents:
        # Without a target the modulo below would raise
        # ZeroDivisionError; there is nowhere to move the routers to.
        LOG.warning('No l3 agent available to reschedule routers to.')
        return

    agent_count = len(l3_agents)
    for index, router_id in enumerate(routers):
        target = l3_agents[index % agent_count]
        LOG.info('Moving router %s from %s to %s',
                 router_id, routers[router_id], target)
        quantum.remove_router_from_l3_agent(l3_agent=routers[router_id],
                                            router_id=router_id)
        quantum.add_router_to_l3_agent(l3_agent=target,
                                       body={'router_id': router_id})
def dhcp_agents_reschedule(self, dhcp_agents, networks, quantum):
    """Move the given networks round-robin onto the given dhcp agents.

    :param dhcp_agents: sequence of target agents; each element is passed
        as-is as the ``dhcp_agent`` argument of
        ``add_network_to_dhcp_agent``.
    :param networks: mapping of network id to the agent the network is
        removed from.
    :param quantum: client providing ``remove_network_from_dhcp_agent``
        and ``add_network_to_dhcp_agent``.

    Does nothing unless ``validate_reschedule()`` returns True.
    """
    if not self.validate_reschedule():
        return

    if not dhcp_agents:
        # Without a target the modulo below would raise
        # ZeroDivisionError; there is nowhere to move the networks to.
        LOG.warning('No dhcp agent available to reschedule networks to.')
        return

    agent_count = len(dhcp_agents)
    for index, network_id in enumerate(networks):
        target = dhcp_agents[index % agent_count]
        LOG.info('Moving network %s from %s to %s',
                 network_id, networks[network_id], target)
        quantum.remove_network_from_dhcp_agent(
            dhcp_agent=networks[network_id], network_id=network_id)
        quantum.add_network_to_dhcp_agent(
            dhcp_agent=target,
            body={'network_id': network_id})
@ -243,7 +272,15 @@ class MonitorNeutronAgentsDaemon(Daemon):
auth_url=auth_url, auth_url=auth_url,
region_name=env['region']) region_name=env['region'])
agents = quantum.list_agents(agent_type=DHCP_AGENT) try:
agents = quantum.list_agents(agent_type=DHCP_AGENT)
except Exception:
self.cleanup()
LOG.error('Failed to get neutron agent list,'
'might be network lost connection,'
'clean up neutron resources.')
return
dhcp_agents = [] dhcp_agents = []
l3_agents = [] l3_agents = []
networks = {} networks = {}
@ -257,7 +294,7 @@ class MonitorNeutronAgentsDaemon(Daemon):
if self.is_same_host(agent['host']): if self.is_same_host(agent['host']):
self.cleanup_dhcp(networks) self.cleanup_dhcp(networks)
else: else:
dhcp_agents.append(agent) dhcp_agents.append(agent['id'])
LOG.info('Active dhcp agents: %s' % agent['id']) LOG.info('Active dhcp agents: %s' % agent['id'])
if not hosted_networks and self.is_same_host(agent['host']): if not hosted_networks and self.is_same_host(agent['host']):
self.cleanup_dhcp(None) self.cleanup_dhcp(None)
@ -274,7 +311,7 @@ class MonitorNeutronAgentsDaemon(Daemon):
if self.is_same_host(agent['host']): if self.is_same_host(agent['host']):
self.cleanup_router(routers) self.cleanup_router(routers)
else: else:
l3_agents.append(agent) l3_agents.append(agent['id'])
LOG.info('Active l3 agents: %s' % agent['id']) LOG.info('Active l3 agents: %s' % agent['id'])
if not hosted_routers and self.is_same_host(agent['host']): if not hosted_routers and self.is_same_host(agent['host']):
self.cleanup_router(None) self.cleanup_router(None)