From 06e69573946403e94c9449b64abc550c8f813bdd Mon Sep 17 00:00:00 2001 From: Hui Xiang Date: Mon, 15 Dec 2014 19:36:46 +0800 Subject: [PATCH] Add: Schedule agents when the execute agent is the first active one. --- files/monitor.py | 80 +++++++++++++++++++++++++++++++++++++---- files/ns_ovs_cleanup.sh | 49 +++++++++++++++++++++---- hooks/quantum_hooks.py | 1 - 3 files changed, 116 insertions(+), 14 deletions(-) diff --git a/files/monitor.py b/files/monitor.py index a0045b54..f1178b62 100644 --- a/files/monitor.py +++ b/files/monitor.py @@ -19,9 +19,12 @@ import signal import sys import time -from oslo.config import cfg import logging as LOG +from oslo.config import cfg +from neutron.agent.linux import ovs_lib +from neutron.agent.linux import ip_lib + class Daemon(object): """A generic daemon class. @@ -110,6 +113,56 @@ class MonitorNeutronAgentsDaemon(Daemon): raise Exception("OpenStack env data uncomplete.") return self.env + def get_hostname(): + return subprocess.check_output(['uname', '-n']) + + def get_root_helper(): + return 'sudo' + + def unplug_device(conf, device): + try: + device.link.delete() + except RuntimeError: + root_helper = self.get_root_helper() + # Maybe the device is OVS port, so try to delete + bridge_name = ovs_lib.get_bridge_for_iface(root_helper, device.name) + if bridge_name: + bridge = ovs_lib.OVSBridge(bridge_name, root_helper) + bridge.delete_port(device.name) + else: + LOG.debug(_('Unable to find bridge for device: %s'), device.name) + + def cleanup_dhcp(networks): + namespaces = [] + for network, agent in networks.iteritems(): + namespaces.append('qdhcp-' + network) + + if namespaces: + LOG.info('Namespaces: %s is going to be deleted.' % namespaces) + destroy_namespaces(namespaces) + + def cleanup_router(routers): + namespaces = [] + for router, agent in routers.iteritems(): + namespaces.append('qrouter-' + router) + + if namespaces: + LOG.info('Namespaces: %s is going to be deleted.' % namespaces) + destroy_namespaces(namespaces) + + def destroy_namespaces(namespaces): + try: + root_helper = self.get_root_helper() + for namespace in namespaces: + ip = ip_lib.IPWrapper(root_helper, namespace) + if ip.netns.exists(namespace): + for device in ip.get_devices(exclude_loopback=True): + unplug_device(device) + + ip.garbage_collect_namespace() + except Exception: + LOG.exception(_('Error unable to destroy namespace: %s'), namespace) + def reassign_agent_resources(self): ''' Use agent scheduler API to detect down agents and re-schedule ''' DHCP_AGENT = "DHCP Agent" @@ -145,9 +198,12 @@ class MonitorNeutronAgentsDaemon(Daemon): quantum.list_networks_on_dhcp_agent( agent['id'])['networks']: networks[network['id']] = agent['id'] + if agent['id'] == self.get_hostname(): + self.cleanup_dhcp(networks) else: dhcp_agents.append(agent['id']) - + LOG.info('Active dhcp agents: %s' % dhcp_agents) + agents = quantum.list_agents(agent_type=L3_AGENT) routers = {} for agent in agents['agents']: @@ -157,8 +213,11 @@ class MonitorNeutronAgentsDaemon(Daemon): quantum.list_routers_on_l3_agent( agent['id'])['routers']: routers[router['id']] = agent['id'] + if agent['id'] == self.get_hostname(): + self.cleanup_router(routers) else: l3_agents.append(agent['id']) + LOG.info('Active l3 agents: %s' % l3_agents) if len(dhcp_agents) == 0 or len(l3_agents) == 0: LOG.info('Unable to relocate resources, there are %s dhcp_agents ' @@ -166,6 +225,12 @@ class MonitorNeutronAgentsDaemon(Daemon): len(l3_agents))) return + if l3_agents[0] != self.get_hostname() or \ + dhcp_agents[0] != self.get_hostname(): + LOG.info('Only the first agent could reschedule. l3 agents: %s ' + 'dhcp agents: %s' % (l3_agents, dhcp_agents)) + return + index = 0 for router_id in routers: agent = index % len(l3_agents) @@ -200,15 +265,16 @@ if __name__ == '__main__': cfg.StrOpt('check_interval', default=15, help='Check Neutron Agents interval.'), - cfg.StrOpt('log_file', - default='/var/log/monitor.log', - help='log file'), +# cfg.StrOpt('log_file', +# default='/var/log/monitor.log', +# help='log file'), ] cfg.CONF.register_cli_opts(opts) cfg.CONF(project='monitor_neutron_agents', default_config_files=[]) - - LOG.basicConfig(filename=cfg.CONF.log_file, level=LOG.INFO) + log_file = '/tmp/monitor.log' + print "log file: %s" % cfg.CONF.log_file + LOG.basicConfig(filename=log_file, level=LOG.INFO) monitor_daemon = MonitorNeutronAgentsDaemon( check_interval=cfg.CONF.check_interval) monitor_daemon.start() diff --git a/files/ns_ovs_cleanup.sh b/files/ns_ovs_cleanup.sh index ba2ae16a..34dee3d2 100755 --- a/files/ns_ovs_cleanup.sh +++ b/files/ns_ovs_cleanup.sh @@ -1,7 +1,7 @@ #! /bin/bash -logger "Start running ns_ovs_cleanup.sh..." logger " ** " +logger "Start running ns_ovs_cleanup.sh..." logger " ** " logger "CRM_notify_task: $CRM_notify_task" @@ -9,15 +9,52 @@ logger "CRM_notify_desc: $CRM_notify_desc" logger "CRM_notify_rsc: $CRM_notify_rsc" logger "CRM_notify_node: $CRM_notify_node" logger " ** " -logger " ** " -if [[ ${CRM_notify_task} == 'start' && $CRM_notify_rsc == 'res_PingCheck' ]]; then +set -x + +DEFAULT_PIDFILE="/tmp/monitor.pid" + +function clean_pid +{ + logger "Clean pid." + if [ -f $DEFAULT_PIDFILE ]; then + pid=`cat $DEFAULT_PIDFILE` + if [ ! -z $pid ]; then + sudo kill -s 9 $pid + rm -f $DEFAULT_PIDFILE + logger "pidfile $DEFAULT_PIDFILE is removed." + fi + else + pid=`ps -aux | grep m\[o\]nitor.py | awk -F' ' '{print $2}'` + if [ ! -z $pid ]; then + sudo kill -s 9 $pid + fi + logger "pid $pid is killed." + fi +} + +#if [[ ${CRM_notify_task} == 'start' && $CRM_notify_rsc == 'res_PingCheck' ]]; then +if [[ $CRM_notify_rsc == 'res_PingCheck' && ${CRM_notify_task} == 'start' ]]; then if [[ ${CRM_notify_desc} == 'OK' ]]; then hostname=`hostname` - logger "monitor error hostname: $CRM_notify_node" - logger "hostname: $hostname" + clean_pid + logger "Executing monitor to reschedule Neutron agents..." - sudo python /usr/local/bin/monitor.py + #sudo python /usr/local/bin/monitor.py >> /dev/null 2>&1 & echo $! > $DEFAULT_PIDFILE + sudo python monitor.py >> /dev/null 2>&1 & echo $! + sleep 3 + pid=`ps -aux | grep m\[o\]nitor.py | awk -F' ' '{print $2}'` + if [ ! -z "$pid" ]; then + echo $pid > $DEFAULT_PIDFILE + fi fi +elif [[ $CRM_notify_rsc == 'res_PingCheck' && ${CRM_notify_task} == 'stop' ]]; then + if [[ ${CRM_notify_desc} == 'OK' ]]; then + clean_pid + fi +elif [[ $CRM_notify_rsc == 'res_PingCheck' && ${CRM_notify_task} == 'monitor' ]]; then + if [[ ${CRM_notify_desc} == 'unknown error' ]]; then + logger "TODO" + fi fi diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index b6026d7c..5287ba1f 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -236,7 +236,6 @@ def ha_relation_joined(): resource_params = { 'res_PingCheck': 'params host_list="{host}" dampen="5s" ' 'debug={debug} multiplier="1000" ' - 'failure_score="10" ' 'op monitor on-fail="restart" interval="10s" ' 'timeout="40s" '.format(host=dns_hosts, debug=debug),