Fix a lot.

This commit is contained in:
Hui Xiang 2014-12-16 15:50:05 +08:00
parent 291554610d
commit 52f27b1c38
6 changed files with 83 additions and 256 deletions

View File

@ -1,171 +0,0 @@
#!/bin/sh
#
#
# Neutron_Legacy_HA OCF
#
# Copyright (c) 2014 Hui Xiang
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:
# Locate and source the OCF shell helper file. OCF_FUNCTIONS is only
# assigned here when the caller has not already set it; the fallback path
# is derived from OCF_ROOT.
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
# The requested action defaults to the first command-line argument unless
# __OCF_ACTION is already present in the environment.
: ${__OCF_ACTION=$1}
#######################################################################
# Print the OCF resource-agent metadata (XML) on stdout.
#
# Every OCF_RESKEY_* parameter the agent reads is declared here so that
# cluster tooling can show and validate it. The heredoc is unquoted on
# purpose: the *_default variables are expanded when the function is
# called, which happens after they have been assigned further down.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="MonitorNeutron" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Monitor Neutron L3/DHCP agents for legacy HA.
</longdesc>
<shortdesc lang="en">Monitor Neutron L3/DHCP agents for legacy HA</shortdesc>
<parameters>
<parameter name="debug" unique="0">
<longdesc lang="en">
When set to "true", messages sent through the agent's conditional logger
are written to the cluster log.
</longdesc>
<shortdesc lang="en">Verbose logging</shortdesc>
<content type="string" default="false"/>
</parameter>
<parameter name="binary" unique="0">
<longdesc lang="en">
Path of the Python monitor script that is launched on start.
</longdesc>
<shortdesc lang="en">Monitor script</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}"/>
</parameter>
<parameter name="user" unique="0">
<longdesc lang="en">
User the monitor script is run as.
</longdesc>
<shortdesc lang="en">Run-as user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}"/>
</parameter>
<parameter name="pid" unique="0">
<longdesc lang="en">
File in which the process id of the launched monitor script is stored.
</longdesc>
<shortdesc lang="en">Pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}"/>
</parameter>
<parameter name="additional_parameters" unique="0">
<longdesc lang="en">
Extra command-line arguments appended to the monitor script invocation.
</longdesc>
<shortdesc lang="en">Additional parameters</shortdesc>
<content type="string" default=""/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="60" />
<action name="stop" timeout="20" />
<action name="reload" timeout="100" />
<action name="monitor" depth="0" timeout="60" interval="10"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
# Built-in defaults for the agent parameters. Each default is followed by
# the fallback assignment that applies it only when the cluster manager
# has not supplied the corresponding OCF_RESKEY_* value.

# Python monitor script to launch.
OCF_RESKEY_binary_default="/usr/local/bin/monitor.py"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}

# Account the script runs under.
OCF_RESKEY_user_default="root"
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}

# Pid file, kept per resource instance under HA_RSCTMP.
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
#######################################################################
# Log a message through ocf_log only when the "debug" parameter is "true".
#
# $1      ocf_log level (e.g. info, err)
# $2...   message words, joined with spaces
MonitorNeutron_conditional_log() {
    level=$1; shift
    # Quote and default the parameter: when OCF_RESKEY_debug is unset an
    # unquoted expansion collapses the test to `[ = "true" ]`, which is a
    # syntax error reported on stderr at every call.
    if [ "${OCF_RESKEY_debug:-false}" = "true" ]; then
        ocf_log "$level" "$*"
    fi
}
# Print a short usage summary on stdout.
#
# Everything inside the heredoc is emitted verbatim, so it must contain
# only text meant for the user (a "#"-prefixed line is not a comment
# there). The action list mirrors the actions handled by the dispatcher.
MonitorNeutron_usage() {
    cat <<END
usage: $0 {start|stop|reload|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
# Start action: launch the monitor script in the background as
# OCF_RESKEY_user and record the background job's pid in OCF_RESKEY_pid.
#
# Returns OCF_SUCCESS when the launcher command succeeded, otherwise
# OCF_ERR_GENERIC. The status checked is that of `su` plus the pid-file
# redirection; it does not prove the script keeps running afterwards.
MonitorNeutron_start() {
    ocf_log info "MonitorNeutron_start"
    # The double-quoted part is expanded here (binary and extra arguments);
    # the single-quoted part is passed through untouched so that `$!` is
    # expanded by the inner shell, i.e. it is the pid of the python job.
    su ${OCF_RESKEY_user} -s /bin/sh -c \
        "python ${OCF_RESKEY_binary} ${OCF_RESKEY_additional_parameters}"' >> /dev/null 2>&1 & echo $!' \
        > "${OCF_RESKEY_pid}"
    rc=$?
    if [ $rc -ne 0 ]; then
        # Previously this path logged "started" and fell off the end of the
        # function, reporting success for a failed launch.
        ocf_log err "MonitorNeutron failed to start (rc=$rc)"
        return $OCF_ERR_GENERIC
    fi
    ocf_log info "MonitorNeutron started"
    return $OCF_SUCCESS
}
# Stop action: send SIGTERM to the pid recorded in OCF_RESKEY_pid.
#
# Returns OCF_SUCCESS when there is no pid file, and also after the kill
# attempt regardless of its outcome (a failed kill is logged but treated
# as "already gone"). Exits with OCF_ERR_GENERIC when the pid file exists
# but is empty.
MonitorNeutron_stop() {
    if [ ! -f "$OCF_RESKEY_pid" ]; then
        ocf_log info "MonitorNeutron already stopped"
        return $OCF_SUCCESS
    fi

    pid=`cat "$OCF_RESKEY_pid"`
    if [ -z "$pid" ]; then
        ocf_log err "MonitorNeutron pid is empty"
        exit $OCF_ERR_GENERIC
    fi

    # Try SIGTERM.
    ocf_run kill -s TERM $pid
    rc=$?

    # The recorded pid is of no further use whether or not the signal was
    # delivered; drop the file so a stale pid is never signalled again.
    rm -f "$OCF_RESKEY_pid"

    if [ $rc -ne 0 ]; then
        ocf_log err "MonitorNeutron couldn't be stopped"
        return $OCF_SUCCESS
    fi
    ocf_log info "MonitorNeutron stopped"
    return $OCF_SUCCESS
}
# validate-all action: check that the configured monitor script exists.
#
# Returns OCF_SUCCESS when OCF_RESKEY_binary is a regular file, otherwise
# OCF_ERR_INSTALLED. (The former `$OCF_ERRARGS` is not an OCF variable; it
# expanded to nothing, so the failure path returned the status of the
# preceding log call, i.e. success.)
MonitorNeutron_validate() {
    if [ ! -f "$OCF_RESKEY_binary" ]; then
        ocf_log err "No file $OCF_RESKEY_binary exists !"
        return $OCF_ERR_INSTALLED
    fi
    return $OCF_SUCCESS
}
# Monitor action. As written it always reports OCF_SUCCESS.
MonitorNeutron_monitor() {
# NOTE(review): this unconditional return makes the pid-file check below
# unreachable, so the resource is reported as running even when it is not.
# Presumably a deliberate short-circuit -- confirm before re-enabling the
# check.
return $OCF_SUCCESS
# Unreachable: would report success when the pid file exists ...
if [ -f ${OCF_RESKEY_pid} ]; then
return $OCF_SUCCESS
fi
# ... and a generic error otherwise.
return $OCF_ERR_GENERIC
}
# Dispatch the requested action. The handlers' return status becomes the
# exit status of the agent through the final `exit $?`.
case $__OCF_ACTION in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        MonitorNeutron_start
        ;;
    stop)
        MonitorNeutron_stop
        ;;
    reload)
        # Stop the recorded instance first; starting again without stopping
        # would leave the previous monitor process running alongside the
        # new one and overwrite its pid.
        MonitorNeutron_stop && MonitorNeutron_start
        ;;
    monitor)
        MonitorNeutron_monitor
        ;;
    validate-all)
        MonitorNeutron_validate
        ;;
    usage|help)
        MonitorNeutron_usage
        exit $OCF_SUCCESS
        ;;
    *)
        MonitorNeutron_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
exit $?

3
files/monitor.conf Normal file
View File

@ -0,0 +1,3 @@
[DEFAULT]
verbose=True
check_interval=15

View File

@ -17,13 +17,15 @@ import fcntl
import os import os
import signal import signal
import sys import sys
import subprocess
import time import time
import logging as LOG
from oslo.config import cfg from oslo.config import cfg
from neutron.agent.linux import ovs_lib from neutron.agent.linux import ovs_lib
from neutron.agent.linux import ip_lib from neutron.agent.linux import ip_lib
from neutron.openstack.common import log as logging
LOG = logging.getLogger(__name__)
class Daemon(object): class Daemon(object):
@ -88,9 +90,9 @@ class Daemon(object):
class MonitorNeutronAgentsDaemon(Daemon): class MonitorNeutronAgentsDaemon(Daemon):
def __init__(self, check_interval=None): def __init__(self):
super(MonitorNeutronAgentsDaemon, self).__init__() super(MonitorNeutronAgentsDaemon, self).__init__()
self.check_interval = check_interval logging.setup('Neuron-HA-Monitor')
LOG.info('Monitor Neutron Agent Loop Init') LOG.info('Monitor Neutron Agent Loop Init')
self.env = {} self.env = {}
@ -113,13 +115,13 @@ class MonitorNeutronAgentsDaemon(Daemon):
raise Exception("OpenStack env data uncomplete.") raise Exception("OpenStack env data uncomplete.")
return self.env return self.env
def get_hostname(): def get_hostname(self):
return subprocess.check_output(['uname', '-n']) return subprocess.check_output(['uname', '-n'])
def get_root_helper(): def get_root_helper(self):
return 'sudo' return 'sudo'
def unplug_device(conf, device): def unplug_device(self, conf, device):
try: try:
device.link.delete() device.link.delete()
except RuntimeError: except RuntimeError:
@ -132,7 +134,7 @@ class MonitorNeutronAgentsDaemon(Daemon):
else: else:
LOG.debug(_('Unable to find bridge for device: %s'), device.name) LOG.debug(_('Unable to find bridge for device: %s'), device.name)
def cleanup_dhcp(networks): def cleanup_dhcp(self, networks):
namespaces = [] namespaces = []
for network, agent in networks.iteritems(): for network, agent in networks.iteritems():
namespaces.append('qdhcp-' + network) namespaces.append('qdhcp-' + network)
@ -141,7 +143,7 @@ class MonitorNeutronAgentsDaemon(Daemon):
LOG.info('Namespaces: %s is going to be deleted.' % namespaces) LOG.info('Namespaces: %s is going to be deleted.' % namespaces)
destroy_namespaces(namespaces) destroy_namespaces(namespaces)
def cleanup_router(routers): def cleanup_router(self, routers):
namespaces = [] namespaces = []
for router, agent in routers.iteritems(): for router, agent in routers.iteritems():
namespaces.append('qrouter-' + router) namespaces.append('qrouter-' + router)
@ -150,7 +152,7 @@ class MonitorNeutronAgentsDaemon(Daemon):
LOG.info('Namespaces: %s is going to be deleted.' % namespaces) LOG.info('Namespaces: %s is going to be deleted.' % namespaces)
destroy_namespaces(namespaces) destroy_namespaces(namespaces)
def destroy_namespaces(namespaces): def destroy_namespaces(self, namespaces):
try: try:
root_helper = self.get_root_helper() root_helper = self.get_root_helper()
for namespace in namespaces: for namespace in namespaces:
@ -163,6 +165,40 @@ class MonitorNeutronAgentsDaemon(Daemon):
except Exception: except Exception:
LOG.exception(_('Error unable to destroy namespace: %s'), namespace) LOG.exception(_('Error unable to destroy namespace: %s'), namespace)
def l3_agents_reschedule(self, l3_agents, routers):
    """Move routers round-robin onto the given L3 agents.

    Only the node whose ``get_hostname()`` equals the first entry of
    ``l3_agents`` performs the move; every other node logs and returns.

    :param l3_agents: sequence of target L3 agent identifiers.
    :param routers: dict mapping router id -> agent it is currently on.
    :returns: None
    """
    if not l3_agents:
        # Nothing to reschedule onto; indexing [0] would raise IndexError.
        return

    # NOTE(review): the caller appears to fill l3_agents with agent ids
    # (agent['id']) while get_hostname() returns raw `uname -n` output --
    # confirm that this comparison can actually match.
    if l3_agents[0] != self.get_hostname():
        # The message used to carry two %s placeholders for a single
        # argument, which raised TypeError instead of logging.
        LOG.info('Only the first agent could reschedule. l3 agents: %s'
                 % (l3_agents,))
        return

    for index, router_id in enumerate(routers):
        target = l3_agents[index % len(l3_agents)]
        LOG.info('Moving router %s from %s to %s' %
                 (router_id, routers[router_id], target))
        # NOTE(review): `quantum` is not assigned inside this method;
        # presumably a client object available at module level -- verify.
        quantum.remove_router_from_l3_agent(l3_agent=routers[router_id],
                                            router_id=router_id)
        quantum.add_router_to_l3_agent(l3_agent=target,
                                       body={'router_id': router_id})
def dhcp_agents_reschedule(self, dhcp_agents, networks):
    """Move networks round-robin onto the given DHCP agents.

    Acts only when the first entry of ``dhcp_agents`` equals
    ``self.get_hostname()``; otherwise it logs and returns.

    :param dhcp_agents: sequence of target DHCP agent identifiers.
    :param networks: dict mapping network id -> agent it is currently on.
    :returns: None
    """
    leader = dhcp_agents[0]
    if leader != self.get_hostname():
        LOG.info('Only the first agent could reschedule. '
                 'dhcp agents: %s' % dhcp_agents)
        return

    agent_count = len(dhcp_agents)
    for position, network_id in enumerate(networks):
        source = networks[network_id]
        target = dhcp_agents[position % agent_count]
        LOG.info('Moving network %s from %s to %s' %
                 (network_id, source, target))
        quantum.remove_network_from_dhcp_agent(
            dhcp_agent=source, network_id=network_id)
        quantum.add_network_to_dhcp_agent(
            dhcp_agent=target, body={'network_id': network_id})
def reassign_agent_resources(self): def reassign_agent_resources(self):
''' Use agent scheduler API to detect down agents and re-schedule ''' ''' Use agent scheduler API to detect down agents and re-schedule '''
DHCP_AGENT = "DHCP Agent" DHCP_AGENT = "DHCP Agent"
@ -219,44 +255,27 @@ class MonitorNeutronAgentsDaemon(Daemon):
l3_agents.append(agent['id']) l3_agents.append(agent['id'])
LOG.info('Active l3 agents: %s' % l3_agents) LOG.info('Active l3 agents: %s' % l3_agents)
if len(dhcp_agents) == 0 or len(l3_agents) == 0: if not networks and not routers:
LOG.info('Unable to relocate resources, there are %s dhcp_agents ' LOG.info('No failed agents found, return.')
'and %s l3_agents in this cluster' % (len(dhcp_agents), return
if len(dhcp_agents) == 0 and len(l3_agents) == 0:
LOG.error('Unable to relocate resources, there are %s dhcp_agents '
'and %s l3_agents in this cluster' % (len(dhcp_agents),
len(l3_agents))) len(l3_agents)))
return return
if l3_agents[0] != self.get_hostname() or \ if len(l3_agents) != 0:
dhcp_agents[0] != self.get_hostname(): self.l3_agents_reschedule(l3_agents, routers)
LOG.info('Only the first agent could reschedule. l3 agents: %s '
'dhcp agents: %s' % (l3_agents, dhcp_agents))
return
index = 0 if len(dhcp_agents) != 0:
for router_id in routers: self.dhcp_agents_reschedule(dhcp_agents, networks)
agent = index % len(l3_agents)
LOG.info('Moving router %s from %s to %s' %
(router_id, routers[router_id], l3_agents[agent]))
quantum.remove_router_from_l3_agent(l3_agent=routers[router_id],
router_id=router_id)
quantum.add_router_to_l3_agent(l3_agent=l3_agents[agent],
body={'router_id': router_id})
index += 1
index = 0
for network_id in networks:
agent = index % len(dhcp_agents)
LOG.info('Moving network %s from %s to %s' %
(network_id, networks[network_id], dhcp_agents[agent]))
quantum.remove_network_from_dhcp_agent(
dhcp_agent=networks[network_id], network_id=network_id)
quantum.add_network_to_dhcp_agent(dhcp_agent=dhcp_agents[agent],
body={'network_id': network_id})
index += 1
def run(self): def run(self):
while True: while True:
LOG.info('Monitor Neutron Agent Loop Start') LOG.info('Monitor Neutron Agent Loop Start')
time.sleep(15) LOG.info('sleep %s' % cfg.CONF.check_interval)
time.sleep(float(cfg.CONF.check_interval))
self.reassign_agent_resources() self.reassign_agent_resources()
@ -265,16 +284,10 @@ if __name__ == '__main__':
cfg.StrOpt('check_interval', cfg.StrOpt('check_interval',
default=15, default=15,
help='Check Neutron Agents interval.'), help='Check Neutron Agents interval.'),
# cfg.StrOpt('log_file',
# default='/var/log/monitor.log',
# help='log file'),
] ]
cfg.CONF.register_cli_opts(opts) cfg.CONF.register_cli_opts(opts)
cfg.CONF(project='monitor_neutron_agents', default_config_files=[]) cfg.CONF(project='monitor_neutron_agents', default_config_files=[])
log_file = '/tmp/monitor.log' logging.setup('Neuron-HA-Monitor')
print "log file: %s" % cfg.CONF.log_file monitor_daemon = MonitorNeutronAgentsDaemon()
LOG.basicConfig(filename=log_file, level=LOG.INFO)
monitor_daemon = MonitorNeutronAgentsDaemon(
check_interval=cfg.CONF.check_interval)
monitor_daemon.start() monitor_daemon.start()

View File

@ -2,12 +2,8 @@
logger " ** " logger " ** "
logger "Start running ns_ovs_cleanup.sh..." logger "Start running ns_ovs_cleanup.sh..."
logger " ** " logger "CRM_notify_task: $CRM_notify_task, CRM_notify_desc: $CRM_notify_desc"
logger "CRM_notify_rsc: $CRM_notify_rsc, CRM_notify_node: $CRM_notify_node"
logger "CRM_notify_task: $CRM_notify_task"
logger "CRM_notify_desc: $CRM_notify_desc"
logger "CRM_notify_rsc: $CRM_notify_rsc"
logger "CRM_notify_node: $CRM_notify_node"
logger " ** " logger " ** "
set -x set -x
@ -50,8 +46,8 @@ if [[ $CRM_notify_rsc == 'res_PingCheck' && ${CRM_notify_task} == 'start' ]]; th
check_pid check_pid
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
logger "Executing monitor to reschedule Neutron agents..." logger "Executing monitor to reschedule Neutron agents..."
#sudo python /usr/local/bin/monitor.py >> /dev/null 2>&1 & echo $! > $DEFAULT_PIDFILE sudo python /usr/local/bin/monitor.py --config-file /tmp/monitor.conf \
sudo python /usr/local/bin/monitor.py >> /dev/null 2>&1 & echo $! --log-file /tmp/monitor.log >> /dev/null 2>&1 & echo $!
sleep 3 sleep 3
pid=`ps -aux | grep m\[o\]nitor.py | awk -F' ' '{print $2}'` pid=`ps -aux | grep m\[o\]nitor.py | awk -F' ' '{print $2}'`
if [ ! -z "$pid" ]; then if [ ! -z "$pid" ]; then

View File

@ -1,8 +0,0 @@
# vim: set ft=upstart et ts=2:
description "Reassign Agent Resources for Legacy HA"
author "Hui Xiang <hui.xiang@canonical.com>"
start on runlevel [2345]
stop on runlevel [!2345]
exec start-stop-daemon --start --chuid neutron --exec /usr/local/bin/reassign_agent_services

View File

@ -629,12 +629,6 @@ def copy_file(source_dir, des_dir, f, f_mod=None, update=False):
raise raise
def init_upstart_f_4_reassign_agent_resources():
upstart_f = 'reassign_agent_resources.conf'
exec_dir = '/etc/init'
copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir, upstart_f)
def init_ocf_MonitorNeutron_f(update=False): def init_ocf_MonitorNeutron_f(update=False):
ocf_f = 'MonitorNeutron' ocf_f = 'MonitorNeutron'
exec_dir = '/usr/lib/ocf/resource.d/pacemaker' exec_dir = '/usr/lib/ocf/resource.d/pacemaker'
@ -642,19 +636,19 @@ def init_ocf_MonitorNeutron_f(update=False):
ocf_f, stat.S_IEXEC, update=update) ocf_f, stat.S_IEXEC, update=update)
def get_external_agent_f():
    """Return the full path of the external monitor agent script."""
    return os.path.join('/usr/lib/ocf/resource.d/canonical',
                        'monitor_neutron_ha.sh')
def init_external_agent_f(update=False): def init_external_agent_f(update=False):
agent = 'ns_ovs_cleanup.sh' agent = 'monitor_neutron_ha.sh'
exec_dir = '/usr/lib/ocf/resource.d/openstack' exec_dir = '/usr/lib/ocf/resource.d/canonical'
copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir, copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
agent, stat.S_IEXEC, update=update) agent, stat.S_IEXEC, update=update)
def init_reassign_agent_services_binary():
service = 'reassign_agent_services'
exec_dir = '/usr/local/bin/'
copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir, service, stat.S_IEXEC)
def init_monitor_daemon(update=False): def init_monitor_daemon(update=False):
service = 'monitor.py' service = 'monitor.py'
exec_dir = '/usr/local/bin/' exec_dir = '/usr/local/bin/'
@ -662,20 +656,20 @@ def init_monitor_daemon(update=False):
service, stat.S_IEXEC, update=update) service, stat.S_IEXEC, update=update)
def init_monitor_conf_files(update=False):
    """Copy ``monitor.conf`` from the legacy-HA template files into /tmp.

    :param update: forwarded unchanged to ``copy_file``.
    """
    copy_file(LEGACY_HA_TEMPLATE_FILES, '/tmp', 'monitor.conf',
              update=update)
def install_legacy_ha_files(update=False): def install_legacy_ha_files(update=False):
if config('ha-legacy-mode'): if config('ha-legacy-mode'):
init_ocf_MonitorNeutron_f(update=update) init_ocf_MonitorNeutron_f(update=update)
init_external_agent_f(update=update) init_external_agent_f(update=update)
# init_reassign_agent_services_binary()
init_monitor_daemon(update=update) init_monitor_daemon(update=update)
def get_external_agent_f():
agent = 'ns_ovs_cleanup.sh'
exec_dir = '/usr/lib/ocf/resource.d/openstack'
return os.path.join(exec_dir, agent)
def cache_env_data(): def cache_env_data():
env = NetworkServiceContext()() env = NetworkServiceContext()()
if not env: if not env: