Support Neutron Legacy HA and Scale feature.

This commit is contained in:
Hui Xiang 2014-12-05 16:35:54 +08:00
parent b9dc0b7ff6
commit 34c698f4fe
8 changed files with 680 additions and 3 deletions

View File

@ -115,3 +115,15 @@ options:
.
This network will be used for tenant network traffic in overlay
networks.
# Legacy HA
ha-legacy-mode:
type: boolean
default: False
description: |
Support HA ACTIVE/PASSIVE mode with pacemaker and corosync before neutron
native HA feature landed to Juno.
  ocf_ping_debug:
    type: boolean
    default: False
    description: |
      Enable verbose attrd_updater logging in the ocf:pacemaker:ping
      resource used for connectivity checking.
  dns_hosts:
    type: string
    description: |
      Space-separated list of additional hosts for the pacemaker ping
      resource to check connectivity against, in addition to the
      nameservers found in /etc/resolv.conf.

222
files/MonitorNeutron Normal file
View File

@ -0,0 +1,222 @@
#!/bin/sh
#
#
# Neutron_Legacy_HA OCF
#
# Copyright (c) 2014 Hui Xiang
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
# Print the OCF resource-agent metadata XML for this agent.
# NOTE(review): the original metadata was copied verbatim from the
# "ping" resource agent (name, description and parameters all described
# ping); it now describes the OCF_RESKEY_* values this script actually
# reads (binary, user, pid, additional_parameters, debug).
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="MonitorNeutron">
<version>1.0</version>

<longdesc lang="en">
Runs the Neutron legacy-HA monitor daemon (monitor.py), which detects
dead DHCP/L3 agents and re-schedules their resources onto live nodes.
</longdesc>
<shortdesc lang="en">Neutron legacy HA agent monitor</shortdesc>

<parameters>

<parameter name="binary" unique="0">
<longdesc lang="en">
Path of the monitor daemon to launch.
</longdesc>
<shortdesc lang="en">Monitor binary</shortdesc>
<content type="string" default="/usr/local/bin/monitor.py" />
</parameter>

<parameter name="user" unique="0">
<longdesc lang="en">
User to run the monitor daemon as.
</longdesc>
<shortdesc lang="en">User</shortdesc>
<content type="string" default="neutron" />
</parameter>

<parameter name="pid" unique="0">
<longdesc lang="en">
PID file of the monitor daemon.
</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid" />
</parameter>

<parameter name="additional_parameters" unique="0">
<longdesc lang="en">
Extra command line options passed to the monitor daemon.
</longdesc>
<shortdesc lang="en">Extra options</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="debug" unique="0">
<longdesc lang="en">
Enables verbose logging on every call.
</longdesc>
<shortdesc lang="en">Verbose logging</shortdesc>
<content type="string" default="false"/>
</parameter>

</parameters>

<actions>
<action name="start" timeout="60" />
<action name="stop" timeout="20" />
<action name="reload" timeout="100" />
<action name="monitor" depth="0" timeout="60" interval="10"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
# Default values for this agent's OCF_RESKEY_* parameters.  Each `:`
# expansion below only assigns the default when the CIB resource
# configuration did not already set the variable.
OCF_RESKEY_binary_default="/usr/local/bin/monitor.py"
OCF_RESKEY_user_default="neutron"
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"

: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
#######################################################################
# Log "$*" at the given ocf_log severity, but only when debug is on.
#   $1  - ocf_log level; remaining args form the message.
MonitorNeutron_conditional_log() {
	level=$1; shift
	# Quote the expansion: when OCF_RESKEY_debug is unset the original
	# test expanded to `[ = "true" ]`, a shell syntax error.
	if [ "${OCF_RESKEY_debug}" = "true" ]; then
		ocf_log $level "$*"
	fi
}
# Print a usage summary listing the actions this agent implements.
# The original text advertised migrate_to/migrate_from, which have no
# case in the dispatcher below; list the real action set instead.
MonitorNeutron_usage() {
	cat <<END
usage: $0 {start|stop|reload|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}
# Start monitor.py in the background as OCF_RESKEY_user and record its
# PID in OCF_RESKEY_pid.
MonitorNeutron_start() {
	ocf_log info "MonitorNeutron_start"
	# The whole command line, including the redirection, the `&` and the
	# `echo $!`, must be ONE quoted argument to `su -c`; the original
	# split it into two shell words, so the redirect/backgrounding part
	# was never executed by the daemon's shell.
	su ${OCF_RESKEY_user} -s /bin/sh -c \
		"python ${OCF_RESKEY_binary} $OCF_RESKEY_additional_parameters >> /dev/null 2>&1 & echo \$!" \
		> $OCF_RESKEY_pid
	rc=$?
	# NOTE(review): the original called `ping_monitor`, which is not
	# defined anywhere in this file; rely on su's exit status instead.
	if [ $rc -ne 0 ]; then
		ocf_log err "MonitorNeutron failed to start"
		return $OCF_ERR_GENERIC
	fi
	ocf_log info "MonitorNeutron started"
	return $OCF_SUCCESS
}
# Stop the monitor daemon with SIGTERM and remove the PID file.
MonitorNeutron_stop() {
	# Idempotent stop: with no PID file there is nothing to kill, and
	# `cat` on a missing file would have made the original fail.
	if [ ! -f $OCF_RESKEY_pid ]; then
		ocf_log info "MonitorNeutron already stopped"
		return $OCF_SUCCESS
	fi
	# Try SIGTERM
	pid=`cat $OCF_RESKEY_pid`
	ocf_run kill -s TERM $pid
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "MonitorNeutron couldn't be stopped"
		exit $OCF_ERR_GENERIC
	fi
	# Clean up so a later start/stop cycle sees a fresh state.
	rm -f $OCF_RESKEY_pid
	ocf_log info "MonitorNeutron stopped"
	return $OCF_SUCCESS
}
# Validate configuration: the monitor binary must exist on disk.
MonitorNeutron_validate() {
	if [ ! -f $OCF_RESKEY_binary ]; then
		ocf_log err "No file $OCF_RESKEY_binary exists !"
		# Was `$OCF_ERRARGS`, which is undefined (expands to an empty
		# return value); the OCF constant is OCF_ERR_ARGS.
		return $OCF_ERR_ARGS
	fi
	return $OCF_SUCCESS
}
# Dispatch the requested OCF action.  Unknown actions print usage and
# exit OCF_ERR_UNIMPLEMENTED per the OCF resource-agent contract.
# NOTE(review): there is no `monitor` case even though the metadata
# advertises a monitor action — pacemaker monitor calls fall through to
# the default branch; confirm the intended monitor implementation.
case $__OCF_ACTION in
meta-data)	meta_data
		exit $OCF_SUCCESS
		;;
start)		MonitorNeutron_start;;
stop)		MonitorNeutron_stop;;
reload)		MonitorNeutron_start;;
validate-all)	MonitorNeutron_validate;;
usage|help)	MonitorNeutron_usage
		exit $OCF_SUCCESS
		;;
*)		MonitorNeutron_usage
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
exit $?

263
files/monitor.py Normal file
View File

@ -0,0 +1,263 @@
# Copyright 2012 New Dream Network, LLC (DreamHost)
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import atexit
import fcntl
import os
import signal
import sys
import time

from neutron.openstack.common import log as logging
LOG = logging.getLogger(__name__)
class Pidfile(object):
    """Lock-backed PID file.

    Opens (creating if necessary) *pidfile* and takes an exclusive,
    non-blocking flock on it for the lifetime of this object, so a
    second daemon instance fails fast in __init__.
    """

    def __init__(self, pidfile, procname, uuid=None):
        self.pidfile = pidfile
        self.procname = procname
        self.uuid = uuid
        try:
            self.fd = os.open(pidfile, os.O_CREAT | os.O_RDWR)
            fcntl.flock(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            LOG.exception(_("Error while handling pidfile: %s"), pidfile)
            sys.exit(1)

    def __str__(self):
        return self.pidfile

    def unlock(self):
        # flock() returns None on success and raises on failure, so the
        # original `if not not fcntl.flock(...)` could never fire; map
        # failures to the intended IOError explicitly.
        try:
            fcntl.flock(self.fd, fcntl.LOCK_UN)
        except IOError:
            raise IOError(_('Unable to unlock pid file'))

    def write(self, pid):
        """Truncate the file and persist *pid* as its only content."""
        os.ftruncate(self.fd, 0)
        # ftruncate does not reset the file offset; without this seek a
        # second write() landed past EOF, leaving NUL bytes before the pid.
        os.lseek(self.fd, 0, os.SEEK_SET)
        # Encode for Python 3 compatibility (no-op str on Python 2).
        os.write(self.fd, ("%d" % pid).encode())
        os.fsync(self.fd)

    def read(self):
        """Return the stored PID as an int, or None if unparsable."""
        try:
            # Seek before reading so repeated read() calls see the pid.
            os.lseek(self.fd, 0, os.SEEK_SET)
            return int(os.read(self.fd, 128))
        except ValueError:
            return

    def is_running(self):
        """True when the stored PID belongs to a live matching process.

        Matches on procname (and uuid, when given) appearing in the
        process's /proc/<pid>/cmdline.
        """
        pid = self.read()
        if not pid:
            return False

        cmdline = '/proc/%s/cmdline' % pid
        try:
            with open(cmdline, "r") as f:
                exec_out = f.readline()
            return self.procname in exec_out and (not self.uuid or
                                                  self.uuid in exec_out)
        except IOError:
            return False
class Daemon(object):
    """A generic daemon class.

    Usage: subclass the Daemon class and override the run() method
    """

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null',
                 stderr='/dev/null', procname='python', uuid=None):
        # Paths the daemonized process's std streams are re-wired to.
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.procname = procname
        # Takes an exclusive lock on pidfile immediately (see Pidfile);
        # a second instance exits inside Pidfile.__init__.
        self.pidfile = Pidfile(pidfile, procname, uuid)

    def _fork(self):
        # One leg of the double fork: parent exits, child continues.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError:
            LOG.exception(_('Fork failed'))
            sys.exit(1)

    def daemonize(self):
        """Daemonize process by doing Stevens double fork."""
        # fork first time
        self._fork()

        # decouple from parent environment
        os.chdir("/")
        os.setsid()
        os.umask(0)

        # fork second time
        self._fork()

        # redirect standard file descriptors
        sys.stdout.flush()
        sys.stderr.flush()
        stdin = open(self.stdin, 'r')
        stdout = open(self.stdout, 'a+')
        # NOTE(review): unbuffered (third arg 0) text-mode open is
        # Python 2 only; under Python 3 this raises ValueError.
        stderr = open(self.stderr, 'a+', 0)
        os.dup2(stdin.fileno(), sys.stdin.fileno())
        os.dup2(stdout.fileno(), sys.stdout.fileno())
        os.dup2(stderr.fileno(), sys.stderr.fileno())

        # write pidfile; delete it again on normal interpreter exit.
        atexit.register(self.delete_pid)
        signal.signal(signal.SIGTERM, self.handle_sigterm)
        self.pidfile.write(os.getpid())

    def delete_pid(self):
        # atexit hook: remove the pidfile written in daemonize().
        os.remove(str(self.pidfile))

    def handle_sigterm(self, signum, frame):
        # SIGTERM -> clean exit so atexit handlers (delete_pid) run.
        sys.exit(0)

    def start(self):
        """Start the daemon."""
        if self.pidfile.is_running():
            self.pidfile.unlock()
            message = _('Pidfile %s already exist. Daemon already running?')
            LOG.error(message, self.pidfile)
            sys.exit(1)

        # Start the daemon
        self.daemonize()
        self.run()

    def run(self):
        """Override this method when subclassing Daemon.

        start() will call this method after the process has daemonized.
        """
        pass
class MonitorNeutronAgentsDaemon(Daemon):
    """Daemon that periodically detects dead neutron DHCP/L3 agents and
    re-schedules their networks/routers onto live cluster members
    (legacy ACTIVE/PASSIVE HA).
    """

    def __init__(self, check_interval=None):
        # Seconds to sleep between checks (see run()).
        self.check_interval = check_interval
        # NOTE(review): Daemon.__init__ is never invoked, so
        # self.pidfile does not exist and Daemon.start()/daemonize()
        # would crash — confirm whether a pidfile path should be passed
        # to the base class here.
        log('Monitor Neutron Agent Loop Init')

    def get_env(self):
        """Parse /etc/legacy_ha_env_data (written by the charm's
        cache_env_data()) into a dict of KEY=VALUE pairs."""
        env = {}
        with open('/etc/legacy_ha_env_data', 'r') as f:
            # The original read one line, then called .split('=') on the
            # *file object*, .strip() on the resulting list, and stored
            # env[data[1]] as the value — parse each KEY=VALUE line.
            for line in f:
                data = line.strip().split('=', 1)
                if len(data) == 2 and data[0] and data[1]:
                    env[data[0]] = data[1]
                else:
                    raise Exception("OpenStack env data incomplete.")
        return env

    def reassign_agent_resources(self):
        ''' Use agent scheduler API to detect down agents and re-schedule '''
        env = self.get_env()
        if not env:
            log('Unable to re-assign resources at this time')
            return
        try:
            from quantumclient.v2_0 import client
        except ImportError:
            ''' Try to import neutronclient instead for havana+ '''
            from neutronclient.v2_0 import client

        auth_url = ('%(auth_protocol)s://%(keystone_host)s:%(auth_port)s'
                    '/v2.0' % env)
        quantum = client.Client(username=env['service_username'],
                                password=env['service_password'],
                                tenant_name=env['service_tenant'],
                                auth_url=auth_url,
                                region_name=env['region'])

        # Short hostnames of this unit and its cluster peers: only
        # agents on these hosts are valid re-scheduling targets.
        # NOTE(review): unit_private_ip/relations_of_type/get_hostname,
        # DHCP_AGENT/L3_AGENT and log() are not defined or imported in
        # this module as shown — confirm where they come from.
        partner_gateways = [unit_private_ip().split('.')[0]]
        for partner_gateway in relations_of_type(reltype='cluster'):
            gateway_hostname = get_hostname(
                partner_gateway['private-address'])
            partner_gateways.append(gateway_hostname.partition('.')[0])

        # Collect networks hosted on dead DHCP agents, and the live
        # DHCP agents that may take them over.
        agents = quantum.list_agents(agent_type=DHCP_AGENT)
        dhcp_agents = []
        l3_agents = []
        networks = {}
        for agent in agents['agents']:
            if not agent['alive']:
                log('DHCP Agent %s down' % agent['id'])
                for network in quantum.list_networks_on_dhcp_agent(
                        agent['id'])['networks']:
                    networks[network['id']] = agent['id']
            else:
                if agent['host'].partition('.')[0] in partner_gateways:
                    dhcp_agents.append(agent['id'])

        # Same for routers on dead L3 agents.
        agents = quantum.list_agents(agent_type=L3_AGENT)
        routers = {}
        for agent in agents['agents']:
            if not agent['alive']:
                log('L3 Agent %s down' % agent['id'])
                for router in quantum.list_routers_on_l3_agent(
                        agent['id'])['routers']:
                    routers[router['id']] = agent['id']
            else:
                if agent['host'].split('.')[0] in partner_gateways:
                    l3_agents.append(agent['id'])

        if len(dhcp_agents) == 0 or len(l3_agents) == 0:
            log('Unable to relocate resources, there are %s dhcp_agents '
                'and %s l3_agents in this cluster'
                % (len(dhcp_agents), len(l3_agents)))
            return

        # Round-robin the orphaned routers over the live L3 agents.
        index = 0
        for router_id in routers:
            agent = index % len(l3_agents)
            log('Moving router %s from %s to %s' %
                (router_id, routers[router_id], l3_agents[agent]))
            quantum.remove_router_from_l3_agent(
                l3_agent=routers[router_id], router_id=router_id)
            quantum.add_router_to_l3_agent(
                l3_agent=l3_agents[agent], body={'router_id': router_id})
            index += 1

        # Round-robin the orphaned networks over the live DHCP agents.
        index = 0
        for network_id in networks:
            agent = index % len(dhcp_agents)
            log('Moving network %s from %s to %s' %
                (network_id, networks[network_id], dhcp_agents[agent]))
            quantum.remove_network_from_dhcp_agent(
                dhcp_agent=networks[network_id], network_id=network_id)
            quantum.add_network_to_dhcp_agent(
                dhcp_agent=dhcp_agents[agent],
                body={'network_id': network_id})
            index += 1

    def run(self):
        """Daemon main loop: re-check every check_interval seconds.

        The original slept once and returned after a single pass, so the
        daemon exited instead of monitoring.
        """
        log('Monitor Neutron Agent Loop Start')
        while True:
            time.sleep(self.check_interval)
            self.reassign_agent_resources()
def main():
    """Entry point: parse CLI/config options and launch the daemon."""
    # NOTE(review): `cfg` (oslo.config) is used but never imported in
    # this module as shown — confirm the missing import.
    opts = [
        # IntOpt, not StrOpt: the value is handed to time.sleep() in
        # run(); a CLI-supplied string interval would raise TypeError.
        cfg.IntOpt('check_interval',
                   default=15,
                   help=_('Check Neutron Agents interval.')),
    ]

    cfg.CONF.register_cli_opts(opts)
    cfg.CONF(project='monitor_neutron_agents', default_config_files=[])

    monitor_daemon = MonitorNeutronAgentsDaemon(
        check_interval=cfg.CONF.check_interval)
    monitor_daemon.start()

13
files/ns_ovs_cleanup.sh Executable file
View File

@ -0,0 +1,13 @@
#! /bin/bash
# Pacemaker ClusterMon external agent: when res_PingCheck reports an
# unknown monitor error for THIS node, tear down neutron namespaces and
# OVS state so a healthy peer can take over cleanly.

if [[ ${CRM_notify_task} == 'monitor' && ${CRM_notify_desc} == 'unknown error' &&
      ${CRM_notify_rsc} == 'res_PingCheck' ]]; then
    hostname=$(hostname)
    # Only act on notifications about the local node.
    if [[ ${hostname} == "${CRM_notify_node}" ]]; then
        echo "Cleaning up namespace and ovs on node $CRM_notify_node !"
        # Newer iproute2 prints "name (id: N)"; take only the first
        # field so `ip netns delete` gets a bare namespace name.
        for ns in $(ip netns list | awk '/qrouter-/ {print $1}'); do
            ip netns delete "$ns"
        done
        for ns in $(ip netns list | awk '/qdhcp-/ {print $1}'); do
            ip netns delete "$ns"
        done
        neutron-ovs-cleanup
        echo "Cleaning done."
    fi
fi

View File

@ -0,0 +1,8 @@
# vim: set ft=upstart et ts=2:
description "Reassign Agent Resources for Legacy HA"
author "Hui Xiang <hui.xiang@canonical.com>"
start on runlevel [2345]
stop on runlevel [!2345]
exec start-stop-daemon --start --chuid neutron --exec /usr/local/bin/reassign_agent_services

View File

@ -23,7 +23,8 @@ from charmhelpers.core.host import (
lsb_release,
)
from charmhelpers.contrib.hahelpers.cluster import(
eligible_leader
eligible_leader,
get_hacluster_config
)
from charmhelpers.contrib.hahelpers.apache import(
install_ca_cert
@ -45,7 +46,11 @@ from quantum_utils import (
valid_plugin,
configure_ovs,
reassign_agent_resources,
stop_services
stop_services,
cache_env_data,
get_dns_host,
get_external_agent_f,
install_legacy_ha_files
)
hooks = Hooks()
@ -70,6 +75,9 @@ def install():
log('Please provide a valid plugin config', level=ERROR)
sys.exit(1)
# Legacy HA for Icehouse
install_legacy_ha_files()
@hooks.hook('config-changed')
@restart_on_change(restart_map())
@ -103,6 +111,7 @@ def config_changed():
def upgrade_charm():
install()
config_changed()
install_legacy_ha_files(update=True)
@hooks.hook('shared-db-relation-joined')
@ -206,6 +215,51 @@ def cluster_departed():
def stop():
stop_services()
@hooks.hook('ha-relation-joined')
@hooks.hook('ha-relation-changed')
def ha_relation_joined():
    """Hand the hacluster charm the pacemaker resources for legacy HA.

    Configures three resources: a cloned ping health check, a cloned
    ClusterMon that fires the namespace/OVS cleanup external agent, and
    the MonitorNeutron agent-monitor daemon.
    """
    if config('ha-legacy-mode'):
        cache_env_data()
        dns_hosts = get_dns_host()
        debug = config('ocf_ping_debug')
        external_agent = get_external_agent_f()
        cluster_config = get_hacluster_config(excludes_key=['vip'])
        resources = {
            'res_PingCheck': 'ocf:pacemaker:ping',
            'res_ClusterMon': 'ocf:pacemaker:ClusterMon',
            'res_MonitorHA': 'ocf:pacemaker:MonitorNeutron'
        }
        resource_params = {
            'res_PingCheck': 'params host_list={host} dampen="5s" '
                             'debug={debug} multiplier="100" '
                             'failure_score="100" '
                             'op monitor on-fail="restart" interval="10s" '
                             'timeout="1000s" '.format(host=dns_hosts,
                                                       debug=debug),
            # The original left the extra_options value's closing quote
            # off, producing malformed crm configure syntax.
            'res_ClusterMon': 'params user="root" update="30" '
                              'extra_options="-E {external_agent}" '
                              'op monitor on-fail="restart" interval="10s"'
                              .format(external_agent=external_agent),
            # NOTE(review): the original appended a `location ... rule
            # pingd` constraint to this params string; constraints are
            # not resource parameters and broke the resource definition.
            # The connectivity-based location rule must be configured
            # separately if still required.
            'res_MonitorHA': 'op monitor interval="5s"'
        }
        clones = {
            'cl_PingCheck': 'res_PingCheck',
            'cl_ClusterMon': 'res_ClusterMon'
        }
        relation_set(corosync_bindiface=cluster_config['ha-bindiface'],
                     corosync_mcastport=cluster_config['ha-mcastport'],
                     resources=resources,
                     resource_params=resource_params,
                     clones=clones)
if __name__ == '__main__':
try:
hooks.execute(sys.argv)

View File

@ -1,11 +1,17 @@
import os
import shutil
import stat
import subprocess
from charmhelpers.core.host import (
service_running,
service_stop,
service_restart,
lsb_release
lsb_release,
mkdir
)
from charmhelpers.core.hookenv import (
log,
ERROR,
config,
relations_of_type,
unit_private_ip,
@ -145,6 +151,8 @@ EARLY_PACKAGES = {
N1KV: []
}
LEGACY_HA_TEMPLATE_FILES = 'files'
def get_early_packages():
'''Return a list of package for pre-install based on configured plugin'''
@ -577,3 +585,97 @@ def configure_ovs():
if data_port_ctx and data_port_ctx['data_port']:
add_bridge_port(DATA_BRIDGE, data_port_ctx['data_port'],
promisc=True)
def get_dns_host():
    """Build the space-separated host list for the pacemaker ping check.

    Always includes 8.8.8.8, plus every nameserver found in
    /etc/resolv.conf and any hosts from the `dns_hosts` charm option.
    """
    dns_hosts = ['8.8.8.8 ']
    try:
        nameservers = subprocess.check_output(['grep', 'nameserver',
                                               '/etc/resolv.conf'])
        # check_output returns a single string; iterating it directly
        # (as the original did) yields characters, not lines.
        for ns in nameservers.splitlines():
            fields = ns.split(' ')
            if len(fields) > 1 and fields[1].strip():
                dns_hosts.append(fields[1].strip() + ' ')
    except Exception:
        log('Failed to get nameserver from resolv.conf !', level=ERROR)
    if config('dns_hosts'):
        for dns in config('dns_hosts').split(' '):
            dns_hosts.append(dns + ' ')
    return ''.join(dns_hosts)
def copy_file(source_dir, des_dir, f, f_mod=None, update=False):
    """Copy file *f* from *source_dir* into *des_dir*.

    Creates *des_dir* when missing.  Skips the copy if the destination
    file already exists, unless *update* is True.  *f_mod*, when given,
    is mode bits (e.g. stat.S_IEXEC) OR-ed onto the copied file's mode.
    Raises IOError (after logging) when the copy fails.
    """
    if not os.path.isdir(des_dir):
        mkdir(des_dir)
        log('Directory created at: %s' % des_dir)
    # Compute both paths up front so the except branch can always
    # reference source_f (the original assigned it inside the try).
    source_f = os.path.join(source_dir, f)
    des_f = os.path.join(des_dir, f)
    if not os.path.isfile(des_f) or update:
        try:
            shutil.copy2(source_f, des_dir)
            if f_mod:
                # OR the bits in: chmod-ing to bare stat.S_IEXEC (0o100)
                # as the original did strips read permission, leaving an
                # execute-only, unreadable file.
                os.chmod(des_f, os.stat(des_f).st_mode | f_mod)
        except IOError:
            log('Failed to copy file from %s to %s.' %
                (source_f, des_dir), level=ERROR)
            raise
def init_upstart_f_4_reassign_agent_resources():
    """Install the reassign_agent_resources upstart job into /etc/init."""
    copy_file(LEGACY_HA_TEMPLATE_FILES, '/etc/init',
              'reassign_agent_resources.conf')
def init_ocf_MonitorNeutron_f(update=False):
    """Install the MonitorNeutron OCF resource agent for pacemaker.

    The exec bit is applied like the other installed agents: the copied
    file is a shell script pacemaker must be able to execute, and the
    original installed it without any execute permission.
    """
    ocf_f = 'MonitorNeutron'
    exec_dir = '/usr/lib/ocf/resource.d/pacemaker'
    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
              ocf_f, stat.S_IEXEC, update=update)
def init_external_agent_f(update=False):
    """Install the ClusterMon external cleanup agent.

    The file shipped with the charm is files/ns_ovs_cleanup.sh; the
    original copied 'ns_cleanup.sh', which does not exist, so the
    install always raised IOError.
    """
    agent = 'ns_ovs_cleanup.sh'
    exec_dir = '/usr/lib/ocf/resource.d/openstack'
    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
              agent, stat.S_IEXEC, update=update)
def init_reassign_agent_services_binary():
    """Install the reassign_agent_services helper into /usr/local/bin."""
    copy_file(LEGACY_HA_TEMPLATE_FILES, '/usr/local/bin/',
              'reassign_agent_services', stat.S_IEXEC)
def init_monitor_daemon(update=False):
    """Install monitor.py (the legacy-HA monitor daemon) into /usr/local/bin."""
    copy_file(LEGACY_HA_TEMPLATE_FILES, '/usr/local/bin/',
              'monitor.py', stat.S_IEXEC, update=update)
def install_legacy_ha_files(update=False):
    """Install every legacy-HA support file when ha-legacy-mode is set."""
    if not config('ha-legacy-mode'):
        return
    init_ocf_MonitorNeutron_f(update=update)
    init_external_agent_f(update=update)
    # init_reassign_agent_services_binary()
    init_monitor_daemon(update=update)
def get_external_agent_f():
    """Return the installed path of the ClusterMon external cleanup agent.

    Must name the same file init_external_agent_f() installs; the
    original returned 'ns_cleanup.sh', which is not the shipped
    files/ns_ovs_cleanup.sh, so ClusterMon pointed at a missing script.
    """
    agent = 'ns_ovs_cleanup.sh'
    exec_dir = '/usr/lib/ocf/resource.d/openstack'
    return os.path.join(exec_dir, agent)
def cache_env_data():
    """Snapshot NetworkServiceContext to /etc/legacy_ha_env_data.

    monitor.py reads this file (one KEY=VALUE per line) to build
    keystone credentials when re-scheduling agents.  Logs and returns
    silently when the context is not yet complete.
    """
    env = NetworkServiceContext()()
    if not env:
        log('Unable to get NetworkServiceContext at this time', level=ERROR)
        return
    with open('/etc/legacy_ha_env_data', 'w') as f:
        for k, v in env.items():
            # str.join takes one iterable; the original passed four
            # positional arguments and raised TypeError on every call.
            f.write('%s=%s\n' % (k, v))

View File

@ -29,6 +29,9 @@ requires:
interface: rabbitmq
neutron-plugin-api:
interface: neutron-plugin-api
ha:
interface: hacluster
scope: container
peers:
cluster:
interface: quantum-gateway-ha