diff --git a/config.yaml b/config.yaml index a265454e..84ad40f1 100644 --- a/config.yaml +++ b/config.yaml @@ -115,3 +115,15 @@ options: . This network will be used for tenant network traffic in overlay networks. + # Legacy HA + ha-legacy-mode: + type: boolean + default: False + description: | + Support HA ACTIVE/PASSIVE mode with pacemaker and corosync before neutron + native HA feature landed to Juno. + ocf_ping_debug: + type: boolean + default: False + dns_hosts: + type: string diff --git a/files/MonitorNeutron b/files/MonitorNeutron new file mode 100644 index 00000000..fbb71318 --- /dev/null +++ b/files/MonitorNeutron @@ -0,0 +1,222 @@ +#!/bin/sh +# +# +# Neutron_Legacy_HA OCF +# +# Copyright (c) 2014 Hui Xiang +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs} +. ${OCF_FUNCTIONS} +: ${__OCF_ACTION=$1} + +####################################################################### + +meta_data() { + cat < + + +1.0 + + +Every time the monitor action is run, this resource agent records (in the CIB) the current number of ping nodes the host can connect to. +It is essentially the same as pingd except that it uses the system ping tool to obtain the results. + +node connectivity + + + + +PID file +PID file + + + + + +The time to wait (dampening) further changes occur + +Dampening interval + + + + + +The name of the attributes to set. This is the name to be used in the constraints. + +Attribute name + + + + + +The number by which to multiply the number of connected ping nodes by + +Value multiplier + + + + + +The list of ping nodes to count. + +Host list + + + + + +Number of ping attempts, per host, before declaring it dead + +no. of ping attempts + + + + + +How long, in seconds, to wait before declaring a ping lost + +ping timeout in seconds + + + + + +A catch all for any other options that need to be passed to ping. + +Extra Options + + + + + +Resource is failed if the score is less than failure_score. +Default never fails. + +failure_score + + + + + +Enables to use default attrd_updater verbose logging on every call. 
+ +Verbose logging + + + + + + + + + + + + + +END +} + +####################################################################### +OCF_RESKEY_binary_default="/usr/local/bin/monitor.py" +OCF_RESKEY_user_default="neutron" +OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} + +####################################################################### + +MonitorNeutron_conditional_log() { + level=$1; shift + if [ ${OCF_RESKEY_debug} = "true" ]; then + ocf_log $level "$*" + fi +} + +MonitorNeutron_usage() { + cat <> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid + + ping_monitor + if [ $? = $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + ocf_log info "MonitorNeutron started" +} + +MonitorNeutron_stop() { + # Try SIGTERM + pid=`cat $OCF_RESKEY_pid` + ocf_run kill -s TERM $pid + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "MonitorNeutron couldn't be stopped" + exit $OCF_ERR_GENERIC + fi + + ocf_log info "MonitorNeutron stopped" + return $OCF_SUCCESS +} + +MonitorNeutron_validate() { + # Is the state directory writable? + if [ ! -f $OCF_RESKEY_binary ]; then + ocf_log err "No file $OCF_RESKEY_binary exists !" + return $OCF_ERRARGS + fi + return $OCF_SUCCESS +} + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) MonitorNeutron_start;; +stop) MonitorNeutron_stop;; +reload) MonitorNeutron_start;; +validate-all) MonitorNeutron_validate;; +usage|help) MonitorNeutron_usage + exit $OCF_SUCCESS + ;; +*) MonitorNeutron_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? diff --git a/files/monitor.py b/files/monitor.py new file mode 100644 index 00000000..79716d6b --- /dev/null +++ b/files/monitor.py @@ -0,0 +1,263 @@ +# Copyright 2012 New Dream Network, LLC (DreamHost) +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import atexit +import fcntl +import os +import signal +import sys + +from neutron.openstack.common import log as logging + +LOG = logging.getLogger(__name__) + + +class Pidfile(object): + def __init__(self, pidfile, procname, uuid=None): + self.pidfile = pidfile + self.procname = procname + self.uuid = uuid + try: + self.fd = os.open(pidfile, os.O_CREAT | os.O_RDWR) + fcntl.flock(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except IOError: + LOG.exception(_("Error while handling pidfile: %s"), pidfile) + sys.exit(1) + + def __str__(self): + return self.pidfile + + def unlock(self): + if not not fcntl.flock(self.fd, fcntl.LOCK_UN): + raise IOError(_('Unable to unlock pid file')) + + def write(self, pid): + os.ftruncate(self.fd, 0) + os.write(self.fd, "%d" % pid) + os.fsync(self.fd) + + def read(self): + try: + pid = int(os.read(self.fd, 128)) + os.lseek(self.fd, 0, os.SEEK_SET) + return pid + except ValueError: + return + + def is_running(self): + pid = self.read() + if not pid: + return False + + cmdline = '/proc/%s/cmdline' % pid + try: + with open(cmdline, "r") as f: + exec_out = f.readline() + return self.procname in exec_out and (not self.uuid or + self.uuid in exec_out) + except IOError: + return False + + +class Daemon(object): + """A generic daemon class. + + Usage: subclass the Daemon class and override the run() method + """ + def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', + stderr='/dev/null', procname='python', uuid=None): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.procname = procname + self.pidfile = Pidfile(pidfile, procname, uuid) + + def _fork(self): + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError: + LOG.exception(_('Fork failed')) + sys.exit(1) + + def daemonize(self): + """Daemonize process by doing Stevens double fork.""" + # fork first time + self._fork() + + # decouple from parent environment + os.chdir("/") + os.setsid() + os.umask(0) + + # fork second time + self._fork() + + # redirect standard file descriptors + # redirect standard file descriptors + sys.stdout.flush() + sys.stderr.flush() + stdin = open(self.stdin, 'r') + stdout = open(self.stdout, 'a+') + stderr = open(self.stderr, 'a+', 0) + os.dup2(stdin.fileno(), sys.stdin.fileno()) + os.dup2(stdout.fileno(), sys.stdout.fileno()) + os.dup2(stderr.fileno(), sys.stderr.fileno()) + + # write pidfile + atexit.register(self.delete_pid) + signal.signal(signal.SIGTERM, self.handle_sigterm) + self.pidfile.write(os.getpid()) + def delete_pid(self): + os.remove(str(self.pidfile)) + + def handle_sigterm(self, signum, frame): + sys.exit(0) + + def start(self): + """Start the daemon.""" + + if self.pidfile.is_running(): + self.pidfile.unlock() + message = _('Pidfile %s already exist. Daemon already running?') + LOG.error(message, self.pidfile) + sys.exit(1) + + # Start the daemon + self.daemonize() + self.run() + + def run(self): + """Override this method when subclassing Daemon. + + start() will call this method after the process has daemonized. 
+ """ + pass + + +class MonitorNeutronAgentsDaemon(Daemon): + def __init__(self, check_interval=None): + self.check_interval = check_interval + log('Monitor Neutron Agent Loop Init') + + def get_env(): + env = {} + with open('/etc/legacy_ha_env_data', 'r') as f: + f.readline() + data = f.split('=').strip() + if data and data[0] and data[1]: + env[data[0]] = env[data[1]] + else: + raise Exception("OpenStack env data uncomplete.") + return env + + def reassign_agent_resources(): + ''' Use agent scheduler API to detect down agents and re-schedule ''' + env = get_env() + if not env: + log('Unable to re-assign resources at this time') + return + try: + from quantumclient.v2_0 import client + except ImportError: + ''' Try to import neutronclient instead for havana+ ''' + from neutronclient.v2_0 import client + + auth_url = '%(auth_protocol)s://%(keystone_host)s:%(auth_port)s/v2.0' % env + quantum = client.Client(username=env['service_username'], + password=env['service_password'], + tenant_name=env['service_tenant'], + auth_url=auth_url, + region_name=env['region']) + + partner_gateways = [unit_private_ip().split('.')[0]] + for partner_gateway in relations_of_type(reltype='cluster'): + gateway_hostname = get_hostname(partner_gateway['private-address']) + partner_gateways.append(gateway_hostname.partition('.')[0]) + + agents = quantum.list_agents(agent_type=DHCP_AGENT) + dhcp_agents = [] + l3_agents = [] + networks = {} + for agent in agents['agents']: + if not agent['alive']: + log('DHCP Agent %s down' % agent['id']) + for network in \ + quantum.list_networks_on_dhcp_agent( + agent['id'])['networks']: + networks[network['id']] = agent['id'] + else: + if agent['host'].partition('.')[0] in partner_gateways: + dhcp_agents.append(agent['id']) + + agents = quantum.list_agents(agent_type=L3_AGENT) + routers = {} + for agent in agents['agents']: + if not agent['alive']: + log('L3 Agent %s down' % agent['id']) + for router in \ + quantum.list_routers_on_l3_agent( + agent['id'])['routers']: + routers[router['id']] = agent['id'] + else: + if agent['host'].split('.')[0] in partner_gateways: + l3_agents.append(agent['id']) + + if len(dhcp_agents) == 0 or len(l3_agents) == 0: + log('Unable to relocate resources, there are %s dhcp_agents and %s \ + l3_agents in this cluster' % (len(dhcp_agents), len(l3_agents))) + return + + index = 0 + for router_id in routers: + agent = index % len(l3_agents) + log('Moving router %s from %s to %s' % + (router_id, routers[router_id], l3_agents[agent])) + quantum.remove_router_from_l3_agent(l3_agent=routers[router_id], + router_id=router_id) + quantum.add_router_to_l3_agent(l3_agent=l3_agents[agent], + body={'router_id': router_id}) + index += 1 + + index = 0 + for network_id in networks: + agent = index % len(dhcp_agents) + log('Moving network %s from %s to %s' % + (network_id, networks[network_id], dhcp_agents[agent])) + quantum.remove_network_from_dhcp_agent(dhcp_agent=networks[network_id], + network_id=network_id) + quantum.add_network_to_dhcp_agent(dhcp_agent=dhcp_agents[agent], + body={'network_id': network_id}) + index += 1 + + def run(): + log('Monitor Neutron Agent Loop Start') + time.sleep(self.check_interval) + reassign_agent_resources() + + +def main(): + opts = [ + cfg.StrOpt('check_interval', + default=15, + help=_('Check Neutron Agents interval.')), + ] + + cfg.CONF.register_cli_opts(opts) + cfg.CONF(project='monitor_neutron_agents', default_config_files=[]) + + monitor_daemon = MonitorNeutronAgentsDaemon( + check_interval=cfg.CONF.check_interval) + 
+    monitor_daemon.start()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/files/ns_ovs_cleanup.sh b/files/ns_ovs_cleanup.sh
new file mode 100755
index 00000000..9f54ab18
--- /dev/null
+++ b/files/ns_ovs_cleanup.sh
@@ -0,0 +1,13 @@
+#! /bin/bash
+
+if [[ ${CRM_notify_task} == 'monitor' && ${CRM_notify_desc} == 'unknown error' &&
+      $CRM_notify_rsc == 'res_PingCheck' ]]; then
+    hostname=`hostname`
+    if [ $hostname == $CRM_notify_node ]; then
+        echo "Cleaning up namespace and ovs on node $CRM_notify_node !"
+        for ns in $(ip netns list |grep 'qrouter-'); do ip netns delete $ns; done;
+        for ns in $(ip netns list |grep 'qdhcp-'); do ip netns delete $ns; done;
+        neutron-ovs-cleanup
+        echo "Cleaning done."
+    fi
+fi
diff --git a/files/reassign_agent_resources.conf b/files/reassign_agent_resources.conf
new file mode 100644
index 00000000..55615f47
--- /dev/null
+++ b/files/reassign_agent_resources.conf
@@ -0,0 +1,8 @@
+# vim: set ft=upstart et ts=2:
+description "Reassign Agent Resources for Legacy HA"
+author "Hui Xiang"
+
+start on runlevel [2345]
+stop on runlevel [!2345]
+
+exec start-stop-daemon --start --chuid neutron --exec /usr/local/bin/reassign_agent_services
diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py
index eb7bfb50..61124070 100755
--- a/hooks/quantum_hooks.py
+++ b/hooks/quantum_hooks.py
@@ -23,7 +23,8 @@ from charmhelpers.core.host import (
     lsb_release,
 )
 from charmhelpers.contrib.hahelpers.cluster import(
-    eligible_leader
+    eligible_leader,
+    get_hacluster_config
 )
 from charmhelpers.contrib.hahelpers.apache import(
     install_ca_cert
@@ -45,7 +46,11 @@ from quantum_utils import (
     valid_plugin,
     configure_ovs,
     reassign_agent_resources,
-    stop_services
+    stop_services,
+    cache_env_data,
+    get_dns_host,
+    get_external_agent_f,
+    install_legacy_ha_files
 )
 
 hooks = Hooks()
@@ -70,6 +75,9 @@ def install():
         log('Please provide a valid plugin config', level=ERROR)
         sys.exit(1)
 
+    # Legacy HA for Icehouse
+    install_legacy_ha_files()
+
 
 @hooks.hook('config-changed')
 @restart_on_change(restart_map())
@@ -103,6 +111,7 @@ def config_changed():
 def upgrade_charm():
     install()
     config_changed()
+    install_legacy_ha_files(update=True)
 
 
 @hooks.hook('shared-db-relation-joined')
@@ -206,6 +215,51 @@ def cluster_departed():
 def stop():
     stop_services()
 
+
+@hooks.hook('ha-relation-joined')
+@hooks.hook('ha-relation-changed')
def ha_relation_joined():
+    if config('ha-legacy-mode'):
+        cache_env_data()
+        dns_hosts = get_dns_host()
+        debug = config('ocf_ping_debug')
+        external_agent = get_external_agent_f()
+
+        cluster_config = get_hacluster_config(excludes_key=['vip'])
+        resources = {
+            'res_PingCheck': 'ocf:pacemaker:ping',
+            'res_ClusterMon': 'ocf:pacemaker:ClusterMon',
+            'res_MonitorHA': 'ocf:pacemaker:MonitorNeutron'
+        }
+        resource_params = {
+            'res_PingCheck': 'params host_list="{host}" dampen="5s" '
+                             'debug={debug} multiplier="100" '
+                             'failure_score="100" '
+                             'op monitor on-fail="restart" interval="10s" '
+                             'timeout="1000s" '.format(host=dns_hosts,
+                                                       debug=debug),
+            'res_ClusterMon': 'params user="root" update="30" '
+                              'extra_options="-E {external_agent}" '
+                              'op monitor on-fail="restart" interval="10s"'
+                              .format(external_agent=external_agent),
+            'res_MonitorHA': 'op monitor interval="5s" '
+                             'location needs_connectivity res_MonitorHA '
+                             'rule pingd: defined pingd'
+                             #'rule -inf: not_defined pingd or pingd lte 0'
+        }
+
+        clones = {
+            'cl_PingCheck': 'res_PingCheck',
+            'cl_ClusterMon': 'res_ClusterMon'
+        }
+
+        relation_set(corosync_bindiface=cluster_config['ha-bindiface'],
+                     corosync_mcastport=cluster_config['ha-mcastport'],
+                     resources=resources,
+                     resource_params=resource_params,
+                     clones=clones)
+
+
 if __name__ == '__main__':
     try:
         hooks.execute(sys.argv)
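To make the crm semantics of the `res_PingCheck` entry above easier to review, here is a minimal standalone sketch (not part of the patch) of how that parameter string renders once the `get_dns_host()` output and the `ocf_ping_debug` flag are substituted in; the host list and values are made-up examples.

```python
# Hypothetical stand-ins for get_dns_host() and config('ocf_ping_debug').
dns_hosts = '8.8.8.8 10.245.168.2 '
debug = False

# Same implicit string concatenation + .format() pattern as the charm hook.
res_ping_params = ('params host_list="{host}" dampen="5s" '
                   'debug={debug} multiplier="100" '
                   'failure_score="100" '
                   'op monitor on-fail="restart" interval="10s" '
                   'timeout="1000s" '.format(host=dns_hosts, debug=debug))

print(res_ping_params)
# params host_list="8.8.8.8 10.245.168.2 " dampen="5s" debug=False
# multiplier="100" failure_score="100" op monitor on-fail="restart"
# interval="10s" timeout="1000s"
```

Note that the Python booleans render as `True`/`False`; depending on how the ping agent validates its `debug` parameter, lowercasing the value may be necessary.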
diff --git a/hooks/quantum_utils.py b/hooks/quantum_utils.py
index 55a12807..87d3965e 100644
--- a/hooks/quantum_utils.py
+++ b/hooks/quantum_utils.py
@@ -1,11 +1,17 @@
+import os
+import shutil
+import stat
+import subprocess
 from charmhelpers.core.host import (
     service_running,
     service_stop,
     service_restart,
-    lsb_release
+    lsb_release,
+    mkdir
 )
 from charmhelpers.core.hookenv import (
     log,
+    ERROR,
     config,
     relations_of_type,
     unit_private_ip,
@@ -145,6 +151,8 @@ EARLY_PACKAGES = {
     N1KV: []
 }
 
+LEGACY_HA_TEMPLATE_FILES = 'files'
+
 
 def get_early_packages():
     '''Return a list of package for pre-install based on configured plugin'''
@@ -577,3 +585,97 @@ def configure_ovs():
         if data_port_ctx and data_port_ctx['data_port']:
             add_bridge_port(DATA_BRIDGE, data_port_ctx['data_port'],
                             promisc=True)
+
+
+def get_dns_host():
+    dns_hosts = ['8.8.8.8 ']
+    try:
+        nameservers = subprocess.check_output(['grep', 'nameserver',
+                                               '/etc/resolv.conf'])
+        for ns in nameservers.splitlines():
+            dns_hosts.append(ns.split()[1] + ' ')
+    except Exception:
+        log('Failed to get nameserver from resolv.conf !', level=ERROR)
+
+    if config('dns_hosts'):
+        dnss = config('dns_hosts').split(' ')
+        for dns in dnss:
+            dns_hosts.append(dns + ' ')
+
+    return ''.join(dns_hosts)
+
+
+def copy_file(source_dir, des_dir, f, f_mod=None, update=False):
+    if not os.path.isdir(des_dir):
+        mkdir(des_dir)
+        log('Directory created at: %s' % des_dir)
+
+    if not os.path.isfile(os.path.join(des_dir, f)) or update:
+        try:
+            source_f = os.path.join(source_dir, f)
+            des_f = os.path.join(des_dir, f)
+            shutil.copy2(source_f, des_dir)
+            if f_mod:
+                os.chmod(des_f, os.stat(des_f).st_mode | f_mod)
+        except IOError:
+            log('Failed to copy file from %s to %s.' %
+                (source_f, des_dir), level=ERROR)
+            raise
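As a quick illustration of the space-separated host list that `get_dns_host()` builds for the ping resource, here is a standalone sketch (not part of the patch) of the same nameserver parsing; the resolv.conf contents are invented for the example.

```python
# Made-up resolv.conf contents standing in for /etc/resolv.conf.
resolv_conf = """# Generated by resolvconf
nameserver 10.245.168.2
nameserver 8.8.8.8
"""

# Mirrors get_dns_host(): seed with the 8.8.8.8 default, then append each
# nameserver found, keeping a trailing space after every entry.
dns_hosts = ['8.8.8.8 ']
for line in resolv_conf.splitlines():
    if line.startswith('nameserver'):
        dns_hosts.append(line.split()[1] + ' ')

print(''.join(dns_hosts))   # -> '8.8.8.8 10.245.168.2 8.8.8.8 '
```

The seeded 8.8.8.8 default means a nameserver that also appears in resolv.conf shows up twice; since the ping agent simply pings every entry in host_list, the duplicate should be harmless.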
+
+
+def init_upstart_f_4_reassign_agent_resources():
+    upstart_f = 'reassign_agent_resources.conf'
+    exec_dir = '/etc/init'
+    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir, upstart_f)
+
+
+def init_ocf_MonitorNeutron_f(update=False):
+    ocf_f = 'MonitorNeutron'
+    exec_dir = '/usr/lib/ocf/resource.d/pacemaker'
+    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
+              ocf_f, update=update)
+
+
+def init_external_agent_f(update=False):
+    agent = 'ns_ovs_cleanup.sh'
+    exec_dir = '/usr/lib/ocf/resource.d/openstack'
+    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
+              agent, stat.S_IEXEC, update=update)
+
+
+def init_reassign_agent_services_binary():
+    service = 'reassign_agent_services'
+    exec_dir = '/usr/local/bin/'
+    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir, service, stat.S_IEXEC)
+
+
+def init_monitor_daemon(update=False):
+    service = 'monitor.py'
+    exec_dir = '/usr/local/bin/'
+    copy_file(LEGACY_HA_TEMPLATE_FILES, exec_dir,
+              service, stat.S_IEXEC, update=update)
+
+
+def install_legacy_ha_files(update=False):
+    if config('ha-legacy-mode'):
+        init_ocf_MonitorNeutron_f(update=update)
+        init_external_agent_f(update=update)
+        #init_reassign_agent_services_binary()
+        init_monitor_daemon(update=update)
+
+
+def get_external_agent_f():
+    agent = 'ns_ovs_cleanup.sh'
+    exec_dir = '/usr/lib/ocf/resource.d/openstack'
+    return os.path.join(exec_dir, agent)
+
+
+def cache_env_data():
+    env = NetworkServiceContext()()
+    if not env:
+        log('Unable to get NetworkServiceContext at this time', level=ERROR)
+        return
+
+    with open('/etc/legacy_ha_env_data', 'w') as f:
+        for k, v in env.items():
+            f.write('%s=%s\n' % (k, v))
diff --git a/metadata.yaml b/metadata.yaml
index f24dde9f..e7828c29 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -29,6 +29,9 @@ requires:
     interface: rabbitmq
   neutron-plugin-api:
     interface: neutron-plugin-api
+  ha:
+    interface: hacluster
+    scope: container
 peers:
   cluster:
     interface: quantum-gateway-ha
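The legacy-HA pieces communicate through /etc/legacy_ha_env_data: `cache_env_data()` above writes one key=value pair per line, and `get_env()` in monitor.py is meant to read it back before talking to Keystone and Neutron. The sketch below (not part of the patch) shows that round trip with example values and a corrected reader; the `get_env()` in this patch still reads only the first line and mis-indexes the split result. The keys mirror the ones `reassign_agent_resources()` actually consumes, the values are invented, and the path is moved to /tmp so the sketch runs unprivileged.

```python
# Example keys consumed by reassign_agent_resources(); values are made up.
env = {
    'keystone_host': '10.245.168.11',
    'auth_port': '35357',
    'auth_protocol': 'http',
    'service_username': 'quantum',
    'service_password': 'secret',
    'service_tenant': 'services',
    'region': 'RegionOne',
}

# Writer side (cache_env_data): one key=value pair per line.
with open('/tmp/legacy_ha_env_data', 'w') as f:
    for k, v in env.items():
        f.write('%s=%s\n' % (k, v))

# Reader side (what get_env should do): split each line on the first '='.
parsed = {}
with open('/tmp/legacy_ha_env_data') as f:
    for line in f:
        key, _, value = line.strip().partition('=')
        if key:
            parsed[key] = value

assert parsed == env
```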