fff75c5eb4
When neutron routers need migration, make neutron-ha-tool's monitor action return OCF_ERR_GENERIC, not OCF_NOT_RUNNING.

This is based on the OCF Resource Agent Developer’s Guide, which says in the section for OCF_ERR_GENERIC:

    "The action returned a generic error. A resource agent should use this exit code only when none of the more specific error codes, defined below, accurately describes the problem. The cluster resource manager interprets this exit code as a soft error. This means that unless specifically configured otherwise, the resource manager will attempt to recover a resource which failed with OCF_ERR_GENERIC in-place — usually by restarting the resource on the same node."
    -- http://www.linux-ha.org/doc/dev-guides/_literal_ocf_err_generic_literal_1.html

and also in the section for OCF_NOT_RUNNING:

    "If the resource is not running due to an error condition, the monitor action should instead return one of the OCF_ERR_ exit codes or OCF_FAILED_MASTER."
    -- http://www.linux-ha.org/doc/dev-guides/_literal_ocf_not_running_literal_7.html

Change-Id: I55f78a5c341a8a552e06a252a9c6836877c0cf77
340 lines
11 KiB
Bash
340 lines
11 KiB
Bash
#!/bin/sh
|
|
#
|
|
#
|
|
# OpenStack HA tool for Neutron (neutron-ha-tool)
|
|
#
|
|
# This resource agent wraps the neutron-ha-tool Python script.
|
|
# It can be used to monitor neutron for the availability of the
|
|
# l3-agents and migrate routers away from agents that are
|
|
# currently offline. Additionally it makes sure that dns and dhcp
|
|
# configuration is synchronized across all dhcp-agents. The
|
|
# neutron-ha-tool was originally part of the openstack-network
|
|
# cookbook for Chef. However as of icehouse it got dropped
|
|
# from upstream, and is now maintained here:
|
|
#
|
|
# https://github.com/SUSE-Cloud/cookbook-openstack-network/blob/neutron-ha-tool-maintenance/files/default/neutron-ha-tool.py
|
|
#
|
|
# You can see a brief explanation of how this RA works in this
|
|
# video:
|
|
#
|
|
# https://youtu.be/vBZgtHgSdOY?t=33m39s
|
|
#
|
|
# Authors: Ralf Haferkamp
|
|
# Mainly inspired by the Neutron L3 resource agent written by Emilien Macchi
|
|
#
|
|
# Support: openstack@lists.openstack.org
|
|
# License: Apache Software License (ASL) 2.0
|
|
#
|
|
#
|
|
# See usage() function below for more details ...
|
|
#
|
|
# OCF instance parameters:
#   OCF_RESKEY_binary
#   OCF_RESKEY_os_auth_url
#   OCF_RESKEY_os_region_name
#   OCF_RESKEY_os_username
#   OCF_RESKEY_os_password
#   OCF_RESKEY_os_tenant_name
#   OCF_RESKEY_os_insecure
#   OCF_RESKEY_os_cacert
|
|
#######################################################################
|
|
# Initialization:
#
# Default OCF_FUNCTIONS_DIR to ${OCF_ROOT}/lib/heartbeat unless the caller
# already set it, then load the OCF shell helper library from there.  The
# rest of this script relies on names it expects that library to provide
# (ocf_log, ocf_is_true, check_binary, the OCF_* exit codes, ...).
# Both expansions are quoted so a directory containing whitespace or glob
# characters is passed through unchanged.

: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
|
|
|
|
#######################################################################
|
|
|
|
# Fill in some defaults if no values are specified.
#
# Each OCF_RESKEY_<name>_default holds the fallback value, which is also
# what meta_data() advertises.  The ": ${VAR=default}" idiom assigns the
# fallback only when the variable is completely unset; an explicitly empty
# value supplied by the cluster is kept as-is.  The expansions are quoted so
# that values containing whitespace or glob characters (e.g. a password with
# '*') are not word-split or pathname-expanded while being passed to ':'.

OCF_RESKEY_binary_default="neutron-ha-tool"
OCF_RESKEY_os_auth_url_default="http://localhost:5000/v2"
OCF_RESKEY_os_region_name_default=""
OCF_RESKEY_os_username_default="admin"
OCF_RESKEY_os_password_default=""
OCF_RESKEY_os_tenant_name_default="admin"
OCF_RESKEY_os_insecure_default="0"
OCF_RESKEY_os_cacert_default=""

: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
: "${OCF_RESKEY_os_auth_url=${OCF_RESKEY_os_auth_url_default}}"
: "${OCF_RESKEY_os_region_name=${OCF_RESKEY_os_region_name_default}}"
: "${OCF_RESKEY_os_tenant_name=${OCF_RESKEY_os_tenant_name_default}}"
: "${OCF_RESKEY_os_username=${OCF_RESKEY_os_username_default}}"
: "${OCF_RESKEY_os_password=${OCF_RESKEY_os_password_default}}"
: "${OCF_RESKEY_os_insecure=${OCF_RESKEY_os_insecure_default}}"
: "${OCF_RESKEY_os_cacert=${OCF_RESKEY_os_cacert_default}}"
|
|
|
|
#######################################################################
|
|
|
|
# Print a short help text to stdout: the supported actions and a one-line
# description of each.  $0 is expanded inside the (unquoted) here-document,
# so the text shows the name the agent was invoked as.
usage() {
    cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|status|monitor)

$0 manages the Neutron HA tool (neutron-ha-tool) as an HA resource

The 'start' operation triggers a migrations of all routers on offline
l3-agents to l3-agents that are actually online.
The 'stop' is basically noop.
The 'validate-all' operation reports whether the parameters are valid.
The 'meta-data' operation reports this RA's meta-data information.
The 'status' operation reports whether the networking service is running.
The 'monitor' operation reports whether there are some routers assigned
to l3-agents that are currently offline.

UEND
}
|
|
|
|
# Emit the resource agent's OCF meta-data XML on stdout.
#
# The here-document is unquoted on purpose: the ${OCF_RESKEY_*_default}
# references are expanded so the advertised defaults always match the values
# assigned near the top of this script.
#
# os_cacert is declared as type "string" because it carries a file name
# (neutron_ha_tool_validate tests it with -f); it was previously declared
# "boolean", which misdescribes the parameter to tools reading the meta-data.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="neutron-ha-tool">
<version>1.0</version>

<longdesc lang="en">
This resource agent wraps the Neutron HA Tool (neutron-ha-tool)
and can be used to check neutron for offline l3-agents that
have routers assigend and migrate those routers to a different
(online) l3-agent.
</longdesc>
<shortdesc lang="en">Manages the OpenStack Neutron HA Tool (neutron-ha-tool)</shortdesc>
<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the OpenStack Neutron HA Tool binary (neutron-ha-tool)
</longdesc>
<shortdesc lang="en">OpenStack Neutron HA Tool binary (neutron-ha-tool)</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="os_auth_url" unique="0" required="0">
<longdesc lang="en">
The URL pointing to the Keystone instance to use for authentication.
</longdesc>
<shortdesc lang="en">Keystone URL</shortdesc>
<content type="string" default="${OCF_RESKEY_os_auth_url_default}" />
</parameter>

<parameter name="os_region_name" unique="0" required="0">
<longdesc lang="en">
The region name to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Keystone region name</shortdesc>
<content type="string" default="${OCF_RESKEY_os_region_name_default}" />
</parameter>

<parameter name="os_password" unique="0" required="0">
<longdesc lang="en">
The password to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Password for authentication</shortdesc>
<content type="string" default="${OCF_RESKEY_os_password_default}" />
</parameter>

<parameter name="os_tenant_name" unique="0" required="0">
<longdesc lang="en">
The Tenant to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Tenant name for authentication</shortdesc>
<content type="string" default="${OCF_RESKEY_os_tenant_name_default}" />
</parameter>

<parameter name="os_username" unique="0" required="0">
<longdesc lang="en">
OpenStack Username for authentication.
</longdesc>
<shortdesc lang="en">OpenStack Username</shortdesc>
<content type="string" default="${OCF_RESKEY_os_username_default}" />
</parameter>

<parameter name="os_insecure" unique="0" required="0">
<longdesc lang="en">
Disable SSL certificate verification.
</longdesc>
<shortdesc lang="en">Disable SSL certificate verification</shortdesc>
<content type="boolean" default="${OCF_RESKEY_os_insecure_default}" />
</parameter>

<parameter name="os_cacert" unique="0" required="0">
<longdesc lang="en">
Filename of a SSL CA Certificate Bundle to use for Server Certificate
verification.
</longdesc>
<shortdesc lang="en">SSL CA Bundle file</shortdesc>
<content type="string" default="${OCF_RESKEY_os_cacert_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
|
|
|
|
#######################################################################
|
|
# Functions invoked by resource manager actions
|
|
|
|
# Validate the agent's configuration.
#
# Globals:  OCF_RESKEY_binary (read), OCF_RESKEY_os_cacert (read)
# Returns:  $OCF_SUCCESS when the configuration looks usable,
#           $OCF_ERR_GENERIC when a CA bundle is configured but is not an
#           existing regular file.
#
# check_binary is handed the binary as a single quoted argument so a path
# containing whitespace is checked as one path.
# NOTE(review): check_binary comes from the OCF shell library; presumably it
# terminates the script itself when the binary is missing - confirm there.
neutron_ha_tool_validate() {
    check_binary "$OCF_RESKEY_binary"

    # The CA bundle is optional; only verify it when one was configured.
    if [ -n "$OCF_RESKEY_os_cacert" ] && [ ! -f "$OCF_RESKEY_os_cacert" ]; then
        ocf_log err "Failed to verify CA Certifcate Bundle" \
            "($OCF_RESKEY_os_cacert)"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}
|
|
|
|
# Handle the 'status' action.
#
# neutron-ha-tool is not a daemon, so there is no process to look for;
# the action unconditionally reports success.
#
# Returns: $OCF_SUCCESS, always.
neutron_ha_tool_status() {
    return $OCF_SUCCESS
}
|
|
|
|
# Handle the 'monitor' action.
#
# Globals:  statefile (read), OCF_RESKEY_binary (read),
#           OCF_RESKEY_os_insecure (read), INSECURE and rc (written)
# Returns:  $OCF_NOT_RUNNING  when the state file is absent, i.e. the
#                             resource is not active on this node;
#           $OCF_ERR_GENERIC  when the l3-agent check exits with status 2
#                             (routers need migration);
#           $OCF_SUCCESS      otherwise.
#
# Any other non-zero exit status from the check is still reported as
# success, exactly as before, but it is now logged as a warning instead of
# being recorded as "monitor succeeded".
neutron_ha_tool_monitor() {
    if ! [ -e "$statefile" ]; then
        # neutron-ha-tool is run on a single node at a time, i.e. in
        # active/passive mode.  So we use this state file to keep
        # track of whether it's active on the current node, and if
        # Pacemaker does a probe on a node where it's not active, we
        # skip the l3-agent check and always return OCF_NOT_RUNNING,
        # otherwise we'd get messages from pengine like:
        #
        #   error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on
        #     2 nodes attempting recovery
        #   warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active
        #     for more information.
        #
        # and Pacemaker could attempt unnecessary recovery according to the
        # value of the cluster-wide "multiple-active" option.
        ocf_log debug "neutron-ha-tool not currently active on this node; " \
            "skipping l3-agent check"
        return $OCF_NOT_RUNNING
    fi

    INSECURE=""
    if ocf_is_true "$OCF_RESKEY_os_insecure"; then
        INSECURE="--insecure"
    fi

    # $INSECURE is deliberately left unquoted: when empty it must vanish
    # instead of being passed as an empty argument.
    "${OCF_RESKEY_binary}" --l3-agent-check --quiet $INSECURE
    rc=$?

    case "$rc" in
        0)
            ocf_log debug "Neutron HA Tool (neutron-ha-tool) monitor succeeded"
            ;;
        2)
            ocf_log err "Some Neutron routers need migration."
            return $OCF_ERR_GENERIC
            ;;
        *)
            # Not the "routers need migration" signal; keep treating it as
            # non-fatal, but leave a trace so the failure is visible.
            ocf_log warn "Neutron HA Tool (neutron-ha-tool) l3-agent check" \
                "exited with unexpected status $rc; not treating it as a" \
                "monitor failure"
            ;;
    esac

    return $OCF_SUCCESS
}
|
|
|
|
# Handle the 'start' action.
#
# Marks the resource as active on this node by creating the state file,
# then runs neutron-ha-tool twice: first with --replicate-dhcp, then with
# --l3-agent-migrate --now.
#
# Globals:  statefile (read), OCF_RESKEY_binary (read),
#           OCF_RESKEY_os_insecure (read), INSECURE, retry and rc (written)
# Returns:  $OCF_SUCCESS when both runs exit 0;
#           $OCF_ERR_GENERIC when the state file cannot be created or
#           either run exits non-zero.
neutron_ha_tool_start() {
    touch "$statefile"
    if ! [ -e "$statefile" ]; then
        ocf_log err "Failed to create $statefile - aborting!"
        return $OCF_ERR_GENERIC
    fi

    INSECURE=""
    if ocf_is_true "$OCF_RESKEY_os_insecure"; then
        INSECURE="--insecure"
    fi

    # Remain backwards-compatible with older neutron-ha-tool.py which
    # don't support --retry: only pass the flag if --help mentions it.
    retry=""
    if "${OCF_RESKEY_binary}" --help | grep -q -- --retry; then
        retry="--retry"
    fi

    # $retry and $INSECURE are deliberately unquoted so that they disappear
    # from the command line when empty.
    "${OCF_RESKEY_binary}" --replicate-dhcp $retry $INSECURE
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "Neutron HA Tool failed to replicate networks to DHCP" \
            "agents."
        return $OCF_ERR_GENERIC
    fi

    "${OCF_RESKEY_binary}" --l3-agent-migrate $retry --now $INSECURE
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "Neutron HA Tool failed to migrate routers away from" \
            "offline L3 agents."
        return $OCF_ERR_GENERIC
    fi

    ocf_log debug "Neutron HA Tool (neutron-ha-tool) router migration" \
        "succeeded."
    return $OCF_SUCCESS
}
|
|
|
|
# Handle the 'stop' action.
#
# There is no process to stop; the only work is to remove the state file
# so later monitor calls on this node report "not running".
#
# Globals:  statefile (read)
# Returns:  $OCF_SUCCESS when the state file is gone afterwards (including
#           when it never existed), $OCF_ERR_GENERIC when it is still there.
neutron_ha_tool_stop() {
    # '--' protects against a state file path that starts with a dash.
    rm -f -- "$statefile"

    if [ -e "$statefile" ]; then
        ocf_log err "Uh-oh - failed to remove $statefile!"
        # If we can't even remove a file in tmpfs (/run), something
        # is *really* badly wrong, so fence the node.
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}
|
|
|
|
#######################################################################
|
|
|
|
# Actions that need neither a valid configuration nor the OpenStack
# environment are served first and end the script right away; every other
# action falls through to the code below.
case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        usage
        exit $OCF_SUCCESS
        ;;
esac
|
|
|
|
# Every action other than meta-data and help requires a configuration that
# passes validation; on failure, leave with the validator's own status.
neutron_ha_tool_validate
validate_rc=$?
if [ "$validate_rc" -ne 0 ]; then
    exit "$validate_rc"
fi
|
|
|
|
# OPENSTACK env variables
#
# Hand the credentials to neutron-ha-tool through the environment.  The
# values are quoted because some /bin/sh implementations treat
# "export NAME=$value" like an ordinary command and word-split the
# expansion, which would truncate e.g. a password containing a space.
export OS_AUTH_URL="$OCF_RESKEY_os_auth_url"
export OS_REGION_NAME="$OCF_RESKEY_os_region_name"
export OS_TENANT_NAME="$OCF_RESKEY_os_tenant_name"
export OS_USERNAME="$OCF_RESKEY_os_username"
export OS_PASSWORD="$OCF_RESKEY_os_password"
# Only export a CA bundle when one is configured, so an empty value never
# replaces the client's default.
if [ -n "$OCF_RESKEY_os_cacert" ]; then
    export OS_CACERT="$OCF_RESKEY_os_cacert"
fi

# Marker file recording that the resource is active on this node; created
# by start, removed by stop, consulted by monitor.
# NOTE(review): HA_RSCTMP and OCF_RESOURCE_INSTANCE are not set in this
# script; presumably they come from the OCF library / cluster manager.
statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
|
|
|
|
# What kind of method was invoked?
#
# Dispatch the requested action to its handler.  This is the last command
# of the script, so the handler's return value becomes the exit status.
# 'validate-all' has nothing left to do: validation already ran above and
# would have exited on failure, so the empty arm yields success.
case "$1" in
    start)
        neutron_ha_tool_start
        ;;
    stop)
        neutron_ha_tool_stop
        ;;
    status)
        neutron_ha_tool_status
        ;;
    monitor)
        neutron_ha_tool_monitor
        ;;
    validate-all)
        ;;
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
|