openstack-resource-agents/ocf/neutron-ha-tool
Adam Spiers fff75c5eb4 neutron-ha-tool: fix monitor return code
When neutron routers need migration, make neutron-ha-tool's monitor
action return OCF_ERR_GENERIC not OCF_NOT_RUNNING.  This is based on the
OCF Resource Agent Developer’s Guide, which says in the section for
OCF_ERR_GENERIC:

    The action returned a generic error. A resource agent should use
    this exit code only when none of the more specific error codes,
    defined below, accurately describes the problem.

    The cluster resource manager interprets this exit code as a soft
    error. This means that unless specifically configured otherwise, the
    resource manager will attempt to recover a resource which failed
    with OCF_ERR_GENERIC in-place — usually by restarting the resource
    on the same node.

      -- http://www.linux-ha.org/doc/dev-guides/_literal_ocf_err_generic_literal_1.html

and also in the section for OCF_NOT_RUNNING:

    If the resource is not running due to an error condition, the
    monitor action should instead return one of the OCF_ERR_ exit codes
    or OCF_FAILED_MASTER.

      -- http://www.linux-ha.org/doc/dev-guides/_literal_ocf_not_running_literal_7.html

Change-Id: I55f78a5c341a8a552e06a252a9c6836877c0cf77
2016-04-01 20:27:11 +01:00

340 lines
11 KiB
Bash

#!/bin/sh
#
#
# OpenStack HA tool for Neutron (neutron-ha-tool)
#
# This resource agent wraps the neutron-ha-tool Python script.
# It can be used to monitor neutron for the availability of the
# l3-agents and migrate routers away from agents that are
# currently offline. Additionally it makes sure that dns and dhcp
# configuration is synchronized across all dhcp-agents. The
# neutron-ha-tool was originally part of the openstack-network
# cookbook for Chef. However as of icehouse it got dropped
# from upstream, and is now maintained here:
#
# https://github.com/SUSE-Cloud/cookbook-openstack-network/blob/neutron-ha-tool-maintenance/files/default/neutron-ha-tool.py
#
# You can see a brief explanation of how this RA works in this
# video:
#
# https://youtu.be/vBZgtHgSdOY?t=33m39s
#
# Authors: Ralf Haferkamp
# Mainly inspired by the Neutron L3 resource agent written by Emilien Macchi
#
# Support: openstack@lists.openstack.org
# License: Apache Software License (ASL) 2.0
#
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_os_auth_url
# OCF_RESKEY_os_region_name
# OCF_RESKEY_os_username
# OCF_RESKEY_os_password
# OCF_RESKEY_os_tenant_name
# OCF_RESKEY_os_insecure
# OCF_RESKEY_os_cacert
#######################################################################
# Initialization:
# Work out where the OCF shell function library lives (defaulting to
# lib/heartbeat below OCF_ROOT unless OCF_FUNCTIONS_DIR is already set) and
# source it.  Both expansions are quoted so that a directory name containing
# whitespace or glob characters is used verbatim instead of being split.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################
# Built-in defaults for the instance parameters.
#
# Every parameter has an OCF_RESKEY_<name>_default value, which meta_data()
# also advertises.  The matching OCF_RESKEY_<name> variable only receives
# that default when it is unset; a parameter that was explicitly set to the
# empty string is left untouched ("=" rather than ":=" expansion).

# Executable that implements the actual HA logic
OCF_RESKEY_binary_default="neutron-ha-tool"
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"

# Keystone endpoint and region
OCF_RESKEY_os_auth_url_default="http://localhost:5000/v2"
: "${OCF_RESKEY_os_auth_url=${OCF_RESKEY_os_auth_url_default}}"
OCF_RESKEY_os_region_name_default=""
: "${OCF_RESKEY_os_region_name=${OCF_RESKEY_os_region_name_default}}"

# Credentials
OCF_RESKEY_os_username_default="admin"
: "${OCF_RESKEY_os_username=${OCF_RESKEY_os_username_default}}"
OCF_RESKEY_os_password_default=""
: "${OCF_RESKEY_os_password=${OCF_RESKEY_os_password_default}}"
OCF_RESKEY_os_tenant_name_default="admin"
: "${OCF_RESKEY_os_tenant_name=${OCF_RESKEY_os_tenant_name_default}}"

# SSL handling
OCF_RESKEY_os_insecure_default="0"
: "${OCF_RESKEY_os_insecure=${OCF_RESKEY_os_insecure_default}}"
OCF_RESKEY_os_cacert_default=""
: "${OCF_RESKEY_os_cacert=${OCF_RESKEY_os_cacert_default}}"
#######################################################################
# Print a short help text on stdout: the accepted actions followed by a
# one-line description of each.  $0 is expanded inside the here-document so
# the text shows the name the agent was invoked as.
usage() {
    cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|status|monitor|usage|help)
$0 manages the Neutron HA tool (neutron-ha-tool) as an HA resource
The 'start' operation triggers a migration of all routers on offline
l3-agents to l3-agents that are actually online.
The 'stop' operation is basically a noop.
The 'validate-all' operation reports whether the parameters are valid.
The 'meta-data' operation reports this RA's meta-data information.
The 'status' operation reports whether the networking service is running.
The 'monitor' operation reports whether there are some routers assigned
to l3-agents that are currently offline.
The 'usage' and 'help' operations print this message.
UEND
}
# Print the resource agent meta-data (XML) on stdout.
#
# The here-document is unquoted on purpose: the default="..." attributes are
# filled in from the OCF_RESKEY_*_default variables at call time.
# os_cacert is declared as a string parameter because it holds a file name
# (neutron_ha_tool_validate tests it with -f), not a boolean.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="neutron-ha-tool">
<version>1.0</version>
<longdesc lang="en">
This resource agent wraps the Neutron HA Tool (neutron-ha-tool)
and can be used to check neutron for offline l3-agents that
have routers assigned and migrate those routers to a different
(online) l3-agent.
</longdesc>
<shortdesc lang="en">Manages the OpenStack Neutron HA Tool (neutron-ha-tool)</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the OpenStack Neutron HA Tool binary (neutron-ha-tool)
</longdesc>
<shortdesc lang="en">OpenStack Neutron HA Tool binary (neutron-ha-tool)</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="os_auth_url" unique="0" required="0">
<longdesc lang="en">
The URL pointing to the Keystone instance to use for authentication.
</longdesc>
<shortdesc lang="en">Keystone URL</shortdesc>
<content type="string" default="${OCF_RESKEY_os_auth_url_default}" />
</parameter>
<parameter name="os_region_name" unique="0" required="0">
<longdesc lang="en">
The region name to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Keystone region name</shortdesc>
<content type="string" default="${OCF_RESKEY_os_region_name_default}" />
</parameter>
<parameter name="os_password" unique="0" required="0">
<longdesc lang="en">
The password to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Password for authentication</shortdesc>
<content type="string" default="${OCF_RESKEY_os_password_default}" />
</parameter>
<parameter name="os_tenant_name" unique="0" required="0">
<longdesc lang="en">
The Tenant to use for authentication against keystone.
</longdesc>
<shortdesc lang="en">Tenant name for authentication</shortdesc>
<content type="string" default="${OCF_RESKEY_os_tenant_name_default}" />
</parameter>
<parameter name="os_username" unique="0" required="0">
<longdesc lang="en">
OpenStack Username for authentication.
</longdesc>
<shortdesc lang="en">OpenStack Username</shortdesc>
<content type="string" default="${OCF_RESKEY_os_username_default}" />
</parameter>
<parameter name="os_insecure" unique="0" required="0">
<longdesc lang="en">
Disable SSL certificate verification.
</longdesc>
<shortdesc lang="en">Disable SSL certificate verification</shortdesc>
<content type="boolean" default="${OCF_RESKEY_os_insecure_default}" />
</parameter>
<parameter name="os_cacert" unique="0" required="0">
<longdesc lang="en">
Filename of a SSL CA Certificate Bundle to use for Server Certificate
verification.
</longdesc>
<shortdesc lang="en">SSL CA Bundle file</shortdesc>
<content type="string" default="${OCF_RESKEY_os_cacert_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
# Validate the configuration before any real action is attempted.
#
# Hands $OCF_RESKEY_binary to check_binary (not defined in this file;
# presumably provided by ocf-shellfuncs) and, only when os_cacert is
# non-empty, requires it to name an existing regular file.
#
# Returns 0 when the checks pass and 1 when the CA bundle file is missing.
neutron_ha_tool_validate() {
    check_binary $OCF_RESKEY_binary

    # No CA bundle configured: nothing more to verify.
    [ -z "$OCF_RESKEY_os_cacert" ] && return 0

    # A CA bundle was configured and is present.
    [ -f "$OCF_RESKEY_os_cacert" ] && return 0

    ocf_log err "Failed to verify CA Certifcate Bundle" \
        "($OCF_RESKEY_os_cacert)"
    return 1
}
# 'status' action.
#
# No daemon belongs to this resource, so there is no process to inspect;
# the action simply reports success every time.
neutron_ha_tool_status() {
    return ${OCF_SUCCESS}
}
# 'monitor' action.
#
# Returns:
#   OCF_NOT_RUNNING  - the state file is absent, i.e. the resource is not
#                      active on this node (the l3-agent check is skipped)
#   OCF_ERR_GENERIC  - the tool's --l3-agent-check exited with status 2
#   OCF_SUCCESS      - the tool exited with any other status
neutron_ha_tool_monitor() {
    # The tool runs in active/passive mode, on one node at a time, and
    # the state file records whether this node is the active one.  When
    # Pacemaker probes a node where it is not active we must answer
    # OCF_NOT_RUNNING without consulting neutron; otherwise pengine would
    # complain along the lines of
    #
    #   error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on
    #          2 nodes attempting recovery
    #   warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active
    #            for more information.
    #
    # and might start a needless recovery, depending on the cluster-wide
    # "multiple-active" option.
    if [ ! -e "$statefile" ]; then
        ocf_log debug "neutron-ha-tool not currently active on this node; " \
            "skipping l3-agent check"
        return $OCF_NOT_RUNNING
    fi

    # Pass --insecure through only when os_insecure is true.
    if ocf_is_true $OCF_RESKEY_os_insecure; then
        INSECURE="--insecure"
    else
        INSECURE=""
    fi

    ${OCF_RESKEY_binary} --l3-agent-check --quiet $INSECURE
    rc=$?

    # Only exit status 2 is reported as a failure.
    case $rc in
        2)
            ocf_log err "Some Neutron routers need migration."
            return $OCF_ERR_GENERIC
            ;;
    esac

    ocf_log debug "Neutron HA Tool (neutron-ha-tool) monitor succeeded"
    return $OCF_SUCCESS
}
# 'start' action.
#
# Creates the state file that marks this node as active, then runs the tool
# twice: first with --replicate-dhcp, then with --l3-agent-migrate --now.
#
# Returns OCF_SUCCESS when both runs exit 0, and OCF_ERR_GENERIC when the
# state file cannot be created or either run exits non-zero.
neutron_ha_tool_start() {
    touch "$statefile"
    [ -e "$statefile" ] || {
        ocf_log err "Failed to create $statefile - aborting!"
        return $OCF_ERR_GENERIC
    }

    # Pass --insecure through only when os_insecure is true.
    if ocf_is_true $OCF_RESKEY_os_insecure; then
        INSECURE="--insecure"
    else
        INSECURE=""
    fi

    # Older neutron-ha-tool.py versions do not know --retry, so the option
    # is only used when the tool's own --help output mentions it.
    if ${OCF_RESKEY_binary} --help | grep -q -- --retry; then
        retry="--retry"
    else
        retry=""
    fi

    # Step 1: replicate networks to the DHCP agents.
    ${OCF_RESKEY_binary} --replicate-dhcp $retry $INSECURE
    rc=$?
    [ $rc -eq 0 ] || {
        ocf_log err "Neutron HA Tool failed to replicate networks to DHCP" \
            "agents."
        return $OCF_ERR_GENERIC
    }

    # Step 2: move routers away from offline L3 agents.
    ${OCF_RESKEY_binary} --l3-agent-migrate $retry --now $INSECURE
    rc=$?
    [ $rc -eq 0 ] || {
        ocf_log err "Neutron HA Tool failed to migrate routers away from" \
            "offline L3 agents."
        return $OCF_ERR_GENERIC
    }

    ocf_log debug "Neutron HA Tool (neutron-ha-tool) router migration" \
        "succeeded."
    return $OCF_SUCCESS
}
# 'stop' action.
#
# Removes the state file that marks this node as active.  Returns
# OCF_SUCCESS once the file is gone and OCF_ERR_GENERIC if it still exists
# after the removal attempt.
neutron_ha_tool_stop() {
    rm -f "$statefile"

    # Normal case: the file is gone (or never existed).
    [ -e "$statefile" ] || return $OCF_SUCCESS

    # Not being able to delete a file in tmpfs (/run) means something is
    # *really* badly wrong; report the stop as failed so the node is fenced.
    ocf_log err "Uh-oh - failed to remove $statefile!"
    return $OCF_ERR_GENERIC
}
#######################################################################
# Informational actions: answered immediately and the script exits with
# OCF_SUCCESS.  Any other action falls through this case statement.
case "$1" in
    meta-data)  meta_data; exit $OCF_SUCCESS ;;
    usage|help) usage;     exit $OCF_SUCCESS ;;
esac
# Anything except meta-data and help must pass validation; on failure the
# script exits with the status validation returned.
neutron_ha_tool_validate || exit $?

# OPENSTACK env variables, exported for the commands this agent runs.
# The values are quoted because some /bin/sh implementations treat the
# arguments of "export" as ordinary words and would field-split an unquoted
# value (e.g. a password containing whitespace).
export OS_AUTH_URL="$OCF_RESKEY_os_auth_url"
export OS_REGION_NAME="$OCF_RESKEY_os_region_name"
export OS_TENANT_NAME="$OCF_RESKEY_os_tenant_name"
export OS_USERNAME="$OCF_RESKEY_os_username"
export OS_PASSWORD="$OCF_RESKEY_os_password"

# OS_CACERT is only exported when a CA bundle was configured.
if [ -n "$OCF_RESKEY_os_cacert" ]; then
    export OS_CACERT="$OCF_RESKEY_os_cacert"
fi

# Marker file: it exists while this resource instance is active on this
# node (created by start, removed by stop, tested by monitor).
statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
# Dispatch the requested action to its handler.  No explicit exit follows
# in the visible code, so the handler's return status is the last status
# set by this statement.
case "$1" in
    start)   neutron_ha_tool_start ;;
    stop)    neutron_ha_tool_stop ;;
    status)  neutron_ha_tool_status ;;
    monitor) neutron_ha_tool_monitor ;;
    # Nothing left to do: validation already ran before this point.
    validate-all) ;;
    # Unknown action: print the help text and report it as unimplemented.
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac