Merge from quantum-gateway chunk

Hui Xiang 2015-01-15 17:40:34 +08:00
commit e777443c5a
15 changed files with 528 additions and 590 deletions

View File

@@ -9,3 +9,4 @@ include:
- contrib.python.packages
- contrib.storage.linux
- payload.execd
- contrib.charmsupport
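The new contrib.charmsupport entry pulls the NRPE helper shown later in this diff into the synced charm-helpers tree. A one-line sketch of the import it enables once charm-helpers-sync has been run (assuming the synced copy sits under hooks/charmhelpers/, as is conventional for these charms):

# Available once contrib.charmsupport has been synced into the charm.
from charmhelpers.contrib.charmsupport import nrpe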

View File

@@ -104,6 +104,16 @@ options:
default: nova
type: string
description: Database name
nagios_context:
default: "juju"
type: string
description: |
Used by the nrpe-external-master subordinate charm.
A string that will be prepended to instance name to set the host name
in nagios. So for instance the hostname would be something like:
juju-myservice-0
If you're running multiple environments with the same services in them
this allows you to differentiate between them.
# Network configuration options
# by default all access is over 'private-address'
os-data-network:
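To make the nagios_context description above concrete: the NRPE helper added later in this diff builds the Nagios host name as <nagios_context>-<unit-name>. A minimal sketch of that composition (the unit name is hypothetical):

# Mirrors NRPE.__init__ in charmsupport/nrpe.py further down in this diff.
nagios_context = 'juju'
unit_name = 'quantum-gateway/0'.replace('/', '-')   # hypothetical unit
hostname = '{}-{}'.format(nagios_context, unit_name)
print(hostname)   # -> juju-quantum-gateway-0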

View File

@@ -1,155 +0,0 @@
#!/bin/sh
#
#
# NeutronAgentMon OCF RA.
# Starts the neutron-ha-monitor daemon in the background and
# reports whether it is still running.
#
# Copyright 2014 Canonical Ltd.
#
# Authors: Hui Xiang <hui.xiang@canonical.com>
# Edward Hope-Morley <edward.hope-morley@canonical.com>
#
# OCF instance parameters:
# OCF_RESKEY_file
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="NeutronAgentMon">
<version>1.0</version>
<longdesc lang="en">
This is a NeutronAgentMon Resource Agent.
It monitors the status of the neutron-ha-monitor daemon.
</longdesc>
<shortdesc lang="en">Monitor '/usr/local/bin/neutron-ha-monitor.py' in the background.</shortdesc>
<parameters>
<parameter name="file" unique="0">
<longdesc lang="en">
The file we want to run as a daemon.
</longdesc>
<shortdesc lang="en">The file we want to run as a daemon.</shortdesc>
<content type="string" default="/usr/local/bin/neutron-ha-monitor.py" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" depth="0" timeout="20" interval="60" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
NeutronAgentMon_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
NeutronAgentMon_exit() {
if [ $1 != 0 ]; then
exit $OCF_ERR_GENERIC
else
exit $OCF_SUCCESS
fi
}
NeutronAgentMon_start() {
pid=`sudo ps -aux | grep neutron-ha-m\[o\]nitor.py | awk -F' ' '{print $2}'`
if [ -z $pid ]; then
ocf_log info "[NeutronAgentMon_start] Start Monitor daemon."
sudo mkdir -p /var/log/neutron-ha
sudo python /usr/local/bin/neutron-ha-monitor.py \
--config-file /var/lib/juju-neutron-ha/neutron-ha-monitor.conf \
--log-file /var/log/neutron-ha/monitor.log >> /dev/null 2>&1 & echo $!
sleep 5
else
ocf_log warn "[NeutronAgentMon_start] Monitor daemon already running."
fi
NeutronAgentMon_exit $?
}
NeutronAgentMon_stop() {
pid=`sudo ps -aux | grep neutron-ha-m\[o\]nitor.py | awk -F' ' '{print $2}'`
if [ ! -z $pid ]; then
sudo kill -s 9 $pid
ocf_log info "[NeutronAgentMon_stop] Pid $pid is killed."
else
ocf_log warn "[NeutronAgentMon_stop] Monitor daemon already stopped."
fi
NeutronAgentMon_exit 0
}
NeutronAgentMon_monitor() {
pid=`sudo ps -aux | grep neutron-ha-m\[o\]nitor.py | awk -F' ' '{print $2}'`
if [ ! -z $pid ]; then
ocf_log info "[NeutronAgentMon_monitor] success."
exit $OCF_SUCCESS
fi
exit $OCF_NOT_RUNNING
}
NeutronAgentMon_validate() {
# Existence of the user
if [ -f $OCF_RESKEY_file ]; then
echo "Validate OK"
return $OCF_SUCCESS
else
ocf_log err "The file $OCF_RESKEY_file does not exist!"
exit $OCF_ERR_ARGS
fi
}
if [ $# -ne 1 ]; then
NeutronAgentMon_usage
exit $OCF_ERR_ARGS
fi
: ${OCF_RESKEY_update:="15000"}
: ${OCF_RESKEY_pidfile:="/tmp/NeutronAgentMon_${OCF_RESOURCE_INSTANCE}.pid"}
: ${OCF_RESKEY_htmlfile:="/tmp/NeutronAgentMon_${OCF_RESOURCE_INSTANCE}.html"}
OCF_RESKEY_update=`expr $OCF_RESKEY_update / 1000`
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) NeutronAgentMon_start
;;
stop) NeutronAgentMon_stop
;;
monitor) NeutronAgentMon_monitor
;;
validate-all) NeutronAgentMon_validate
;;
usage|help) NeutronAgentMon_usage
exit $OCF_SUCCESS
;;
*) NeutronAgentMon_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
exit $?
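All of the start/stop/monitor actions above ultimately reduce to "does neutron-ha-monitor.py appear in the process list?". A rough Python equivalent of that liveness check, kept as a sketch (the ps-scraping approach and the script name come from the agent above; nothing here is the charm's actual code):

import subprocess

def monitor_daemon_running(script='neutron-ha-monitor.py'):
    # Return True if any process command line mentions the monitor script.
    ps = subprocess.check_output(['ps', '-eo', 'pid,args'],
                                 universal_newlines=True)
    return any(script in line for line in ps.splitlines())

if __name__ == '__main__':
    print('running' if monitor_daemon_running() else 'not running')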

View File

@@ -1,4 +0,0 @@
[DEFAULT]
verbose=True
#debug=True
check_interval=8

View File

@@ -1,430 +0,0 @@
# Copyright 2014 Canonical Ltd.
#
# Authors: Hui Xiang <hui.xiang@canonical.com>
# Joshua Zhang <joshua.zhang@canonical.com>
# Edward Hope-Morley <edward.hope-morley@canonical.com>
#
"""
Helpers for monitoring Neutron agents, rescheduling resources away from
failed agents, and cleaning up resources left on failed nodes.
"""
import os
import re
import sys
import signal
import socket
import subprocess
import time
from oslo.config import cfg
from neutron.agent.linux import ovs_lib
from neutron.agent.linux import ip_lib
from neutron.common import exceptions
from neutron.openstack.common import log as logging
LOG = logging.getLogger(__name__)
class Daemon(object):
"""A generic daemon class.
Usage: subclass the Daemon class and override the run() method
"""
def __init__(self, stdin='/dev/null', stdout='/dev/null',
stderr='/dev/null', procname='python'):
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
self.procname = procname
def _fork(self):
try:
pid = os.fork()
if pid > 0:
sys.exit(0)
except OSError:
LOG.exception('Fork failed')
sys.exit(1)
def daemonize(self):
"""Daemonize process by doing Stevens double fork."""
# fork first time
self._fork()
# decouple from parent environment
os.chdir("/")
os.setsid()
os.umask(0)
# fork second time
self._fork()
# redirect standard file descriptors
sys.stdout.flush()
sys.stderr.flush()
stdin = open(self.stdin, 'r')
stdout = open(self.stdout, 'a+')
stderr = open(self.stderr, 'a+', 0)
os.dup2(stdin.fileno(), sys.stdin.fileno())
os.dup2(stdout.fileno(), sys.stdout.fileno())
os.dup2(stderr.fileno(), sys.stderr.fileno())
signal.signal(signal.SIGTERM, self.handle_sigterm)
def handle_sigterm(self, signum, frame):
sys.exit(0)
def start(self):
"""Start the daemon."""
self.daemonize()
self.run()
def run(self):
"""Override this method when subclassing Daemon.
start() will call this method after the process has daemonized.
"""
pass
class MonitorNeutronAgentsDaemon(Daemon):
def __init__(self):
super(MonitorNeutronAgentsDaemon, self).__init__()
logging.setup('Neutron-HA-Monitor')
LOG.info('Monitor Neutron Agent Loop Init')
self.hostname = None
self.env = {}
def get_env(self):
envrc_f = '/etc/legacy_ha_envrc'
envrc_f_m = False
if os.path.isfile(envrc_f):
ctime = time.ctime(os.stat(envrc_f).st_ctime)
mtime = time.ctime(os.stat(envrc_f).st_mtime)
if ctime != mtime:
envrc_f_m = True
if not self.env or envrc_f_m:
with open(envrc_f, 'r') as f:
for line in f:
data = line.strip().split('=')
if data and data[0] and data[1]:
self.env[data[0]] = data[1]
else:
raise Exception("OpenStack env data incomplete.")
return self.env
def get_hostname(self):
if not self.hostname:
self.hostname = socket.gethostname()
return self.hostname
def get_root_helper(self):
return 'sudo'
def list_monitor_res(self):
# List crm resource 'cl_monitor' running node
nodes = []
cmd = ['crm', 'resource', 'show', 'cl_monitor']
output = subprocess.check_output(cmd)
pattern = re.compile('resource cl_monitor is running on: (.*) ')
nodes = pattern.findall(output)
return nodes
def get_crm_res_lead_node(self):
nodes = self.list_monitor_res()
if nodes:
return nodes[0].strip()
else:
LOG.error('Failed to get crm resource.')
return None
def unplug_device(self, device):
try:
device.link.delete()
except RuntimeError:
root_helper = self.get_root_helper()
# Maybe the device is OVS port, so try to delete
bridge_name = ovs_lib.get_bridge_for_iface(root_helper,
device.name)
if bridge_name:
bridge = ovs_lib.OVSBridge(bridge_name, root_helper)
bridge.delete_port(device.name)
else:
LOG.debug('Unable to find bridge for device: %s', device.name)
def get_pattern(self, key, text):
if not key or not text:
LOG.debug('Invalid key(%s) or text(%s)' % (key, text))
return None
pattern = re.compile('%s' % key)
result = pattern.findall(text)
return result
def _cleanup(self, key1, key2):
namespaces = []
if key1:
for k in key1.iterkeys():
namespaces.append(key2 + '-' + k)
else:
try:
cmd = ['sudo', 'ip', 'netns']
ns = subprocess.check_output(cmd)
namespaces = self.get_pattern('(%s.*)' % key2, ns)
except RuntimeError as e:
LOG.error('Failed to list namespace, (%s)' % e)
if namespaces:
LOG.info('Namespaces: %s is going to be deleted.' % namespaces)
self.destroy_namespaces(namespaces)
def cleanup_dhcp(self, networks):
self._cleanup(networks, 'qdhcp')
def cleanup_router(self, routers):
self._cleanup(routers, 'qrouter')
def destroy_namespaces(self, namespaces):
try:
root_helper = self.get_root_helper()
for namespace in namespaces:
ip = ip_lib.IPWrapper(root_helper, namespace)
if ip.netns.exists(namespace):
for device in ip.get_devices(exclude_loopback=True):
self.unplug_device(device)
ip.garbage_collect_namespace()
except Exception:
LOG.exception('Unable to destroy namespace: %s', namespace)
def is_same_host(self, host):
return str(host).strip() == self.get_hostname()
def validate_reschedule(self):
crm_no_1_node = self.get_crm_res_lead_node()
if not crm_no_1_node:
LOG.error('No crm first node could be found.')
return False
if not self.is_same_host(crm_no_1_node):
LOG.warn('Only the first crm node %s could reschedule. '
% crm_no_1_node)
return False
return True
def l3_agents_reschedule(self, l3_agents, routers, quantum):
if not self.validate_reschedule():
return
index = 0
for router_id in routers:
agent = index % len(l3_agents)
LOG.info('Moving router %s from %s to %s' %
(router_id, routers[router_id], l3_agents[agent]))
try:
quantum.remove_router_from_l3_agent(l3_agent=routers[router_id],
router_id=router_id)
except exceptions.NeutronException as e:
LOG.error('Remove router raised exception: %s' % e)
try:
quantum.add_router_to_l3_agent(l3_agent=l3_agents[agent],
body={'router_id': router_id})
except exceptions.NeutronException as e:
LOG.error('Add router raised exception: %s' % e)
index += 1
def dhcp_agents_reschedule(self, dhcp_agents, networks, quantum):
if not self.validate_reschedule():
return
index = 0
for network_id in networks:
agent = index % len(dhcp_agents)
LOG.info('Moving network %s from %s to %s' % (network_id,
networks[network_id], dhcp_agents[agent]))
try:
quantum.remove_network_from_dhcp_agent(
dhcp_agent=networks[network_id], network_id=network_id)
except exceptions.NeutronException as e:
LOG.error('Remove network raised exception: %s' % e)
try:
quantum.add_network_to_dhcp_agent(
dhcp_agent=dhcp_agents[agent],
body={'network_id': network_id})
except exceptions.NeutronException as e:
LOG.error('Add network raised exception: %s' % e)
index += 1
def get_quantum_client(self):
env = self.get_env()
if not env:
LOG.info('Unable to re-assign resources at this time')
return None
try:
from quantumclient.v2_0 import client
except ImportError:
# Try to import neutronclient instead for havana+
from neutronclient.v2_0 import client
auth_url = '%(auth_protocol)s://%(keystone_host)s:%(auth_port)s/v2.0' \
% env
quantum = client.Client(username=env['service_username'],
password=env['service_password'],
tenant_name=env['service_tenant'],
auth_url=auth_url,
region_name=env['region'])
return quantum
def reassign_agent_resources(self, quantum=None):
"""Use agent scheduler API to detect down agents and re-schedule"""
if not quantum:
LOG.error('Failed to get quantum client.')
return
try:
DHCP_AGENT = "DHCP Agent"
L3_AGENT = "L3 Agent"
agents = quantum.list_agents(agent_type=DHCP_AGENT)
except exceptions.NeutronException as e:
LOG.error('Failed to get quantum agents, %s' % e)
return
dhcp_agents = []
l3_agents = []
networks = {}
for agent in agents['agents']:
hosted_networks = quantum.list_networks_on_dhcp_agent(
agent['id'])['networks']
if not agent['alive']:
LOG.info('DHCP Agent %s down' % agent['id'])
for network in hosted_networks:
networks[network['id']] = agent['id']
if self.is_same_host(agent['host']):
self.cleanup_dhcp(networks)
else:
dhcp_agents.append(agent['id'])
LOG.info('Active dhcp agents: %s' % agent['id'])
if not hosted_networks and self.is_same_host(agent['host']):
self.cleanup_dhcp(None)
agents = quantum.list_agents(agent_type=L3_AGENT)
routers = {}
for agent in agents['agents']:
hosted_routers = quantum.list_routers_on_l3_agent(
agent['id'])['routers']
if not agent['alive']:
LOG.info('L3 Agent %s down' % agent['id'])
for router in hosted_routers:
routers[router['id']] = agent['id']
if self.is_same_host(agent['host']):
self.cleanup_router(routers)
else:
l3_agents.append(agent['id'])
LOG.info('Active l3 agents: %s' % agent['id'])
if not hosted_routers and self.is_same_host(agent['host']):
self.cleanup_router(None)
if not networks and not routers:
LOG.info('No networks and routers hosted on failed agents.')
return
if len(dhcp_agents) == 0 and len(l3_agents) == 0:
LOG.error('Unable to relocate resources, there are %s dhcp_agents '
'and %s l3_agents in this cluster' % (len(dhcp_agents),
len(l3_agents)))
return
if len(l3_agents) > 0:
self.l3_agents_reschedule(l3_agents, routers, quantum)
# a newly scheduled l3 node will not create a tunnel unless the ovs agent is restarted
if len(dhcp_agents) > 0:
self.dhcp_agents_reschedule(dhcp_agents, networks, quantum)
def check_ovs_tunnel(self, quantum=None):
if not quantum:
LOG.error('Failed to get quantum client.')
return
try:
OVS_AGENT = 'Open vSwitch agent'
agents = quantum.list_agents(agent_type=OVS_AGENT)
except exceptions.NeutronException as e:
LOG.error('No ovs agent found on localhost, error:%s.' % e)
return
for agent in agents['agents']:
if self.is_same_host(agent['host']):
conf = agent['configurations']
if 'gre' in conf['tunnel_types'] and conf['l2_population'] \
and conf['devices']:
LOG.warning('local ovs agent:%s' % agent)
ovs_output = subprocess.check_output(['ovs-vsctl',
'list-ports', 'br-tun'])
ports = ovs_output.strip().split('\n')
look_up_gre_port = False
for port in ports:
if port.startswith('gre-'):
look_up_gre_port = True
break
if not look_up_gre_port:
try:
LOG.error('Found namespace, but no ovs tunnel is created, '
'restarting ovs agent.')
cmd = ['sudo', 'service', 'neutron-plugin-openvswitch-agent',
'restart']
subprocess.call(cmd)
except subprocess.CalledProcessError:
LOG.error('Failed to restart neutron-plugin-openvswitch-agent.')
def check_local_agents(self):
services = ['openvswitch-switch', 'neutron-dhcp-agent',
'neutron-metadata-agent', 'neutron-vpn-agent']
for s in services:
status = ['sudo', 'service', s, 'status']
restart = ['sudo', 'service', s, 'restart']
start = ['sudo', 'service', s, 'start']
stop = '%s stop/waiting' % s
try:
output = subprocess.check_output(status)
if output.strip() == stop:
subprocess.check_output(start)
if s == 'neutron-metadata-agent':
subprocess.check_output(['sudo', 'service',
'neutron-vpn-agent',
'restart'])
except subprocess.CalledProcessError:
LOG.error('Restart service: %s' % s)
subprocess.check_output(restart)
if s == 'neutron-metadata-agent':
subprocess.check_output(['sudo', 'service',
'neutron-vpn-agent',
'restart'])
def run(self):
while True:
LOG.info('Monitor Neutron HA Agent Loop Start')
quantum = self.get_quantum_client()
self.reassign_agent_resources(quantum=quantum)
self.check_ovs_tunnel(quantum=quantum)
self.check_local_agents()
LOG.info('sleep %s' % cfg.CONF.check_interval)
time.sleep(float(cfg.CONF.check_interval))
if __name__ == '__main__':
opts = [
cfg.StrOpt('check_interval',
default=8,
help='Check Neutron Agents interval.'),
]
cfg.CONF.register_cli_opts(opts)
cfg.CONF(project='monitor_neutron_agents', default_config_files=[])
logging.setup('Neutron-HA-Monitor')
monitor_daemon = MonitorNeutronAgentsDaemon()
monitor_daemon.start()
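The l3_agents_reschedule/dhcp_agents_reschedule methods above spread orphaned routers and networks across the surviving agents with a simple index % len(agents) round-robin. A standalone sketch of just that distribution step (IDs are made up; no Neutron client is involved):

def round_robin(resources, agents):
    # Assign each orphaned resource to a surviving agent, cycling through them.
    assignment = {}
    for index, resource_id in enumerate(resources):
        assignment[resource_id] = agents[index % len(agents)]
    return assignment

orphaned_routers = ['router-1', 'router-2', 'router-3']   # hosted on a dead agent
surviving_agents = ['l3-agent-a', 'l3-agent-b']
print(round_robin(orphaned_routers, surviving_agents))
# e.g. {'router-1': 'l3-agent-a', 'router-2': 'l3-agent-b', 'router-3': 'l3-agent-a'}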

View File

@@ -0,0 +1,308 @@
"""Compatibility with the nrpe-external-master charm"""
# Copyright 2012 Canonical Ltd.
#
# Authors:
# Matthew Wedgwood <matthew.wedgwood@canonical.com>
import subprocess
import pwd
import grp
import os
import re
import shlex
import yaml
from charmhelpers.core.hookenv import (
config,
local_unit,
log,
relation_ids,
relation_set,
relations_of_type,
)
from charmhelpers.core.host import service
# This module adds compatibility with the nrpe-external-master and plain nrpe
# subordinate charms. To use it in your charm:
#
# 1. Update metadata.yaml
#
# provides:
# (...)
# nrpe-external-master:
# interface: nrpe-external-master
# scope: container
#
# and/or
#
# provides:
# (...)
# local-monitors:
# interface: local-monitors
# scope: container
#
# 2. Add the following to config.yaml
#
# nagios_context:
# default: "juju"
# type: string
# description: |
# Used by the nrpe subordinate charms.
# A string that will be prepended to instance name to set the host name
# in nagios. So for instance the hostname would be something like:
# juju-myservice-0
# If you're running multiple environments with the same services in them
# this allows you to differentiate between them.
# nagios_servicegroups:
# default: ""
# type: string
# description: |
# A comma-separated list of nagios servicegroups.
# If left empty, the nagios_context will be used as the servicegroup
#
# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master
#
# 4. Update your hooks.py with something like this:
#
# from charmsupport.nrpe import NRPE
# (...)
# def update_nrpe_config():
# nrpe_compat = NRPE()
# nrpe_compat.add_check(
# shortname = "myservice",
# description = "Check MyService",
# check_cmd = "check_http -w 2 -c 10 http://localhost"
# )
# nrpe_compat.add_check(
# "myservice_other",
# "Check for widget failures",
# check_cmd = "/srv/myapp/scripts/widget_check"
# )
# nrpe_compat.write()
#
# def config_changed():
# (...)
# update_nrpe_config()
#
# def nrpe_external_master_relation_changed():
# update_nrpe_config()
#
# def local_monitors_relation_changed():
# update_nrpe_config()
#
# 5. ln -s hooks.py nrpe-external-master-relation-changed
# ln -s hooks.py local-monitors-relation-changed
class CheckException(Exception):
pass
class Check(object):
shortname_re = '[A-Za-z0-9-_]+$'
service_template = ("""
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
define service {{
use active-service
host_name {nagios_hostname}
service_description {nagios_hostname}[{shortname}] """
"""{description}
check_command check_nrpe!{command}
servicegroups {nagios_servicegroup}
}}
""")
def __init__(self, shortname, description, check_cmd):
super(Check, self).__init__()
# XXX: could be better to calculate this from the service name
if not re.match(self.shortname_re, shortname):
raise CheckException("shortname must match {}".format(
Check.shortname_re))
self.shortname = shortname
self.command = "check_{}".format(shortname)
# Note: a set of invalid characters is defined by the
# Nagios server config
# The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
self.description = description
self.check_cmd = self._locate_cmd(check_cmd)
def _locate_cmd(self, check_cmd):
search_path = (
'/usr/lib/nagios/plugins',
'/usr/local/lib/nagios/plugins',
)
parts = shlex.split(check_cmd)
for path in search_path:
if os.path.exists(os.path.join(path, parts[0])):
command = os.path.join(path, parts[0])
if len(parts) > 1:
command += " " + " ".join(parts[1:])
return command
log('Check command not found: {}'.format(parts[0]))
return ''
def write(self, nagios_context, hostname, nagios_servicegroups=None):
nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format(
self.command)
with open(nrpe_check_file, 'w') as nrpe_check_config:
nrpe_check_config.write("# check {}\n".format(self.shortname))
nrpe_check_config.write("command[{}]={}\n".format(
self.command, self.check_cmd))
if not os.path.exists(NRPE.nagios_exportdir):
log('Not writing service config as {} is not accessible'.format(
NRPE.nagios_exportdir))
else:
self.write_service_config(nagios_context, hostname,
nagios_servicegroups)
def write_service_config(self, nagios_context, hostname,
nagios_servicegroups=None):
for f in os.listdir(NRPE.nagios_exportdir):
if re.search('.*{}.cfg'.format(self.command), f):
os.remove(os.path.join(NRPE.nagios_exportdir, f))
if not nagios_servicegroups:
nagios_servicegroups = nagios_context
templ_vars = {
'nagios_hostname': hostname,
'nagios_servicegroup': nagios_servicegroups,
'description': self.description,
'shortname': self.shortname,
'command': self.command,
}
nrpe_service_text = Check.service_template.format(**templ_vars)
nrpe_service_file = '{}/service__{}_{}.cfg'.format(
NRPE.nagios_exportdir, hostname, self.command)
with open(nrpe_service_file, 'w') as nrpe_service_config:
nrpe_service_config.write(str(nrpe_service_text))
def run(self):
subprocess.call(self.check_cmd)
class NRPE(object):
nagios_logdir = '/var/log/nagios'
nagios_exportdir = '/var/lib/nagios/export'
nrpe_confdir = '/etc/nagios/nrpe.d'
def __init__(self, hostname=None):
super(NRPE, self).__init__()
self.config = config()
self.nagios_context = self.config['nagios_context']
if 'nagios_servicegroups' in self.config:
self.nagios_servicegroups = self.config['nagios_servicegroups']
else:
self.nagios_servicegroups = 'juju'
self.unit_name = local_unit().replace('/', '-')
if hostname:
self.hostname = hostname
else:
self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
self.checks = []
def add_check(self, *args, **kwargs):
self.checks.append(Check(*args, **kwargs))
def write(self):
try:
nagios_uid = pwd.getpwnam('nagios').pw_uid
nagios_gid = grp.getgrnam('nagios').gr_gid
except:
log("Nagios user not set up, nrpe checks not updated")
return
if not os.path.exists(NRPE.nagios_logdir):
os.mkdir(NRPE.nagios_logdir)
os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)
nrpe_monitors = {}
monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
for nrpecheck in self.checks:
nrpecheck.write(self.nagios_context, self.hostname,
self.nagios_servicegroups)
nrpe_monitors[nrpecheck.shortname] = {
"command": nrpecheck.command,
}
service('restart', 'nagios-nrpe-server')
for rid in relation_ids("local-monitors"):
relation_set(relation_id=rid, monitors=yaml.dump(monitors))
def get_nagios_hostcontext(relation_name='nrpe-external-master'):
"""
Query relation with nrpe subordinate, return the nagios_host_context
:param str relation_name: Name of relation nrpe sub joined to
"""
for rel in relations_of_type(relation_name):
if 'nagios_hostname' in rel:
return rel['nagios_host_context']
def get_nagios_hostname(relation_name='nrpe-external-master'):
"""
Query relation with nrpe subordinate, return the nagios_hostname
:param str relation_name: Name of relation nrpe sub joined to
"""
for rel in relations_of_type(relation_name):
if 'nagios_hostname' in rel:
return rel['nagios_hostname']
def get_nagios_unit_name(relation_name='nrpe-external-master'):
"""
Return the nagios unit name prepended with host_context if needed
:param str relation_name: Name of relation nrpe sub joined to
"""
host_context = get_nagios_hostcontext(relation_name)
if host_context:
unit = "%s:%s" % (host_context, local_unit())
else:
unit = local_unit()
return unit
def add_init_service_checks(nrpe, services, unit_name):
"""
Add checks for each service in list
:param NRPE nrpe: NRPE object to add check to
:param list services: List of services to check
:param str unit_name: Unit name to use in check description
"""
for svc in services:
upstart_init = '/etc/init/%s.conf' % svc
sysv_init = '/etc/init.d/%s' % svc
if os.path.exists(upstart_init):
nrpe.add_check(
shortname=svc,
description='process check {%s}' % unit_name,
check_cmd='check_upstart_job %s' % svc
)
elif os.path.exists(sysv_init):
cronpath = '/etc/cron.d/nagios-service-check-%s' % svc
cron_file = ('*/5 * * * * root '
'/usr/local/lib/nagios/plugins/check_exit_status.pl '
'-s /etc/init.d/%s status > '
'/var/lib/nagios/service-check-%s.txt\n' % (svc,
svc)
)
f = open(cronpath, 'w')
f.write(cron_file)
f.close()
nrpe.add_check(
shortname=svc,
description='process check {%s}' % unit_name,
check_cmd='check_status_file.py -f '
'/var/lib/nagios/service-check-%s.txt' % svc,
)
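The comment block near the top already walks through the basic NRPE().add_check() flow; the add_init_service_checks() helper at the end is the piece the quantum-gateway hook below leans on. A hedged usage sketch (service names are illustrative, and the relation/config lookups only return real data inside a Juju hook environment):

# Sketch only: get_nagios_hostname()/get_nagios_unit_name() read relation data,
# so outside a hook they return None.
from charmhelpers.contrib.charmsupport import nrpe

hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
checks = nrpe.NRPE(hostname=hostname)
# Adds an upstart- or sysv-init-based process check per service.
nrpe.add_init_service_checks(checks,
                             ['neutron-dhcp-agent', 'neutron-metadata-agent'],
                             current_unit)
checks.write()   # writes /etc/nagios/nrpe.d/*.cfg and the nagios export files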

View File

@@ -0,0 +1,159 @@
'''
Functions for managing volumes in juju units. One volume is supported per unit.
Subordinates may have their own storage, provided it is on its own partition.
Configuration stanzas::
volume-ephemeral:
type: boolean
default: true
description: >
If false, a volume is mounted as specified in "volume-map"
If true, ephemeral storage will be used, meaning that log data
will only exist as long as the machine. YOU HAVE BEEN WARNED.
volume-map:
type: string
default: {}
description: >
YAML map of units to device names, e.g:
"{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }"
Service units will raise a configure-error if volume-ephemeral
is 'false' and no volume-map value is set. Use 'juju set' to set a
value and 'juju resolved' to complete configuration.
Usage::
from charmsupport.volumes import configure_volume, VolumeConfigurationError
from charmsupport.hookenv import log, ERROR
def pre_mount_hook():
stop_service('myservice')
def post_mount_hook():
start_service('myservice')
if __name__ == '__main__':
try:
configure_volume(before_change=pre_mount_hook,
after_change=post_mount_hook)
except VolumeConfigurationError:
log('Storage could not be configured', ERROR)
'''
# XXX: Known limitations
# - fstab is neither consulted nor updated
import os
from charmhelpers.core import hookenv
from charmhelpers.core import host
import yaml
MOUNT_BASE = '/srv/juju/volumes'
class VolumeConfigurationError(Exception):
'''Volume configuration data is missing or invalid'''
pass
def get_config():
'''Gather and sanity-check volume configuration data'''
volume_config = {}
config = hookenv.config()
errors = False
if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'):
volume_config['ephemeral'] = True
else:
volume_config['ephemeral'] = False
try:
volume_map = yaml.safe_load(config.get('volume-map', '{}'))
except yaml.YAMLError as e:
hookenv.log("Error parsing YAML volume-map: {}".format(e),
hookenv.ERROR)
errors = True
if volume_map is None:
# probably an empty string
volume_map = {}
elif not isinstance(volume_map, dict):
hookenv.log("Volume-map should be a dictionary, not {}".format(
type(volume_map)))
errors = True
volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME'])
if volume_config['device'] and volume_config['ephemeral']:
# asked for ephemeral storage but also defined a volume ID
hookenv.log('A volume is defined for this unit, but ephemeral '
'storage was requested', hookenv.ERROR)
errors = True
elif not volume_config['device'] and not volume_config['ephemeral']:
# asked for permanent storage but did not define volume ID
hookenv.log('Persistent storage was requested, but there is no volume '
'defined for this unit.', hookenv.ERROR)
errors = True
unit_mount_name = hookenv.local_unit().replace('/', '-')
volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name)
if errors:
return None
return volume_config
def mount_volume(config):
if os.path.exists(config['mountpoint']):
if not os.path.isdir(config['mountpoint']):
hookenv.log('Not a directory: {}'.format(config['mountpoint']))
raise VolumeConfigurationError()
else:
host.mkdir(config['mountpoint'])
if os.path.ismount(config['mountpoint']):
unmount_volume(config)
if not host.mount(config['device'], config['mountpoint'], persist=True):
raise VolumeConfigurationError()
def unmount_volume(config):
if os.path.ismount(config['mountpoint']):
if not host.umount(config['mountpoint'], persist=True):
raise VolumeConfigurationError()
def managed_mounts():
'''List of all mounted managed volumes'''
return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts())
def configure_volume(before_change=lambda: None, after_change=lambda: None):
'''Set up storage (or don't) according to the charm's volume configuration.
Returns the mount point or "ephemeral". before_change and after_change
are optional functions to be called if the volume configuration changes.
'''
config = get_config()
if not config:
hookenv.log('Failed to read volume configuration', hookenv.CRITICAL)
raise VolumeConfigurationError()
if config['ephemeral']:
if os.path.ismount(config['mountpoint']):
before_change()
unmount_volume(config)
after_change()
return 'ephemeral'
else:
# persistent storage
if os.path.ismount(config['mountpoint']):
mounts = dict(managed_mounts())
if mounts.get(config['mountpoint']) != config['device']:
before_change()
unmount_volume(config)
mount_volume(config)
after_change()
else:
before_change()
mount_volume(config)
after_change()
return config['mountpoint']
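To make the volume-map plumbing above concrete: the config value is a YAML map keyed by unit name, and get_config() picks out this unit's device and derives a per-unit mount point under MOUNT_BASE. A self-contained sketch of that lookup (unit and device names are hypothetical):

import os
import yaml

MOUNT_BASE = '/srv/juju/volumes'
volume_map_raw = '{ quantum-gateway/0: /dev/vdb, quantum-gateway/1: /dev/vdc }'
unit_name = 'quantum-gateway/0'

volume_map = yaml.safe_load(volume_map_raw) or {}
device = volume_map.get(unit_name)                          # '/dev/vdb'
mountpoint = os.path.join(MOUNT_BASE, unit_name.replace('/', '-'))
print('{} -> {}'.format(device, mountpoint))
# /dev/vdb -> /srv/juju/volumes/quantum-gateway-0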

View File

@@ -53,6 +53,7 @@ UBUNTU_OPENSTACK_RELEASE = OrderedDict([
('saucy', 'havana'),
('trusty', 'icehouse'),
('utopic', 'juno'),
('vivid', 'kilo'),
])
@@ -64,6 +65,7 @@ OPENSTACK_CODENAMES = OrderedDict([
('2013.2', 'havana'),
('2014.1', 'icehouse'),
('2014.2', 'juno'),
('2015.1', 'kilo'),
])
# The ugly duckling
@@ -84,6 +86,7 @@ SWIFT_CODENAMES = OrderedDict([
('2.0.0', 'juno'),
('2.1.0', 'juno'),
('2.2.0', 'juno'),
('2.2.1', 'kilo'),
])
DEFAULT_LOOPBACK_SIZE = '5G'
@@ -289,6 +292,9 @@ def configure_installation_source(rel):
'juno': 'trusty-updates/juno',
'juno/updates': 'trusty-updates/juno',
'juno/proposed': 'trusty-proposed/juno',
'kilo': 'trusty-updates/kilo',
'kilo/updates': 'trusty-updates/kilo',
'kilo/proposed': 'trusty-proposed/kilo',
}
try:
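The kilo rows above slot into three lookup tables: Ubuntu series to OpenStack release, OpenStack version string to codename, and install-source alias to cloud-archive pocket. A tiny sketch of how the first two get used (the dicts are trimmed to the rows touched by this diff):

from collections import OrderedDict

UBUNTU_OPENSTACK_RELEASE = OrderedDict([('utopic', 'juno'), ('vivid', 'kilo')])
OPENSTACK_CODENAMES = OrderedDict([('2014.2', 'juno'), ('2015.1', 'kilo')])

print(UBUNTU_OPENSTACK_RELEASE['vivid'])    # kilo
print(OPENSTACK_CODENAMES['2015.1'])        # kilo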

View File

@@ -64,9 +64,16 @@ CLOUD_ARCHIVE_POCKETS = {
'trusty-juno/updates': 'trusty-updates/juno',
'trusty-updates/juno': 'trusty-updates/juno',
'juno/proposed': 'trusty-proposed/juno',
'juno/proposed': 'trusty-proposed/juno',
'trusty-juno/proposed': 'trusty-proposed/juno',
'trusty-proposed/juno': 'trusty-proposed/juno',
# Kilo
'kilo': 'trusty-updates/kilo',
'trusty-kilo': 'trusty-updates/kilo',
'trusty-kilo/updates': 'trusty-updates/kilo',
'trusty-updates/kilo': 'trusty-updates/kilo',
'kilo/proposed': 'trusty-proposed/kilo',
'trusty-kilo/proposed': 'trusty-proposed/kilo',
'trusty-proposed/kilo': 'trusty-proposed/kilo',
}
# The order of this list is very important. Handlers should be listed in from
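CLOUD_ARCHIVE_POCKETS maps the user-facing openstack-origin/source aliases onto the actual Ubuntu Cloud Archive pocket. A hedged sketch of the resolution step (the deb-line format is the usual cloud-archive form; the real add_source() implementation lives elsewhere in fetch.py):

# Trimmed to the kilo entries added above.
CLOUD_ARCHIVE_POCKETS = {
    'kilo': 'trusty-updates/kilo',
    'trusty-kilo': 'trusty-updates/kilo',
    'kilo/proposed': 'trusty-proposed/kilo',
}

def cloud_archive_line(source):
    # Turn e.g. 'cloud:trusty-kilo' into an apt deb line (sketch only).
    alias = source.split(':', 1)[1]
    pocket = CLOUD_ARCHIVE_POCKETS[alias]
    return ('deb http://ubuntu-cloud.archive.canonical.com/ubuntu '
            '{} main'.format(pocket))

print(cloud_archive_line('cloud:trusty-kilo'))
# deb http://ubuntu-cloud.archive.canonical.com/ubuntu trusty-updates/kilo main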

View File

@@ -0,0 +1 @@
quantum_hooks.py

View File

@@ -0,0 +1 @@
quantum_hooks.py

View File

@@ -36,10 +36,13 @@ from charmhelpers.contrib.openstack.utils import (
from charmhelpers.payload.execd import execd_preinstall
from charmhelpers.core.sysctl import create as create_sysctl
from charmhelpers.contrib.charmsupport import nrpe
import sys
from quantum_utils import (
register_configs,
restart_map,
services,
do_openstack_upgrade,
get_packages,
get_early_packages,
@@ -92,6 +95,7 @@ def config_changed():
global CONFIGS
if openstack_upgrade_available(get_common_package()):
CONFIGS = do_openstack_upgrade()
update_nrpe_config()
sysctl_dict = config('sysctl')
if sysctl_dict:
@@ -234,6 +238,32 @@ def stop():
stop_services()
@hooks.hook('nrpe-external-master-relation-joined',
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# python-dbus is used by check_upstart_job
apt_install('python-dbus')
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
cronpath = '/etc/cron.d/nagios-netns-check'
cron_template = ('*/5 * * * * root '
'/usr/local/lib/nagios/plugins/check_netns.sh '
'> /var/lib/nagios/netns-check.txt\n'
)
f = open(cronpath, 'w')
f.write(cron_template)
f.close()
nrpe_setup.add_check(
shortname="netns",
description='Network Namespace check {%s}' % current_unit,
check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt'
)
nrpe_setup.write()
@hooks.hook('ha-relation-joined')
@hooks.hook('ha-relation-changed')
def ha_relation_joined():
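The two symlinks added earlier (nrpe-external-master-relation-joined/changed pointing at quantum_hooks.py) are what route those relation events into update_nrpe_config() above: Juju executes the symlink, and the charmhelpers Hooks dispatcher picks the handler from the hook name in sys.argv. A minimal, standalone sketch of that dispatch pattern (this is the general charmhelpers idiom, not the charm's actual main block):

import sys
from charmhelpers.core.hookenv import Hooks, UnregisteredHookError, log

hooks = Hooks()

@hooks.hook('nrpe-external-master-relation-joined',
            'nrpe-external-master-relation-changed')
def update_nrpe_config():
    log('would (re)write the NRPE check definitions here')

if __name__ == '__main__':
    try:
        # Juju invokes the hook via the symlink, so argv[0] carries its name.
        hooks.execute(sys.argv)
    except UnregisteredHookError as e:
        log('Unknown hook {} - skipping.'.format(e))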

View File

@@ -16,6 +16,9 @@ description: |
categories:
- openstack
provides:
nrpe-external-master:
interface: nrpe-external-master
scope: container
quantum-network-service:
interface: quantum
requires:

View File

@@ -42,6 +42,7 @@ TO_PATCH = [
'b64decode',
'is_relation_made',
'create_sysctl',
'update_nrpe_config',
'update_legacy_ha_files',
'add_hostname_to_hosts'
]
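Adding 'update_nrpe_config' to TO_PATCH means the hook tests replace that function with a mock before exercising config_changed(), so the new call can be asserted without touching NRPE or the filesystem. The mechanism underneath is ordinary mock patching; a tiny self-contained illustration (the FakeHooks class is a stand-in, not the charm's real test harness):

import mock   # on Python 3: from unittest import mock

class FakeHooks(object):
    # Stand-in for the hooks module under test.
    def update_nrpe_config(self):
        raise RuntimeError('would rewrite /etc/nagios/nrpe.d in a real hook')

    def config_changed(self):
        # ...other config handling elided...
        self.update_nrpe_config()

hooks = FakeHooks()
with mock.patch.object(hooks, 'update_nrpe_config') as fake_nrpe:
    hooks.config_changed()
    fake_nrpe.assert_called_once_with()
print('config_changed called update_nrpe_config')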