diff --git a/neutron/templates/bin/_health-probe.py.tpl b/neutron/templates/bin/_health-probe.py.tpl new file mode 100644 index 0000000000..af702e1eae --- /dev/null +++ b/neutron/templates/bin/_health-probe.py.tpl @@ -0,0 +1,266 @@ +#!/usr/bin/env python2 + +# Copyright 2019 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Health probe script for OpenStack agents that uses RPC/unix domain socket for +communication. Sends message to agent through rpc call method and expects a +reply. It is expected to receive a failure from the agent's RPC server as the +method does not exist. + +Script returns failure to Kubernetes only when + a. agent is not reachable or + b. agent times out sending a reply. + +sys.stderr.write() writes to pod's events on failures. + +Usage example for Neutron L3 agent: +# python health-probe.py --config-file /etc/neutron/neutron.conf \ +# --config-file /etc/neutron/l3_agent.ini --agent-queue-name l3_agent + +Usage example for Neutron metadata agent: +# python health-probe.py --config-file /etc/neutron/neutron.conf \ +# --config-file /etc/neutron/metadata_agent.ini +""" + +import httplib2 +from six.moves import http_client as httplib +import os +import psutil +import socket +import sys + +from oslo_config import cfg +from oslo_context import context +from oslo_log import log +import oslo_messaging + +rabbit_port = 5672 +tcp_established = "ESTABLISHED" +log.logging.basicConfig(level=log.ERROR) + + +def check_agent_status(transport): + """Verify agent status. Return success if agent consumes message""" + try: + target = oslo_messaging.Target(topic=cfg.CONF.agent_queue_name, + server=socket.gethostname()) + client = oslo_messaging.RPCClient(transport, target, + timeout=60, + retry=2) + client.call(context.RequestContext(), + 'pod_health_probe_method_ignore_errors') + except oslo_messaging.exceptions.MessageDeliveryFailure: + # Log to pod events + sys.stderr.write("Health probe unable to reach message bus") + sys.exit(0) # return success + except oslo_messaging.rpc.client.RemoteError as re: + if ("Endpoint does not support RPC method" in re.message) or \ + ("Endpoint does not support RPC version" in re.message): + sys.exit(0) # Call reached the agent + else: + sys.stderr.write("Health probe unable to reach agent") + sys.exit(1) # return failure + except oslo_messaging.exceptions.MessagingTimeout: + sys.stderr.write("Health probe timed out. Agent is down or response " + "timed out") + sys.exit(1) # return failure + except Exception as ex: + sys.stderr.write("Health probe caught exception sending message to " + "agent: %s" % ex.message) + sys.exit(0) + except: + sys.stderr.write("Health probe caught exception sending message to" + " agent") + sys.exit(0) + + +def sriov_readiness_check(): + """Checks the sriov configuration on the sriov nic's""" + return_status = 1 + with open('/etc/neutron/plugins/ml2/sriov_agent.ini') as nic: + for phy in nic: + if "physical_device_mappings" in phy: + phy_dev = phy.split('=', 1)[1] + phy_dev1 = phy_dev.rstrip().split(',') + if not phy_dev1: + sys.stderr.write("No Physical devices" + " configured as SRIOV NICs") + sys.exit(1) + for intf in phy_dev1: + phy, dev = intf.split(':') + try: + with open('/sys/class/net/%s/device/' + 'sriov_numvfs' % dev) as f: + for line in f: + numvfs = line.rstrip('\n') + if numvfs: + return_status = 0 + except IOError: + sys.stderr.write("IOError:No sriov_numvfs config file") + sys.exit(return_status) + + +def tcp_socket_state_check(agentq): + """Check if the tcp socket to rabbitmq is in Established state""" + rabbit_sock_count = 0 + parentId = 0 + if agentq == "l3_agent": + proc = "neutron-l3-agen" + elif agentq == "dhcp_agent": + proc = "neutron-dhcp-ag" + elif agentq == "q-agent-notifier-tunnel-update": + proc = "neutron-openvsw" + else: + proc = "neutron-metadat" + + for pr in psutil.pids(): + try: + p = psutil.Process(pr) + if p.name() == proc: + if parentId == 0: + parentId = p.pid + else: + if p.ppid() == parentId: + continue + pcon = p.connections() + for con in pcon: + try: + port = con.raddr[1] + status = con.status + except IndexError: + continue + if port == rabbit_port and status == tcp_established: + rabbit_sock_count = rabbit_sock_count + 1 + except psutil.NoSuchProcess: + continue + + if rabbit_sock_count == 0: + sys.stderr.write("RabbitMQ sockets not Established") + # Do not kill the pod if RabbitMQ is not reachable/down + if not cfg.CONF.liveness_probe: + sys.exit(1) + + +class UnixDomainHTTPConnection(httplib.HTTPConnection): + """Connection class for HTTP over UNIX domain socket.""" + + def __init__(self, host, port=None, strict=None, timeout=None, + proxy_info=None): + httplib.HTTPConnection.__init__(self, host, port, strict) + self.timeout = timeout + self.socket_path = cfg.CONF.metadata_proxy_socket + + def connect(self): + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + if self.timeout: + self.sock.settimeout(self.timeout) + self.sock.connect(self.socket_path) + + +def test_socket_liveness(): + """Test if agent can respond to message over the socket""" + cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False, + required=False)) + cfg.CONF(sys.argv[1:]) + + agentq = "metadata_agent" + tcp_socket_state_check(agentq) + + try: + metadata_proxy_socket = cfg.CONF.metadata_proxy_socket + except cfg.NoSuchOptError: + cfg.CONF.register_opt(cfg.StrOpt( + 'metadata_proxy_socket', + default='/var/lib/neutron/openstack-helm/metadata_proxy')) + + headers = {'X-Forwarded-For': '169.254.169.254', + 'X-Neutron-Router-ID': 'pod-health-probe-check-ignore-errors'} + + h = httplib2.Http(timeout=30) + + try: + resp, content = h.request( + 'http://169.254.169.254', + method='GET', + headers=headers, + connection_type=UnixDomainHTTPConnection) + except socket.error as se: + msg = "Socket error: Health probe failed to connect to " \ + "Neutron Metadata agent: " + if se.strerror: + sys.stderr.write(msg + se.strerror) + elif se.message: + sys.stderr.write(msg + se.message) + sys.exit(1) # return failure + except Exception as ex: + sys.stderr.write("Health probe caught exception sending message to " + "Neutron Metadata agent: %s" % ex.message) + sys.exit(0) # return success + + if resp.status >= 500: # Probe expects HTTP error code 404 + msg = "Health probe failed: Neutron Metadata agent failed to" \ + " process request: " + sys.stderr.write(msg + str(resp.__dict__)) + sys.exit(1) # return failure + + +def test_rpc_liveness(): + """Test if agent can consume message from queue""" + oslo_messaging.set_transport_defaults(control_exchange='neutron') + + rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit', + title='RabbitMQ options') + cfg.CONF.register_group(rabbit_group) + cfg.CONF.register_cli_opt(cfg.StrOpt('agent-queue-name')) + cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False, + required=False)) + + cfg.CONF(sys.argv[1:]) + + try: + transport = oslo_messaging.get_transport(cfg.CONF) + except Exception as ex: + sys.stderr.write("Message bus driver load error: %s" % ex.message) + sys.exit(0) # return success + + if not cfg.CONF.transport_url or \ + not cfg.CONF.agent_queue_name: + sys.stderr.write("Both message bus URL and agent queue name are " + "required for Health probe to work") + sys.exit(0) # return success + + try: + cfg.CONF.set_override('rabbit_max_retries', 2, + group=rabbit_group) # 3 attempts + except cfg.NoSuchOptError as ex: + cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2), + group=rabbit_group) + + agentq = cfg.CONF.agent_queue_name + tcp_socket_state_check(agentq) + + check_agent_status(transport) + + +if __name__ == "__main__": + if "sriov_agent.ini" in ','.join(sys.argv): + sriov_readiness_check() + elif "metadata_agent.ini" not in ','.join(sys.argv): + test_rpc_liveness() + else: + test_socket_liveness() + + sys.exit(0) # return success diff --git a/neutron/templates/configmap-bin.yaml b/neutron/templates/configmap-bin.yaml index 3cee123839..0ee992df7d 100644 --- a/neutron/templates/configmap-bin.yaml +++ b/neutron/templates/configmap-bin.yaml @@ -45,6 +45,8 @@ data: {{- include "helm-toolkit.scripts.keystone_endpoints" . | indent 4 }} ks-user.sh: | {{- include "helm-toolkit.scripts.keystone_user" . | indent 4 }} + health-probe.py: | +{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} neutron-dhcp-agent.sh: | {{ tuple "bin/_neutron-dhcp-agent.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} neutron-l3-agent.sh: | diff --git a/neutron/templates/daemonset-dhcp-agent.yaml b/neutron/templates/daemonset-dhcp-agent.yaml index 8142852c23..013a7b2725 100644 --- a/neutron/templates/daemonset-dhcp-agent.yaml +++ b/neutron/templates/daemonset-dhcp-agent.yaml @@ -65,6 +65,35 @@ spec: {{ tuple $envAll $envAll.Values.pod.resources.agent.dhcp | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }} securityContext: privileged: true + readinessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/dhcp_agent.ini + - --agent-queue-name + - dhcp_agent + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 65 + livenessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/dhcp_agent.ini + - --agent-queue-name + - dhcp_agent + - --liveness-probe + initialDelaySeconds: 120 + periodSeconds: 90 + timeoutSeconds: 70 command: - /tmp/neutron-dhcp-agent.sh volumeMounts: @@ -72,6 +101,10 @@ spec: mountPath: /tmp/neutron-dhcp-agent.sh subPath: neutron-dhcp-agent.sh readOnly: true + - name: neutron-bin + mountPath: /tmp/health-probe.py + subPath: health-probe.py + readOnly: true - name: neutron-etc mountPath: /etc/neutron/neutron.conf subPath: neutron.conf diff --git a/neutron/templates/daemonset-l3-agent.yaml b/neutron/templates/daemonset-l3-agent.yaml index 078e4cbee6..a33b248028 100644 --- a/neutron/templates/daemonset-l3-agent.yaml +++ b/neutron/templates/daemonset-l3-agent.yaml @@ -65,6 +65,35 @@ spec: {{ tuple $envAll $envAll.Values.pod.resources.agent.l3 | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }} securityContext: privileged: true + readinessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/l3_agent.ini + - --agent-queue-name + - l3_agent + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 65 + livenessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/l3_agent.ini + - --agent-queue-name + - l3_agent + - --liveness-probe + initialDelaySeconds: 120 + periodSeconds: 90 + timeoutSeconds: 70 command: - /tmp/neutron-l3-agent.sh volumeMounts: @@ -72,6 +101,10 @@ spec: mountPath: /tmp/neutron-l3-agent.sh subPath: neutron-l3-agent.sh readOnly: true + - name: neutron-bin + mountPath: /tmp/health-probe.py + subPath: health-probe.py + readOnly: true - name: neutron-etc mountPath: /etc/neutron/neutron.conf subPath: neutron.conf diff --git a/neutron/templates/daemonset-metadata-agent.yaml b/neutron/templates/daemonset-metadata-agent.yaml index a55c469a2b..f04fbac53c 100644 --- a/neutron/templates/daemonset-metadata-agent.yaml +++ b/neutron/templates/daemonset-metadata-agent.yaml @@ -86,6 +86,31 @@ spec: {{ tuple $envAll $envAll.Values.pod.resources.agent.metadata | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }} securityContext: privileged: true + readinessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/metadata_agent.ini + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 35 + livenessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/metadata_agent.ini + - --liveness-probe + initialDelaySeconds: 90 + periodSeconds: 60 + timeoutSeconds: 45 command: - /tmp/neutron-metadata-agent.sh volumeMounts: @@ -93,6 +118,10 @@ spec: mountPath: /tmp/neutron-metadata-agent.sh subPath: neutron-metadata-agent.sh readOnly: true + - name: neutron-bin + mountPath: /tmp/health-probe.py + subPath: health-probe.py + readOnly: true - name: neutron-etc mountPath: /etc/neutron/neutron.conf subPath: neutron.conf diff --git a/neutron/templates/daemonset-ovs-agent.yaml b/neutron/templates/daemonset-ovs-agent.yaml index e58f30e96e..632947b94e 100644 --- a/neutron/templates/daemonset-ovs-agent.yaml +++ b/neutron/templates/daemonset-ovs-agent.yaml @@ -157,11 +157,30 @@ spec: - bash - -c - 'ovs-vsctl list-br | grep -q br-int' + livenessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/plugins/ml2/openvswitch_agent.ini + - --agent-queue-name + - q-agent-notifier-tunnel-update + - --liveness-probe + initialDelaySeconds: 120 + periodSeconds: 90 + timeoutSeconds: 70 volumeMounts: - name: neutron-bin mountPath: /tmp/neutron-openvswitch-agent.sh subPath: neutron-openvswitch-agent.sh readOnly: true + - name: neutron-bin + mountPath: /tmp/health-probe.py + subPath: health-probe.py + readOnly: true - name: pod-shared mountPath: /tmp/pod-shared - name: neutron-etc diff --git a/neutron/templates/daemonset-sriov-agent.yaml b/neutron/templates/daemonset-sriov-agent.yaml index 8afb7c6935..b3fe276214 100644 --- a/neutron/templates/daemonset-sriov-agent.yaml +++ b/neutron/templates/daemonset-sriov-agent.yaml @@ -128,11 +128,27 @@ spec: privileged: true command: - /tmp/neutron-sriov-agent.sh + readinessProbe: + exec: + command: + - python + - /tmp/health-probe.py + - --config-file + - /etc/neutron/neutron.conf + - --config-file + - /etc/neutron/sriov_agent.ini + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 volumeMounts: - name: neutron-bin mountPath: /tmp/neutron-sriov-agent.sh subPath: neutron-sriov-agent.sh readOnly: true + - name: neutron-bin + mountPath: /tmp/health-probe.py + subPath: health-probe.py + readOnly: true - name: pod-shared mountPath: /tmp/pod-shared - name: neutron-etc diff --git a/neutron/templates/deployment-server.yaml b/neutron/templates/deployment-server.yaml index 673b458ab2..10c1993e7d 100644 --- a/neutron/templates/deployment-server.yaml +++ b/neutron/templates/deployment-server.yaml @@ -78,6 +78,10 @@ spec: readinessProbe: tcpSocket: port: {{ tuple "network" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} + livenessProbe: + tcpSocket: + port: {{ tuple "network" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} + initialDelaySeconds: 60 volumeMounts: - name: neutron-bin mountPath: /tmp/neutron-server.sh