From d467d685a3863edc5fed7c352961b3235a5e3a99 Mon Sep 17 00:00:00 2001 From: Oleksii Grudev Date: Wed, 13 Nov 2019 15:28:14 +0200 Subject: [PATCH] Fix health probe for several conductor workers It was observed that when increasing the number of conductor workers from the default "1" to a higher value, the readiness probe fails to check rabbitmq connections for conductor processes. This happens because the script tries to obtain rabbitmq connections for the parent conductor process, which in the case of workers>1 doesn't open rabbitmq connections but spawns child processes that handle rabbitmq connections instead. This patch removes the "check-all-pids" option and keeps the logic, but simplifies and speeds up the code: instead of checking all processes when the "check-all-pids" option was set (even though, regardless of the "sock_count" value, a single process with an open connection was enough for the check to return a positive result), processes are now checked one by one until the first one with open rabbitmq connection(s) is found. Change-Id: I72be0bbdefcba77a55b6ceed6e192c9621c069eb --- nova/templates/bin/_health-probe.py.tpl | 18 ++---------------- nova/templates/deployment-scheduler.yaml | 2 -- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl index 2cde2d42c9..d78e70139d 100644 --- a/nova/templates/bin/_health-probe.py.tpl +++ b/nova/templates/bin/_health-probe.py.tpl @@ -97,18 +97,10 @@ def check_service_status(transport): def tcp_socket_status(process, ports): """Check the tcp socket status on a process""" - sock_count = 0 - parentId = 0 for p in psutil.process_iter(): try: with p.oneshot(): if process in " ".join(p.cmdline()): - if parentId == 0: - parentId = p.pid - else: - if p.ppid() == parentId and \ - not cfg.CONF.check_all_pids: - continue pcon = p.connections() for con in pcon: try: @@ -117,14 +109,10 @@ def tcp_socket_status(process, ports): except IndexError: continue if rport in ports and status == tcp_established: 
- sock_count = sock_count + 1 + return 1 except psutil.Error: continue - - if sock_count == 0: - return 0 - else: - return 1 + return 0 def configured_port_in_conf(): @@ -198,8 +186,6 @@ def test_rpc_liveness(): cfg.CONF.register_cli_opt(cfg.StrOpt('service-queue-name')) cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False, required=False)) - cfg.CONF.register_cli_opt(cfg.BoolOpt('check-all-pids', default=False, - required=False)) cfg.CONF.register_cli_opt(cfg.BoolOpt('use-fqdn', default=False, required=False)) diff --git a/nova/templates/deployment-scheduler.yaml b/nova/templates/deployment-scheduler.yaml index 05ee94923d..cb9e9df35c 100644 --- a/nova/templates/deployment-scheduler.yaml +++ b/nova/templates/deployment-scheduler.yaml @@ -68,7 +68,6 @@ spec: - /etc/nova/nova.conf - --service-queue-name - scheduler - - --check-all-pids initialDelaySeconds: 80 periodSeconds: 90 timeoutSeconds: 70 @@ -82,7 +81,6 @@ spec: - --service-queue-name - scheduler - --liveness-probe - - --check-all-pids initialDelaySeconds: 120 periodSeconds: 90 timeoutSeconds: 70