Fix health-probe concurrency and timings
Changed the Nova and Neutron health-probe scripts to exit if the previous probe process is still running. The health-probe has an RPC call timeout of 60 seconds and 2 retries, so in the worst-case scenario the probe process can run a little over 180 seconds. Changing periodSeconds so that a probe starts only after the previous one is complete. Also increasing the timeoutSeconds value slightly to give the probe a little extra time to finish. Increasing the liveness probe periods, as they are not so critical, which will reduce the resource usage of the probes. Co-authored-by: Randeep Jalli <rj2083@att.com> Change-Id: Ife1c381d663c1e271a5099bdc6d0dfefb00d8d73
This commit is contained in:
parent
5827236ad2
commit
414b10fab0
@ -39,8 +39,10 @@ Usage example for Neutron metadata agent:
|
||||
|
||||
import httplib2
|
||||
from six.moves import http_client as httplib
|
||||
import json
|
||||
import os
|
||||
import psutil
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
|
||||
@ -292,8 +294,36 @@ def test_rpc_liveness():
|
||||
|
||||
check_agent_status(transport)
|
||||
|
||||
def check_pid_running(pid):
    """Report whether a process with the given PID currently exists.

    `pid` may be an int or a numeric string; it is converted with `int()`
    before being handed to `psutil.pid_exists`.
    """
    return bool(psutil.pid_exists(int(pid)))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if "liveness-probe" in ','.join(sys.argv):
|
||||
pidfile = "/tmp/liveness.pid" #nosec
|
||||
else:
|
||||
pidfile = "/tmp/readiness.pid" #nosec
|
||||
data = {}
|
||||
if os.path.isfile(pidfile):
|
||||
with open(pidfile,'r') as f:
|
||||
data = json.load(f)
|
||||
if check_pid_running(data['pid']):
|
||||
if data['exit_count'] > 1:
|
||||
# Third time in, kill the previous process
|
||||
os.kill(int(data['pid']), signal.SIGTERM)
|
||||
else:
|
||||
data['exit_count'] = data['exit_count'] + 1
|
||||
with open(pidfile, 'w') as f:
|
||||
json.dump(data, f)
|
||||
sys.exit(0)
|
||||
data['pid'] = os.getpid()
|
||||
data['exit_count'] = 0
|
||||
with open(pidfile, 'w') as f:
|
||||
json.dump(data, f)
|
||||
|
||||
if "sriov_agent.ini" in ','.join(sys.argv):
|
||||
sriov_readiness_check()
|
||||
elif "metadata_agent.ini" not in ','.join(sys.argv):
|
||||
|
@ -81,8 +81,8 @@ spec:
|
||||
- --use-fqdn
|
||||
{{- end }}
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 65
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
@ -99,8 +99,8 @@ spec:
|
||||
- --use-fqdn
|
||||
{{- end }}
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
command:
|
||||
- /tmp/neutron-l2gw-agent.sh
|
||||
volumeMounts:
|
||||
|
@ -341,28 +341,28 @@ pod:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 65
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
liveness:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
l3_agent:
|
||||
l3_agent:
|
||||
readiness:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 65
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
liveness:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
lb_agent:
|
||||
lb_agent:
|
||||
readiness:
|
||||
@ -373,14 +373,14 @@ pod:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 65
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
liveness:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
ovs_agent:
|
||||
ovs_agent:
|
||||
readiness:
|
||||
@ -390,16 +390,16 @@ pod:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
sriov_agent:
|
||||
sriov_agent:
|
||||
readiness:
|
||||
enabled: true
|
||||
params:
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 65
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
server:
|
||||
server:
|
||||
readiness:
|
||||
|
@ -33,7 +33,10 @@ Usage example for Nova Compute:
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import psutil
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
|
||||
@ -218,8 +221,36 @@ def test_rpc_liveness():
|
||||
|
||||
check_service_status(transport)
|
||||
|
||||
def check_pid_running(pid):
    """Return True if a process with the given PID exists, else False.

    The PID is coerced with `int()` first, so numeric strings are accepted.
    """
    if not psutil.pid_exists(int(pid)):
        return False
    return True
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: make sure only one probe of each kind runs at a
    # time, then perform the RPC liveness test.
    #
    # State about the previous probe run is kept in a small JSON pidfile
    # holding {"pid": <int>, "exit_count": <int>}, where exit_count is the
    # number of later probes that exited early because that pid was still
    # running.
    import errno

    # Liveness and readiness probes track their previous run separately.
    if "liveness-probe" in ','.join(sys.argv):
        pidfile = "/tmp/liveness.pid"  #nosec
    else:
        pidfile = "/tmp/readiness.pid"  #nosec

    data = {}
    if os.path.isfile(pidfile):
        try:
            with open(pidfile, 'r') as f:
                data = json.load(f)
            prev_pid = int(data['pid'])
            prev_exit_count = int(data['exit_count'])
        except (IOError, OSError, KeyError, TypeError, ValueError):
            # An empty, truncated or malformed pidfile must not fail the
            # probe on its own; treat it as "no previous probe recorded".
            data = {}
            prev_pid = None
            prev_exit_count = 0

        if prev_pid is not None and check_pid_running(prev_pid):
            if prev_exit_count > 1:
                # Third time in, kill the previous process
                try:
                    os.kill(prev_pid, signal.SIGTERM)
                except OSError as err:
                    # The previous probe may have exited between the
                    # existence check and the kill; anything else is real.
                    if err.errno != errno.ESRCH:
                        raise
            else:
                # Previous probe is still working: record that this probe
                # backed off and report success without running the check.
                data['exit_count'] = prev_exit_count + 1
                with open(pidfile, 'w') as f:
                    json.dump(data, f)
                sys.exit(0)

    # Record this process as the currently running probe.
    data['pid'] = os.getpid()
    data['exit_count'] = 0
    with open(pidfile, 'w') as f:
        json.dump(data, f)

    test_rpc_liveness()

    sys.exit(0)  # return success
|
||||
|
@ -203,8 +203,8 @@ spec:
|
||||
- --use-fqdn
|
||||
{{- end }}
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
@ -219,8 +219,8 @@ spec:
|
||||
- --use-fqdn
|
||||
{{- end }}
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
command:
|
||||
- /tmp/nova-compute.sh
|
||||
volumeMounts:
|
||||
|
@ -69,8 +69,8 @@ spec:
|
||||
- --service-queue-name
|
||||
- conductor
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
@ -82,8 +82,8 @@ spec:
|
||||
- conductor
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
command:
|
||||
- /tmp/nova-conductor.sh
|
||||
volumeMounts:
|
||||
|
@ -69,8 +69,8 @@ spec:
|
||||
- --service-queue-name
|
||||
- consoleauth
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
@ -82,8 +82,8 @@ spec:
|
||||
- consoleauth
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
command:
|
||||
- /tmp/nova-consoleauth.sh
|
||||
volumeMounts:
|
||||
|
@ -69,8 +69,8 @@ spec:
|
||||
- --service-queue-name
|
||||
- scheduler
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 190
|
||||
timeoutSeconds: 185
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
@ -82,8 +82,8 @@ spec:
|
||||
- scheduler
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
periodSeconds: 600
|
||||
timeoutSeconds: 580
|
||||
command:
|
||||
- /tmp/nova-scheduler.sh
|
||||
volumeMounts:
|
||||
|
Loading…
Reference in New Issue
Block a user