From eaf07202a9ee8197a31450cbc5fa4294dd82d396 Mon Sep 17 00:00:00 2001
From: Li Zhu
Date: Tue, 25 Oct 2022 17:39:27 -0400
Subject: [PATCH] Improve remote install robustness
Adding retries to handle the following types of failure:
1. Create communication session failed - Failed to create session.
2. Unable to establish Redfish client connections to BMC at
<BMC URI> (Server not reachable, return code: 503).
3. Failure to set System Power State to On/Off.
Test Plan:
PASS: Retries work properly when session creation fails.
PASS: Retries work properly when unable to establish a Redfish client
connection to the BMC.
PASS: Retries work properly when a 500 error is returned in the "Power Off
Host" stage.
PASS: rvmc script executed successfully without above errors.
Story: 2010144
Task: 46761
Signed-off-by: Li Zhu
Change-Id: I6bb2e0822a51770b181181b49a86fb51d6dca18b
---
tools/rvmc/docker/rvmc.py | 146 ++++++++++++++++++++++++++++----------
1 file changed, 107 insertions(+), 39 deletions(-)
diff --git a/tools/rvmc/docker/rvmc.py b/tools/rvmc/docker/rvmc.py
index bcdeaa61..637d6a0f 100755
--- a/tools/rvmc/docker/rvmc.py
+++ b/tools/rvmc/docker/rvmc.py
@@ -1,7 +1,7 @@
#!/usr/bin/python3
###############################################################################
#
-# Copyright (c) 2019-2020 Wind River Systems, Inc.
+# Copyright (c) 2019-2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -126,6 +126,7 @@ import yaml
# Import Redfish Python Library
# Module: https://pypi.org/project/redfish/
import redfish
+from redfish.rest.v1 import InvalidCredentialsError
FEATURE_NAME = 'Redfish Virtual Media Controller'
@@ -177,6 +178,14 @@ def ilog(string):
sys.stdout.write("\n%s Info : %s" % (t(), string))
+def wlog(string):
+ """
+ Warning Log Utility
+ """
+
+ sys.stdout.write("\n%s Warn : %s" % (t(), string))
+
+
def elog(string):
"""
Error Log Utility
@@ -274,6 +283,21 @@ RETRY_DELAY_SECS = 10
# 2 second delay constant
DELAY_2_SECS = 2
+# max number of establishing BMC connection attempts
+MAX_CONNECTION_ATTEMPTS = 3
+# interval in seconds between BMC connection attempts
+CONNECTION_RETRY_INTERVAL = 15
+
+# max number of session creation attempts
+MAX_SESSION_CREATION_ATTEMPTS = 3
+# interval in seconds between session creation attempts
+SESSION_CREATION_RETRY_INTERVAL = 15
+
+# max number of retries for http transient error (e.g. response status: 500)
+MAX_HTTP_TRANSIENT_ERROR_RETRIES = 5
+# interval in seconds between http request retries
+HTTP_REQUEST_RETRY_INTERVAL = 10
+
def is_ipv6_address(address):
"""
@@ -454,7 +478,7 @@ class VmcObject(object):
dlog1("Password : %s" % self.pw_encoded)
dlog1("Image : %s" % self.img)
- def make_request(self, operation=None, path=None, payload=None):
+ def make_request(self, operation=None, path=None, payload=None, retry=-1):
"""
Issue a Redfish http request,
Check response,
@@ -467,6 +491,10 @@ class VmcObject(object):
:type path: str
:param payload: POST or PATCH payload data
:type payload: dictionary
+ :param retry: Count of retries already made. The default value -1
+ disables retries; pass 0 to enable retrying until the count reaches
+ MAX_HTTP_TRANSIENT_ERROR_RETRIES.
+ :type retry: int
:returns True if request succeeded (200,202(accepted),204(no content)
"""
@@ -477,28 +505,34 @@ class VmcObject(object):
url = self.url
before_request_time = datetime.datetime.now().replace(microsecond=0)
+ request_log = "Request : %s %s" % (operation, url)
try:
- dlog3("Request : %s %s" % (operation, url))
if operation == GET:
- dlog3("Headers : %s : %s" % (operation, GET_HEADERS))
+ request_log += "\nHeaders : %s : %s" % \
+ (operation, GET_HEADERS)
self.response = self.redfish_obj.get(url, headers=GET_HEADERS)
elif operation == POST:
- dlog3("Headers : %s : %s" % (operation, POST_HEADERS))
- dlog3("Payload : %s" % payload)
+ request_log += "\nHeaders : %s : %s" % \
+ (operation, POST_HEADERS)
+ request_log += "\nPayload : %s" % payload
self.response = self.redfish_obj.post(url,
body=payload,
headers=POST_HEADERS)
elif operation == PATCH:
- dlog3("Headers : %s : %s" % (operation, PATCH_HEADERS))
- dlog3("Payload : %s" % payload)
+ request_log += "\nHeaders : %s : %s" % \
+ (operation, PATCH_HEADERS)
+ request_log += "\nPayload : %s" % payload
self.response = self.redfish_obj.patch(url,
body=payload,
headers=PATCH_HEADERS)
else:
+ dlog3(request_log)
elog("Unsupported operation: %s" % operation)
return False
+ dlog3(request_log)
+
except Exception as ex:
elog("Failed operation on '%s' (%s)" % (url, ex))
@@ -507,7 +541,20 @@ class VmcObject(object):
delta = after_request_time - before_request_time
# if we got a response, check its status
if self.check_ok_status(url, operation, delta.seconds) is False:
- self._exit(1)
+ if retry < 0 or retry >= MAX_HTTP_TRANSIENT_ERROR_RETRIES:
+ elog("Failed in an error response:\n%s" % self.response)
+ self._exit(1)
+ else:
+ retry += 1
+ wlog("Got an error response for: \n%s" % request_log)
+ ilog("Make request: retry (%i of %i) in %i secs." %
+ (retry, MAX_HTTP_TRANSIENT_ERROR_RETRIES,
+ HTTP_REQUEST_RETRY_INTERVAL))
+ time.sleep(HTTP_REQUEST_RETRY_INTERVAL)
+ self.make_request(operation=operation,
+ path=path,
+ payload=payload,
+ retry=retry)
# handle 204 success with no content ; clear last response
if self.response.status == 204:
@@ -725,26 +772,34 @@ class VmcObject(object):
ilog("BMC Ping Ok : %s (%i)" % (self.ip, ping_count))
# try to connect
- connect_error = False
- try:
- # One time Redfish Client Object Create
- self.redfish_obj = \
- redfish.redfish_client(base_url=self.uri,
- username=self.un,
- password=self.pw,
- default_prefix=REDFISH_ROOT_PATH)
- if self.redfish_obj is None:
- connect_error = True
- elog("Unable to establish %s to BMC at %s" %
- (stage, self.uri))
- except Exception as ex:
- connect_error = True
- elog("Unable to establish %s to BMC at %s (%s)" %
- (stage, self.uri, ex))
+        fail_counter = 0
+        err_msg = "Unable to establish %s to BMC at %s." % (stage, self.uri)
+        while fail_counter < MAX_CONNECTION_ATTEMPTS:
+            ex_log = ""  # exception text of the current attempt, if any
+            try:
+                # One time Redfish Client Object Create
+                self.redfish_obj = \
+                    redfish.redfish_client(base_url=self.uri,
+                                           username=self.un,
+                                           password=self.pw,
+                                           default_prefix=REDFISH_ROOT_PATH)
+                if self.redfish_obj is None:
+                    fail_counter += 1
+                else:
+                    return  # connected; skip the failure path below
+            except Exception as ex:
+                fail_counter += 1
+                ex_log = " (%s)" % str(ex)
-        if connect_error is True:
-            alog("Check BMC ip address is pingable and supports Redfish")
-            self._exit(1)
+            if fail_counter < MAX_CONNECTION_ATTEMPTS:
+                wlog(err_msg + " Retry (%i/%i) in %i secs." %
+                     (fail_counter, MAX_CONNECTION_ATTEMPTS - 1,
+                      CONNECTION_RETRY_INTERVAL) + ex_log)
+                time.sleep(CONNECTION_RETRY_INTERVAL)
+
+        elog(err_msg + ex_log)  # keep the last attempt's exception text
+        alog("Check BMC ip address is pingable and supports Redfish")
+        self._exit(1)
###########################################################################
# Redfish Root Query
@@ -784,14 +839,27 @@ class VmcObject(object):
stage = 'Create Communication Session'
slog(stage)
- try:
- self.redfish_obj.login(auth="session")
- dlog1("Session : Open")
- self.session = True
-
- except Exception as ex:
- elog("Failed to Create session ; %s" % ex)
- self._exit(1)
+        fail_counter = 0
+        while fail_counter < MAX_SESSION_CREATION_ATTEMPTS:
+            try:
+                self.redfish_obj.login(auth="session")
+                dlog1("Session : Open")
+                self.session = True
+                return  # session open; no retry needed
+            except InvalidCredentialsError:  # retrying cannot fix this
+                elog("Failed to Create session due to invalid credentials.")
+                alog("Check BMC username and password in config file")
+                self._exit(1)
+            except Exception as ex:
+                err_msg = "Failed to Create session ; %s." % str(ex)
+                fail_counter += 1
+                if fail_counter >= MAX_SESSION_CREATION_ATTEMPTS:
+                    elog(err_msg)
+                    self._exit(1)  # all attempts used
+                wlog(err_msg + " Retry (%i/%i) in %i secs."
+                     % (fail_counter, MAX_SESSION_CREATION_ATTEMPTS - 1,
+                        SESSION_CREATION_RETRY_INTERVAL))
+                time.sleep(SESSION_CREATION_RETRY_INTERVAL)
###########################################################################
# Query Redfish Managers
@@ -911,7 +979,8 @@ class VmcObject(object):
self._exit(1)
if self.make_request(operation=GET,
- path=self.systems_member_url) is False:
+ path=self.systems_member_url,
+ retry=0) is False:
elog("Unable to get %s from %s" %
(info, self.systems_member_url))
self._exit(1)
@@ -1048,7 +1117,7 @@ class VmcObject(object):
poll_count = 0
MAX_STATE_POLL_COUNT = 60 # some servers take longer than 10 seconds
while poll_count < MAX_STATE_POLL_COUNT and self.power_state != state:
- time.sleep(1)
+ time.sleep(3)
poll_count = poll_count + 1
# get systems info
@@ -1307,7 +1376,6 @@ class VmcObject(object):
while poll_count < MAX_POLL_COUNT and ejecting:
# verify the image is not in inserted
poll_count = poll_count + 1
- vm_eject = self.vm_actions.get(eject_media_label)
if self.make_request(operation=GET,
path=self.vm_url) is True:
if self.get_key_value('Inserted') is False: