maas-rack image: improve IPMI error handling
MAAS uses ipmipower to power nodes on and off. The ipmipower command sometimes fails with the message "x.x.x.x: BMC error" but still exits with a return code of 0. Because MAAS is not aware of this specific error, and is also unable to detect the power state from the output, the built-in retry logic is not used, resulting in failed deployments or nodes in Power Error. This patch adds "BMC error" to the list of known retriable errors, and also adds additional retries. Error strings from ipmipower 1.4: http://git.savannah.gnu.org/cgit/freeipmi.git/tree/ipmipower/ipmipower_output.c?h=Release-1_4_0_branch#n52 IPMI_ERRORS known to MAAS 2.3: https://git.launchpad.net/maas/tree/src/provisioningserver/drivers/power/ipmi.py?h=2.3#n50 Change-Id: Ia4b10e47855d67ba81e9ab800be3d780f8b38cac
This commit is contained in:
parent
d79b355188
commit
03ebbaaca3
26
images/maas-rack-controller/2.3_ipmi_error.patch
Normal file
26
images/maas-rack-controller/2.3_ipmi_error.patch
Normal file
@@ -0,0 +1,26 @@
|
||||
diff --git a/src/provisioningserver/drivers/power/ipmi.py b/src/provisioningserver/drivers/power/ipmi.py
|
||||
index 219ee268b..acdf2065d 100644
|
||||
--- a/src/provisioningserver/drivers/power/ipmi.py
|
||||
+++ b/src/provisioningserver/drivers/power/ipmi.py
|
||||
@@ -143,6 +143,12 @@ IPMI_ERRORS = {
|
||||
" MAAS performed several retries. Please wait and try again."),
|
||||
'exception': PowerConnError
|
||||
},
|
||||
+ 'BMC error': {
|
||||
+ 'message': (
|
||||
+ "Device not responding correctly while performing power action."
|
||||
+ " MAAS performed several retries. Please wait and try again."),
|
||||
+ 'exception': PowerConnError
|
||||
+ },
|
||||
'could not find inband device': {
|
||||
'message': (
|
||||
"An inband device could not be found."
|
||||
@@ -209,7 +215,7 @@ class IPMIPowerDriver(PowerDriver):
|
||||
'mac_address', "Power MAC", scope=SETTING_SCOPE.NODE)
|
||||
]
|
||||
ip_extractor = make_ip_extractor('power_address')
|
||||
- wait_time = (4, 8, 16, 32)
|
||||
+ wait_time = (4, 4, 8, 8, 16, 16, 32, 32)
|
||||
|
||||
def detect_missing_packages(self):
|
||||
if not shell.has_command_available('ipmipower'):
|
@@ -56,11 +56,14 @@ COPY 2.3_mac_address.patch /tmp/2.3_mac_address.patch
|
||||
# sh8121att: patch so query for RPC info contains proper Host header
|
||||
copy 2.3_hostheader.patch /tmp/2.3_hostheader.patch
|
||||
COPY 2.3_secure_headers.patch /tmp/2.3_secure_headers.patch
|
||||
# Patch so maas knows that "BMC error" is retriable
|
||||
COPY 2.3_ipmi_error.patch /tmp/2.3_ipmi_error.patch
|
||||
|
||||
RUN cd /usr/lib/python3/dist-packages/provisioningserver/utils && patch network.py < /tmp/2.3_nic_filter.patch
|
||||
RUN cd /usr/lib/python3/dist-packages/provisioningserver/utils && patch ipaddr.py < /tmp/2.3_mac_address.patch
|
||||
RUN cd /usr/lib/python3/dist-packages/provisioningserver/rpc && patch clusterservice.py < /tmp/2.3_hostheader.patch
|
||||
RUN cd /usr/lib/python3/dist-packages/twisted/web && patch server.py < /tmp/2.3_secure_headers.patch
|
||||
RUN cd /usr/lib/python3/dist-packages/provisioningserver/drivers/power && patch ipmi.py < /tmp/2.3_ipmi_error.patch
|
||||
|
||||
# echo journalctl logs to the container's stdout
|
||||
COPY scripts/journalctl-to-tty.service /etc/systemd/system/journalctl-to-tty.service
|
||||
|
Loading…
Reference in New Issue
Block a user