Add a wait time between http request retries
Maintenance interfaces with sysinv, sm and the vim using http requests. Request timeouts have an implicit delay between retries. However, command failures or outright connection failures don't. This has only become obvious in mtce's communication with the vim where there appears to be a process startup timing change that leads to the 'vim' not being ready to handle commands before mtcAgent startup starts sending them after a platform services group startup by sm. This update adds a 10 second http retry wait as a configuration option to mtc.conf. The mtcAgent loads this value at startup and uses it in a new HTTP__RETRY_WAIT state of the http request work FSM. The number of retries remains unchanged. This update is only forcing a minimum wait time between retries, regardless of cause. Failure path testing was done using Fault Insertion Testing (FIT). Test Plan: PASS: Verify the reported issue is resolved by this update. PASS: Verify http retry config value load on process startup. PASS: Verify updated value is used over a process -sighup. PASS: Verify default value if new mtc.conf config value is not found. PASS: Verify http connection failure http retry handling. PASS: Verify http request timeout failure retry handling. PASS: Verify http request operation failure retry handling. Regression: PASS: Build and install ISO - Standard and AIO DX. PASS: Verify http failures do not fail a lock operation. PASS: Verify host unlock fails if its http done queue shows failures. PASS: Verify host swact. PASS: Verify handling of random and persistent http errors involving the need for retries. Closes-Bug: 2047958 Change-Id: Icc758b0782be2a4f2882efd56f5de1a8dddea490 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
5a3a5ce8ea
commit
191c0aa6a8
@ -131,6 +131,11 @@
|
||||
#define FIT_CODE__STOP_HOST_SERVICES (71)
|
||||
|
||||
#define FIT_CODE__SOCKET_SETUP (72)
|
||||
#define FIT_CODE__READ_JSON_FROM_FILE (73)
|
||||
|
||||
#define FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED (75)
|
||||
#define FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT (76)
|
||||
#define FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS (77)
|
||||
|
||||
/***************** Process Fit Codes ********************************/
|
||||
|
||||
|
@ -2,10 +2,10 @@
|
||||
#define __INCLUDE_HTTPUTIL_H__
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream> /* for ... string */
|
||||
@ -93,12 +93,14 @@ typedef enum {
|
||||
HTTP__RECEIVE_WAIT = 1,
|
||||
HTTP__RECEIVE = 2,
|
||||
HTTP__FAILURE = 3,
|
||||
HTTP__DONE_FAIL = 4,
|
||||
HTTP__DONE_PASS = 5,
|
||||
HTTP__STAGES = 6
|
||||
HTTP__RETRY_WAIT = 4,
|
||||
HTTP__DONE_FAIL = 5,
|
||||
HTTP__DONE_PASS = 6,
|
||||
HTTP__STAGES = 7
|
||||
} httpStages_enum ;
|
||||
|
||||
#define HTTP_RECEIVE_WAIT_MSEC (10)
|
||||
#define HTTP_RETRY_WAIT_SECS (10)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -1,10 +1,10 @@
|
||||
#ifndef __INCLUDE_NODELOG_HH__
|
||||
#define __INCLUDE_NODELOG_HH__
|
||||
/*
|
||||
* Copyright (c) 2013-2017,2023 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Copyright (c) 2013-2017, 2023-2024 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -116,6 +116,7 @@ typedef struct
|
||||
int start_delay ; /**< startup delay, added for pmon */
|
||||
int api_retries ; /**< api retries before failure */
|
||||
int bmc_reset_delay ; /**< secs delay before bmc reset */
|
||||
int http_retry_wait ; /**< secs to wait between http req retries */
|
||||
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
|
||||
bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */
|
||||
bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2020, 2023 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -314,6 +314,7 @@ nodeLinkClass::nodeLinkClass()
|
||||
sysinv_timeout = HTTP_SYSINV_CRIT_TIMEOUT ;
|
||||
sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ;
|
||||
work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ;
|
||||
http_retry_wait = HTTP_RETRY_WAIT_SECS ;
|
||||
|
||||
/* Init the auto recovery threshold and intervals to zero until
|
||||
* modified by daemon config */
|
||||
|
@ -1866,6 +1866,9 @@ public:
|
||||
* time for crashdumps to complete. */
|
||||
int bmc_reset_delay ;
|
||||
|
||||
/** seconds to wait between http request retries */
|
||||
int http_retry_wait ;
|
||||
|
||||
/* collectd event handler */
|
||||
int collectd_notify_handler ( string & hostname,
|
||||
string & resource,
|
||||
|
@ -376,6 +376,11 @@ static int mtc_config_handler ( void * user,
|
||||
config_ptr->bmc_reset_delay = atoi(value);
|
||||
mtcInv.bmc_reset_delay = config_ptr->bmc_reset_delay ;
|
||||
}
|
||||
else if (MATCH("agent", "http_retry_wait"))
|
||||
{
|
||||
config_ptr->http_retry_wait = atoi(value);
|
||||
mtcInv.http_retry_wait = config_ptr->http_retry_wait ;
|
||||
}
|
||||
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
|
||||
{
|
||||
config_ptr->failsafe_shutdown_delay = atoi(value);
|
||||
@ -692,6 +697,7 @@ int daemon_configure ( void )
|
||||
ilog ("TokenRefresh: %3d secs\n" , mtcInv.token_refresh_rate);
|
||||
ilog ("API Retries : %3d secs\n" , mtcInv.api_retries);
|
||||
ilog ("Reset Delay : %3d secs\n" , mtcInv.bmc_reset_delay);
|
||||
ilog ("HTTP Retry : %3d secs\n" , mtcInv.http_retry_wait);
|
||||
|
||||
/* Verify loaded config against an expected mask
|
||||
* as an ini file fault detection method */
|
||||
|
@ -69,6 +69,23 @@ string nodeLinkClass::mtcVimApi_state_get ( string hostname, int & http_status_c
|
||||
http_status_code = HTTP_NOTFOUND ;
|
||||
return ( payload );
|
||||
}
|
||||
#ifdef WANT_FIT_TESTING
|
||||
static const char * fit_file = "/var/run/fit/mtcVimApi_state_get";
|
||||
if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, hostname, "mtcVimApi_state_get"))
|
||||
{
|
||||
if ( daemon_is_file_present (fit_file) )
|
||||
{
|
||||
payload = daemon_read_file(fit_file);
|
||||
ilog("%s FIT Json: %s", hostname.c_str(), payload.c_str());
|
||||
return (payload);
|
||||
}
|
||||
else
|
||||
{
|
||||
slog("%s FIT file %s not found ; aborting fit", hostname.c_str(), fit_file);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
payload = ("{\"") ;
|
||||
payload.append (MTC_JSON_INV_ADMIN);
|
||||
payload.append ("\":\"");
|
||||
@ -246,6 +263,22 @@ int nodeLinkClass::mtcVimApi_state_change ( struct nodeLinkClass::node * node_pt
|
||||
node_ptr->httpReq.payload = "{\"state-change\": " ;
|
||||
node_ptr->httpReq.payload.append (mtcVimApi_state_get ( node_ptr->hostname , http_status_code ));
|
||||
|
||||
#ifdef WANT_FIT_TESTING
|
||||
static const char * fit_file = "/var/run/fit/mtcVimApi_state_change";
|
||||
if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, node_ptr->hostname, "mtcVimApi_state_change" ))
|
||||
{
|
||||
if ( daemon_is_file_present (fit_file) )
|
||||
{
|
||||
node_ptr->httpReq.payload = daemon_read_file(fit_file);
|
||||
ilog("%s FIT Json: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
slog("%s FIT file %s not found ; aborting fit", node_ptr->hostname.c_str(), fit_file);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (( request == VIM_HOST_FAILED ) || ( request == VIM_DPORT_FAILED ))
|
||||
{
|
||||
wlog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -331,7 +331,6 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
|
||||
syslog ( LOG_INFO, "+------+-------+--------------+---------+--------------+-----+----------------------+\n");
|
||||
}
|
||||
|
||||
|
||||
int size = node_ptr->libEvent_work_fifo.size() ;
|
||||
if ( size > QUEUE_OVERLOAD )
|
||||
{
|
||||
@ -456,6 +455,24 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
|
||||
break ;
|
||||
}
|
||||
}
|
||||
#ifdef WANT_FIT_TESTING
|
||||
if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED, node_ptr->hostname, "" ))
|
||||
{
|
||||
ilog("%s FIT Operation Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
|
||||
node_ptr->thisReq.status = FAIL_AUTHENTICATION ;
|
||||
rc = FAIL_OPERATION ;
|
||||
}
|
||||
else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT, node_ptr->hostname, "" ))
|
||||
{
|
||||
ilog("%s FIT Request Timeout Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
|
||||
rc = FAIL_TIMEOUT ;
|
||||
}
|
||||
else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS, node_ptr->hostname, "" ))
|
||||
{
|
||||
ilog("%s FIT Connection Loss: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
|
||||
node_ptr->thisReq.status = rc = FAIL_HTTP_ZERO_STATUS ;
|
||||
}
|
||||
#endif
|
||||
if ( rc != PASS )
|
||||
{
|
||||
node_ptr->libEvent_work_fifo_ptr->state =
|
||||
@ -598,6 +615,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->libEvent_work_fifo_ptr->cur_retries =
|
||||
node_ptr->thisReq.cur_retries ;
|
||||
|
||||
mtcTimer_start ( node_ptr->http_timer, mtcTimer_handler, HTTP_RETRY_WAIT_SECS );
|
||||
node_ptr->libEvent_work_fifo_ptr->state =
|
||||
node_ptr->thisReq.state = HTTP__RETRY_WAIT ;
|
||||
dlog ("%s %d sec retry wait started", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case HTTP__RETRY_WAIT:
|
||||
{
|
||||
if ( node_ptr->http_timer.ring == true )
|
||||
{
|
||||
dlog ("%s %d sec retry wait expired", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
|
||||
node_ptr->libEvent_work_fifo_ptr->state =
|
||||
node_ptr->thisReq.state = HTTP__TRANSMIT ;
|
||||
}
|
||||
@ -862,7 +891,6 @@ bool nodeLinkClass::workQueue_present ( libEvent & event )
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
wlog ("%s ... not found in work queue\n", event.log_prefix.c_str());
|
||||
return (false);
|
||||
}
|
||||
|
@ -78,6 +78,8 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc
|
||||
; ACK reboot requests. The delay gives
|
||||
; time for crashdumps to complete.
|
||||
|
||||
http_retry_wait = 10 ; secs to wait between http request retries
|
||||
|
||||
[client] ; Client Configuration
|
||||
|
||||
scheduling_priority = 45 ; realtime scheduling; range of 1 .. 99
|
||||
|
Loading…
Reference in New Issue
Block a user