Merge "Add a wait time between http request retries"

2024-02-13 16:45:26 +00:00 · 2024-02-13 16:45:26 +00:00 · 739c508e92
commit 739c508e92
parent 4af50a10bd 191c0aa6a8
9 changed files with 162 additions and 81 deletions
--- a/mtce-common/src/common/fitCodes.h
+++ b/mtce-common/src/common/fitCodes.h
@ -131,6 +131,11 @@
 #define FIT_CODE__STOP_HOST_SERVICES                 (71)

 #define FIT_CODE__SOCKET_SETUP                       (72)
+#define FIT_CODE__READ_JSON_FROM_FILE                (73)
+
+#define FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED    (75)
+#define FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT     (76)
+#define FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS     (77)

 /*****************      Process Fit Codes     ********************************/

--- a/mtce-common/src/common/httpUtil.h
+++ b/mtce-common/src/common/httpUtil.h
@ -2,10 +2,10 @@
 #define __INCLUDE_HTTPUTIL_H__

 /*
- * Copyright (c) 2013, 2016 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
 */

 #include <iostream>         /* for ... string               */
@ -93,12 +93,14 @@ typedef enum {
   HTTP__RECEIVE_WAIT  = 1,
   HTTP__RECEIVE       = 2,
   HTTP__FAILURE       = 3,
-   HTTP__DONE_FAIL     = 4,
-   HTTP__DONE_PASS     = 5,
-   HTTP__STAGES        = 6
+   HTTP__RETRY_WAIT    = 4,
+   HTTP__DONE_FAIL     = 5,
+   HTTP__DONE_PASS     = 6,
+   HTTP__STAGES        = 7
 }  httpStages_enum ;

 #define HTTP_RECEIVE_WAIT_MSEC (10)
+#define HTTP_RETRY_WAIT_SECS   (10)

 typedef struct
 {
@ -142,7 +144,7 @@ typedef enum {

    SYSINV_CONFIG_SHOW,
    SYSINV_CONFIG_MODIFY,
-    
+
    SYSINV_SENSOR_LOAD,
    SYSINV_SENSOR_LOAD_GROUPS,
    SYSINV_SENSOR_LOAD_GROUP,
--- a/mtce-common/src/common/logMacros.h
+++ b/mtce-common/src/common/logMacros.h
@ -1,10 +1,10 @@
 #ifndef __INCLUDE_NODELOG_HH__
 #define __INCLUDE_NODELOG_HH__
 /*
- * Copyright (c) 2013-2017,2023 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013-2017, 2023-2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
 */

 /**
@ -116,6 +116,7 @@ typedef struct
    int   start_delay           ; /**< startup delay, added for pmon          */
    int   api_retries           ; /**< api retries before failure             */
    int   bmc_reset_delay       ; /**< secs delay before bmc reset            */
+    int   http_retry_wait       ; /**< secs to wait between http req retries  */
    int   hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
    bool  hostwd_reboot_on_err  ; /**< should hostwd reboot on fault detected */
    bool  hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@ -1,8 +1,8 @@
 /*
 * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
 */

 /**
@ -314,6 +314,7 @@ nodeLinkClass::nodeLinkClass()
    sysinv_timeout               = HTTP_SYSINV_CRIT_TIMEOUT ;
    sysinv_noncrit_timeout       = HTTP_SYSINV_NONC_TIMEOUT ;
    work_queue_timeout           = MTC_WORKQUEUE_TIMEOUT    ;
+    http_retry_wait              = HTTP_RETRY_WAIT_SECS     ;

    /* Init the auto recovery threshold and intervals to zero until
     * modified by daemon config */
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@ -1866,6 +1866,9 @@ public:
     * time for crashdumps to complete. */
    int bmc_reset_delay ;

+    /** seconds to wait between http request retries */
+    int http_retry_wait ;
+
    /* collectd event handler */
    int collectd_notify_handler ( string & hostname,
                                  string & resource,
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@ -376,6 +376,11 @@ static int mtc_config_handler ( void * user,
        config_ptr->bmc_reset_delay = atoi(value);
        mtcInv.bmc_reset_delay = config_ptr->bmc_reset_delay ;
    }
+    else if (MATCH("agent", "http_retry_wait"))
+    {
+        config_ptr->http_retry_wait = atoi(value);
+        mtcInv.http_retry_wait = config_ptr->http_retry_wait ;
+    }
    else if (MATCH("timeouts", "failsafe_shutdown_delay"))
    {
        config_ptr->failsafe_shutdown_delay = atoi(value);
@ -692,6 +697,7 @@ int daemon_configure ( void )
    ilog ("TokenRefresh: %3d secs\n" , mtcInv.token_refresh_rate);
    ilog ("API Retries : %3d secs\n" , mtcInv.api_retries);
    ilog ("Reset Delay : %3d secs\n" , mtcInv.bmc_reset_delay);
+    ilog ("HTTP Retry  : %3d secs\n" , mtcInv.http_retry_wait);

    /* Verify loaded config against an expected mask
     * as an ini file fault detection method */
--- a/mtce/src/maintenance/mtcVimApi.cpp
+++ b/mtce/src/maintenance/mtcVimApi.cpp
@ -69,6 +69,23 @@ string nodeLinkClass::mtcVimApi_state_get ( string hostname, int & http_status_c
        http_status_code = HTTP_NOTFOUND ;
        return ( payload );
    }
+    #ifdef WANT_FIT_TESTING
+    static const char * fit_file = "/var/run/fit/mtcVimApi_state_get";
+    if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, hostname, "mtcVimApi_state_get"))
+    {
+        if ( daemon_is_file_present (fit_file) )
+        {
+            payload = daemon_read_file(fit_file);
+            ilog("%s FIT Json: %s", hostname.c_str(), payload.c_str());
+            return (payload);
+        }
+        else
+        {
+            slog("%s FIT file %s not found ; aborting fit", hostname.c_str(), fit_file);
+        }
+    }
+    #endif
+
    payload = ("{\"") ;
    payload.append (MTC_JSON_INV_ADMIN);
    payload.append ("\":\"");
@ -246,6 +263,22 @@ int nodeLinkClass::mtcVimApi_state_change ( struct nodeLinkClass::node * node_pt
    node_ptr->httpReq.payload = "{\"state-change\": " ;
    node_ptr->httpReq.payload.append (mtcVimApi_state_get ( node_ptr->hostname , http_status_code ));

+    #ifdef WANT_FIT_TESTING
+    static const char * fit_file = "/var/run/fit/mtcVimApi_state_change";
+    if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, node_ptr->hostname, "mtcVimApi_state_change" ))
+    {
+        if ( daemon_is_file_present (fit_file) )
+        {
+            node_ptr->httpReq.payload = daemon_read_file(fit_file);
+            ilog("%s FIT Json: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+        }
+        else
+        {
+            slog("%s FIT file %s not found ; aborting fit", node_ptr->hostname.c_str(), fit_file);
+        }
+    }
+    #endif
+
    if (( request == VIM_HOST_FAILED ) || ( request == VIM_DPORT_FAILED ))
    {
        wlog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
--- a/mtce/src/maintenance/mtcWorkQueue.cpp
+++ b/mtce/src/maintenance/mtcWorkQueue.cpp
@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2013, 2016 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
 */

 /**
@ -38,7 +38,7 @@ string _get_work_state_str ( httpStages_enum state )
    else if ( state == HTTP__RECEIVE  ) return ("  Rx");
    else if ( state == HTTP__FAILURE  ) return (" Er ");
    else if ( state == HTTP__RECEIVE_WAIT  ) return ("Wait");
-    else 
+    else
    {
        elog ("Invalid Http Work Queue State: %d\n", state );
        return ("----");
@ -58,7 +58,7 @@ void nodeLinkClass::workQueue_dump ( struct nodeLinkClass::node * node_ptr )
              node_ptr->libEvent_work_fifo_ptr != node_ptr->libEvent_work_fifo.end();
              node_ptr->libEvent_work_fifo_ptr ++ )
        {
-            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %15s:%d | %s\n", 
+            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %15s:%d | %s\n",
                _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str(),
                node_ptr->libEvent_work_fifo_ptr->sequence,
                node_ptr->libEvent_work_fifo_ptr->hostname.c_str(),
@ -101,11 +101,11 @@ void nodeLinkClass::doneQueue_dump ( struct nodeLinkClass::node * node_ptr )
              node_ptr->libEvent_done_fifo_ptr != node_ptr->libEvent_done_fifo.end();
              node_ptr->libEvent_done_fifo_ptr ++ )
        {
-            syslog ( LOG_INFO, "%15s httpReq doneQueue:%5d - %s '%s' -> Status:%d\n", 
-                         node_ptr->libEvent_done_fifo_ptr->hostname.c_str(), 
-                         node_ptr->libEvent_done_fifo_ptr->sequence, 
-                         node_ptr->libEvent_done_fifo_ptr->service.c_str(), 
-                         node_ptr->libEvent_done_fifo_ptr->operation.c_str(), 
+            syslog ( LOG_INFO, "%15s httpReq doneQueue:%5d - %s '%s' -> Status:%d\n",
+                         node_ptr->libEvent_done_fifo_ptr->hostname.c_str(),
+                         node_ptr->libEvent_done_fifo_ptr->sequence,
+                         node_ptr->libEvent_done_fifo_ptr->service.c_str(),
+                         node_ptr->libEvent_done_fifo_ptr->operation.c_str(),
                         node_ptr->libEvent_done_fifo_ptr->status );
        }
    }
@ -240,9 +240,9 @@ int nodeLinkClass::doneQueue_dequeue ( libEvent & event )
 * Description: This is a Per Host Finite State Machine (FSM) that
 *              processes the work queue for the supplied host's
 *              node pointer.
- *              
+ *
 * Constructs:
- * 
+ *
 * node_ptr->libEvent_work_fifo - the current work queue/fifo
 * node_ptr->libEvent_done_fifo - queue/fifo of completed requests
 *
@ -255,17 +255,17 @@ int nodeLinkClass::doneQueue_dequeue ( libEvent & event )
 *
 * In process libEvents are copied from the callers work queue to
 * its thisReq.
- * 
+ *
 * Completed events including execution status are copied to the host's
 * done fifo.
- * 
- * Failed events may be retried up to max_retries as specified by 
+ *
+ * Failed events may be retried up to max_retries as specified by
 * the callers libEvent.
 *
 * @param event is a reference to the callers libEvent.
 *
 * @return an integer with values of PASS, FAIL, RETRY
- *  
+ *
 * ************************************************************************/

 int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
@ -280,18 +280,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
         * responses */
         if ( node_ptr->libEvent_done_fifo.size() > 10 )
         {
-             qlog ("%s Done Queue has %ld elements\n", 
+             qlog ("%s Done Queue has %ld elements\n",
                       node_ptr->hostname.c_str(),
                       node_ptr->libEvent_done_fifo.size());

             /* TODO: look at the status of the commands and print a log of those that failed */
-             
+
             /* Remove the first 8 - its a fifo the first ones at the front are the oldest */
             for ( int i=0 ; i < 8 ; i++ )
             {
                 node_ptr->libEvent_done_fifo.pop_front();
             }
-             qlog ("%s Done Queue has %ld elements remaining\n", 
+             qlog ("%s Done Queue has %ld elements remaining\n",
                       node_ptr->hostname.c_str(),
                       node_ptr->libEvent_done_fifo.size());
         }
@ -299,8 +299,8 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )

    if ( node_ptr->libEvent_work_fifo.empty() )
    {
-        // qlog_throttled ( node_ptr->no_work_log_throttle, 300, 
-        //                  "%s Idle ... \n", 
+        // qlog_throttled ( node_ptr->no_work_log_throttle, 300,
+        //                  "%s Idle ... \n",
        //                  node_ptr->hostname.c_str());
        node_ptr->no_work_log_throttle = 0 ;
        return (PASS);
@ -317,7 +317,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
              node_ptr->libEvent_work_fifo_ptr != node_ptr->libEvent_work_fifo.end();
              node_ptr->libEvent_work_fifo_ptr ++ )
        {
-            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %3d | %15s:%d | %s\n", 
+            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %3d | %15s:%d | %s\n",
                _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str(),
                node_ptr->libEvent_work_fifo_ptr->sequence,
                node_ptr->libEvent_work_fifo_ptr->hostname.c_str(),
@ -331,7 +331,6 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
        syslog ( LOG_INFO, "+------+-------+--------------+---------+--------------+-----+----------------------+\n");
    }

-   
    int size = node_ptr->libEvent_work_fifo.size() ;
    if ( size > QUEUE_OVERLOAD )
    {
@ -354,18 +353,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
        case HTTP__TRANSMIT:
        {
            node_ptr->thisReq = node_ptr->libEvent_work_fifo.front();
-            
+
            qlog ("%s Transmitted\n", node_ptr->thisReq.log_prefix.c_str() );

            rc = mtcHttpUtil_api_request ( node_ptr->thisReq ) ;
            if ( rc )
            {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__FAILURE ;
            }
            else
            {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__RECEIVE_WAIT ;

                if ( node_ptr->http_timer.tid )
@ -374,7 +373,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                if ( rc != PASS )
                {
                    elog ("%s failed to start http command timer ; failing command\n", node_ptr->thisReq.log_prefix.c_str());
-                    node_ptr->libEvent_work_fifo_ptr->state = 
+                    node_ptr->libEvent_work_fifo_ptr->state =
                    node_ptr->thisReq.state = HTTP__FAILURE ;
                }
            }
@ -408,20 +407,20 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                slog ("%s has unexpected null HTTP request base pointer\n",
                          node_ptr->thisReq.log_prefix.c_str());

-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__FAILURE ;
                break ;
            }
-            
+
            int msec_timeout = (node_ptr->thisReq.timeout*1000);
            int wait_time = (++node_ptr->thisReq.rx_retry_cnt)*HTTP_RECEIVE_WAIT_MSEC ;

            rc = mtcHttpUtil_receive ( node_ptr->thisReq );
            if ( rc == RETRY )
            {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__RECEIVE_WAIT ;
-                mtcTimer_start_msec ( node_ptr->http_timer, mtcTimer_handler, HTTP_RECEIVE_WAIT_MSEC ); 
+                mtcTimer_start_msec ( node_ptr->http_timer, mtcTimer_handler, HTTP_RECEIVE_WAIT_MSEC );

                if ((wait_time > (msec_timeout/4)) && ( node_ptr->thisReq.low_wm == false ) )
                {
@ -449,48 +448,66 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                    /* Only print every 16 starting with 2 */
                    if ( (node_ptr->thisReq.rx_retry_cnt & 0xF) == 2 )
                    {
-                        qlog ("%s rx_retry_cnt:%d\n", 
+                        qlog ("%s rx_retry_cnt:%d\n",
                                  node_ptr->thisReq.log_prefix.c_str(),
                                  node_ptr->thisReq.rx_retry_cnt );
                    }
                    break ;
                }
            }
+            #ifdef WANT_FIT_TESTING
+            if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Operation Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               node_ptr->thisReq.status = FAIL_AUTHENTICATION ;
+               rc = FAIL_OPERATION ;
+            }
+            else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Request Timeout Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               rc = FAIL_TIMEOUT ;
+            }
+            else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Connection Loss: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               node_ptr->thisReq.status = rc = FAIL_HTTP_ZERO_STATUS ;
+            }
+            #endif
            if ( rc != PASS )
            {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__FAILURE ;
            }
            else
            {
                if ( node_ptr->thisReq.cur_retries )
                {
-                    ilog ("%s Completed (after %d retries) (took %d of %d msecs)\n", 
-                              node_ptr->thisReq.log_prefix.c_str(), 
-                              node_ptr->thisReq.cur_retries, wait_time, 
+                    ilog ("%s Completed (after %d retries) (took %d of %d msecs)\n",
+                              node_ptr->thisReq.log_prefix.c_str(),
+                              node_ptr->thisReq.cur_retries, wait_time,
                              node_ptr->thisReq.timeout*1000);
                }
                else
                {
-                    qlog ("%s Completed (took %d of %d msecs)\n", 
-                              node_ptr->thisReq.log_prefix.c_str(), 
-                              wait_time, 
+                    qlog ("%s Completed (took %d of %d msecs)\n",
+                              node_ptr->thisReq.log_prefix.c_str(),
+                              wait_time,
                              node_ptr->thisReq.timeout*1000);
                }
                node_ptr->thisReq.exec_time_msec = wait_time ;

                node_ptr->thisReq.rx_retry_cnt = 0 ;
-                
+
                mtcHttpUtil_free_conn ( node_ptr->thisReq );
                mtcHttpUtil_free_base ( node_ptr->thisReq );

-                /* Don't add success responses to non-critical commands like  
+                /* Don't add success responses to non-critical commands like
                 * "update uptime" and "update task" to the done queue */
                if ( !node_ptr->thisReq.noncritical )
                {
                    /* Copy done event to the done queue */
                    node_ptr->libEvent_done_fifo.push_back(node_ptr->thisReq);
-            
+
                }
                /* Pop that done event off the work queue */
                node_ptr->libEvent_work_fifo.pop_front();
@ -503,21 +520,21 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )

            mtcHttpUtil_free_conn ( node_ptr->thisReq );
            mtcHttpUtil_free_base ( node_ptr->thisReq );
-                
+
            node_ptr->http_retries_cur++ ;
            node_ptr->thisReq.cur_retries++ ;

-            if ( node_ptr->thisReq.noncritical == true ) 
-            { 
+            if ( node_ptr->thisReq.noncritical == true )
+            {
                if ( node_ptr->thisReq.cur_retries > node_ptr->thisReq.max_retries )
                {
                    node_ptr->oper_failures++ ;

                    wlog ("%s retry conjestion abort of non-critical command (%d:%d)\n",
-                              node_ptr->thisReq.log_prefix.c_str(), 
+                              node_ptr->thisReq.log_prefix.c_str(),
                              node_ptr->thisReq.cur_retries,
                              node_ptr->thisReq.max_retries );
-                
+
                    /* Pop this aborted event off the work queue */
                    node_ptr->libEvent_work_fifo.pop_front();
                }
@ -561,7 +578,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                          node_ptr->thisReq.max_retries,
                          node_ptr->thisReq.timeout,
                          node_ptr->thisReq.noncritical ? "No" : "Yes" );
-                
+
                node_ptr->thisReq.response.clear();

                node_ptr->thisReq.status      = PASS  ;
@ -569,10 +586,10 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                node_ptr->thisReq.active      = false ;
                node_ptr->thisReq.response_len= 0     ;

-                /* 
+                /*
                 * If this is an inventory request ...
                 *
-                 * 1. Init the inv struct 
+                 * 1. Init the inv struct
                 * 2. increase the timeout if is a critical command
                 *
                 * */
@ -583,30 +600,42 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                    {
                        int temp = node_ptr->libEvent_work_fifo_ptr->timeout ;

-                        /* 
+                        /*
                         * Increase and update the timeout value for critical commands
                         * in hope that it will succeed on he next go around.
                         */
                        node_ptr->libEvent_work_fifo_ptr->timeout += get_mtcInv_ptr()->sysinv_timeout ;
-                        dlog ("%s timeout extended from %d to %d secs\n", 
+                        dlog ("%s timeout extended from %d to %d secs\n",
                                  node_ptr->thisReq.log_prefix.c_str(), temp,
                                  node_ptr->libEvent_work_fifo_ptr->timeout );
                    }
                }

                /* Save the retry count */
-                node_ptr->libEvent_work_fifo_ptr->cur_retries = 
+                node_ptr->libEvent_work_fifo_ptr->cur_retries =
                node_ptr->thisReq.cur_retries ;

-                node_ptr->libEvent_work_fifo_ptr->state = 
+                mtcTimer_start ( node_ptr->http_timer, mtcTimer_handler, HTTP_RETRY_WAIT_SECS );
+                node_ptr->libEvent_work_fifo_ptr->state =
+                node_ptr->thisReq.state = HTTP__RETRY_WAIT ;
+                dlog ("%s %d sec retry wait started", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
+            }
+            break ;
+        }
+        case HTTP__RETRY_WAIT:
+        {
+            if ( node_ptr->http_timer.ring == true )
+            {
+                dlog ("%s %d sec retry wait expired", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
+                node_ptr->libEvent_work_fifo_ptr->state =
                node_ptr->thisReq.state = HTTP__TRANSMIT ;
            }
            break ;
        }
        default:
        {
-            slog ("%s Bad libEvent work state (%d) ; clearing work/done queue\n", 
-                      node_ptr->hostname.c_str(), 
+            slog ("%s Bad libEvent work state (%d) ; clearing work/done queue\n",
+                      node_ptr->hostname.c_str(),
                      node_ptr->libEvent_work_fifo_ptr->state );
            node_ptr->libEvent_work_fifo.clear();
            node_ptr->libEvent_done_fifo.clear();
@ -623,7 +652,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
 * Description: To handle the pathalogical case where an event seems to
 *              have timed out at the callers level then this interface
 *              can be called to delete it from the work queue.
- *              
+ *
 * @param node_ptr so that the hosts work queue can be found
 * @param sequence to specify the specific sequence number to remove
 * @return always PASS since there is nothing the caller can or needs
@ -660,7 +689,7 @@ int nodeLinkClass::workQueue_del_cmd ( struct nodeLinkClass::node * node_ptr, in
 *
 * Description: Removes all items from the done queue.
 *
- * Returns a failure, the sequence number of the first command 
+ * Returns a failure, the sequence number of the first command
 * in the done queue that did not PASS.
 *
 */
@ -717,7 +746,7 @@ int nodeLinkClass::doneQueue_purge ( struct nodeLinkClass::node * node_ptr )
        {
            qlog ("%s all (%d) priority queued operations passed (qlog)\n", node_ptr->hostname.c_str(), size );
        }
-        
+
        qlog ("%s purging %d items from doneQueue\n", node_ptr->hostname.c_str(), size );
        node_ptr->libEvent_done_fifo.clear();
    }
@ -738,7 +767,7 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
    {
        /* TODO: find out how to force close a connection.
         * Don't free the connection if it is in the receiving state or
-         * we might get a segfault 
+         * we might get a segfault
         * There is only ever one connection open at a time for a specific host
         * so its only 'thisReq' we need to worry about. */
        if ( node_ptr->libEvent_work_fifo_ptr->state != HTTP__RECEIVE )
@ -754,12 +783,12 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
        {
            if ( node_ptr->libEvent_work_fifo_ptr->state == HTTP__TRANSMIT )
            {
-                wlog ("%s ... was not executed\n", 
+                wlog ("%s ... was not executed\n",
                           node_ptr->libEvent_work_fifo_ptr->log_prefix.c_str());
            }
            else
            {
-                wlog ("%s ... did not complete (%s)\n", 
+                wlog ("%s ... did not complete (%s)\n",
                           node_ptr->libEvent_work_fifo_ptr->log_prefix.c_str(),
                           _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str());
            }
@ -771,7 +800,7 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
    {
        qlog ("%s all work done\n", node_ptr->hostname.c_str());
    }
-    
+
    // node_ptr->libEvent_work_fifo_ptr->state = HTTP__TRANSMIT ;
    return (PASS);
 }
@ -793,7 +822,7 @@ int nodeLinkClass::workQueue_done ( struct nodeLinkClass::node * node_ptr )
              node_ptr->libEvent_work_fifo_ptr++ )
        {
            /* Don't report work queue timeout if there are only noncritical
-             * commands left in the work queue. Such commands might be 
+             * commands left in the work queue. Such commands might be
             * "update uptime" and "update task" */
            if ( !node_ptr->libEvent_work_fifo_ptr->noncritical )
            {
@ -862,7 +891,6 @@ bool nodeLinkClass::workQueue_present ( libEvent & event )
            }
        }
    }
-    
    wlog ("%s ... not found in work queue\n", event.log_prefix.c_str());
    return (false);
 }
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@ -78,6 +78,8 @@ bmc_reset_delay = 300        ; seconds to wait before issuing a bmc
                             ; ACK reboot requests. The delay gives
                             ; time for crashdumps to complete.

+http_retry_wait = 10         ; secs to wait between http request retries
+
 [client]                     ; Client Configuration

 scheduling_priority = 45     ; realtime scheduling; range of 1 .. 99