Work around CHILD_MAX bash limitation for async

Apparently bash (following POSIX's CHILD_MAX) only guarantees that a
small number (32ish) of children can be started and still have their
exit statuses retrieved at any given point. On larger jobs with lots
of plugins and additional work, we may go over that limit, especially
for long-lived children, such as the install_tempest task.

This works around that issue by creating a fifo for each child at
spawn time. When the child has finished its work, it blocks on a read
against that fifo (and thus does not exit). When the parent goes to
wait on the child, it first writes to that fifo, unblocking the child
so that it exits close to the time we actually wait on it.

Closes-Bug: #1923728
Change-Id: Id755bdb1e7f1664ec08742d034c174e87a3d2902
This commit is contained in:
Dan Smith 2021-04-14 14:27:32 -07:00
parent ef1e9ada9b
commit aa5c38727b

View File

@ -57,6 +57,7 @@ function async_log {
function async_inner { function async_inner {
local name="$1" local name="$1"
local rc local rc
local fifo=${DEST}/async/${name}.fifo
shift shift
set -o xtrace set -o xtrace
if $* >${DEST}/async/${name}.log 2>&1; then if $* >${DEST}/async/${name}.log 2>&1; then
@ -69,6 +70,8 @@ function async_inner {
async_log "$name" "FAILED with rc $rc" async_log "$name" "FAILED with rc $rc"
fi fi
iniset ${DEST}/async/${name}.ini job end_time $(date "+%s%3N") iniset ${DEST}/async/${name}.ini job end_time $(date "+%s%3N")
# Block on the fifo until we are signaled to exit by the main process
cat $fifo
return $rc return $rc
} }
@ -86,12 +89,14 @@ function async_run {
local name="$1" local name="$1"
shift shift
local inifile=${DEST}/async/${name}.ini local inifile=${DEST}/async/${name}.ini
local fifo=${DEST}/async/${name}.fifo
touch $inifile touch $inifile
iniset $inifile job command "$*" iniset $inifile job command "$*"
iniset $inifile job start_time $(date +%s%3N) iniset $inifile job start_time $(date +%s%3N)
if [[ "$DEVSTACK_PARALLEL" = "True" ]]; then if [[ "$DEVSTACK_PARALLEL" = "True" ]]; then
mkfifo $fifo
async_inner $name $* & async_inner $name $* &
iniset $inifile job pid $! iniset $inifile job pid $!
async_log "$name" "running: %command" async_log "$name" "running: %command"
@ -119,17 +124,23 @@ function async_wait {
xtrace=$(set +o | grep xtrace) xtrace=$(set +o | grep xtrace)
set +o xtrace set +o xtrace
local pid rc running inifile runtime local pid rc running inifile runtime fifo
rc=0 rc=0
for name in $*; do for name in $*; do
running=$(ls ${DEST}/async/*.ini 2>/dev/null | wc -l) running=$(ls ${DEST}/async/*.ini 2>/dev/null | wc -l)
inifile="${DEST}/async/${name}.ini" inifile="${DEST}/async/${name}.ini"
fifo=${DEST}/async/${name}.fifo
if pid=$(async_pidof "$name"); then if pid=$(async_pidof "$name"); then
async_log "$name" "Waiting for completion of %command" \ async_log "$name" "Waiting for completion of %command" \
"($running other jobs running)" "($running other jobs running)"
time_start async_wait time_start async_wait
if [[ "$pid" != "self" ]]; then if [[ "$pid" != "self" ]]; then
# Signal the child to go ahead and exit since we are about to
# wait for it to collect its status.
echo "Signaling exit"
echo WAKEUP > $fifo
echo "Signaled"
# Do not actually call wait if we ran synchronously # Do not actually call wait if we ran synchronously
if wait $pid; then if wait $pid; then
rc=0 rc=0
@ -137,6 +148,7 @@ function async_wait {
rc=$? rc=$?
fi fi
cat ${DEST}/async/${name}.log cat ${DEST}/async/${name}.log
rm -f $fifo
fi fi
time_stop async_wait time_stop async_wait
local start_time local start_time