[kvm-unit-tests PATCH 2/2] run_tests: allow run tests in parallel

Peter Xu <peterx@xxxxxxxxxx> · Sun, 1 Jan 2017 18:34:46 +0800

run_tests.sh is getting slow. This patch makes it faster by running the
tests concurrently.

First of all, we provide a new parameter "-j" for run_tests.sh, which
specifies how many run queues to use for the tests. When "-j" is not
provided, the old behavior is kept.

When the tests are running concurrently, we use a separate log file
for each test case (currently located in the logs/ dir, with the name
test.TESTNAME.log), to keep the test logs from getting mixed together.

A quick test on my laptop (x86 with 4 cores and 2 threads per core, so 8
logical processors) shows a roughly 3x improvement in overall test time:

   |------------------+-----------|
   | command          | time used |
   |------------------+-----------|
   | run_tests.sh     | 75s       |
   | run_tests.sh -j8 | 27s       |
   |------------------+-----------|

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
 run_tests.sh            |  19 +++++-
 scripts/functions.bash  |  20 ++++++-
 scripts/global.bash     |  13 ++++
 scripts/mkstandalone.sh |   1 +
 scripts/task.bash       | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+), 4 deletions(-)
 create mode 100644 scripts/task.bash

diff --git a/run_tests.sh b/run_tests.sh
index a04bfce..8794aa0 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -8,16 +8,18 @@ if [ ! -f config.mak ]; then
 fi
 source config.mak
 source scripts/global.bash
+source scripts/task.bash
 source scripts/functions.bash
 
 function usage()
 {
 cat <<EOF
 
-Usage: $0 [-g group] [-h] [-v]
+Usage: $0 [-g group] [-h] [-v] [-j N]
 
     -g: Only execute tests in the given group
     -h: Output this help text
+    -j: Execute tests in parallel
     -v: Enables verbose mode
 
 Set the environment variable QEMU=/path/to/qemu-system-ARCH to
@@ -29,7 +31,7 @@ EOF
 RUNTIME_arch_run="./$TEST_DIR/run"
 source scripts/runtime.bash
 
-while getopts "g:hv" opt; do
+while getopts "g:hj:v" opt; do
     case $opt in
         g)
             only_group=$OPTARG
@@ -38,6 +40,13 @@ while getopts "g:hv" opt; do
             usage
             exit
             ;;
+        j)
+            ut_run_queues=$OPTARG
+            if ! is_number "$ut_run_queues"; then
+                echo "Invalid -j option: $ut_run_queues"
+                exit 1
+            fi
+            ;;
         v)
             verbose="yes"
             ;;
@@ -57,6 +66,12 @@ RUNTIME_log_stdout () {
     fi
 }
 
+if ut_in_parallel; then
+    rm -rf $ut_log_dir
+    mkdir $ut_log_dir
+    task_set_queue_num $ut_run_queues
+fi
+
 config=$TEST_DIR/unittests.cfg
 rm -f $ut_default_log_file
 printf "BUILD_HEAD=$(cat build-head)\n\n" > $ut_default_log_file
diff --git a/scripts/functions.bash b/scripts/functions.bash
index 90daed4..0da08e6 100644
--- a/scripts/functions.bash
+++ b/scripts/functions.bash
@@ -1,7 +1,18 @@
+source scripts/global.bash
+source scripts/task.bash
+
 function run_task()
 {
-	RUNTIME_log_file=$ut_default_log_file
-	"$@"
+	local testname="$2"
+
+	if ut_in_parallel; then
+		RUNTIME_log_file="${ut_log_dir}/test.${testname}.log"
+		# run in background
+		task_enqueue "$@"
+	else
+		RUNTIME_log_file=$ut_default_log_file
+		"$@"
+	fi
 }
 
 function for_each_unittest()
@@ -51,5 +62,10 @@ function for_each_unittest()
 		fi
 	done
 	run_task "$cmd" "$testname" "$groups" "$smp" "$kernel" "$opts" "$arch" "$check" "$accel" "$timeout"
+
+	if ut_in_parallel; then
+		task_wait_all
+	fi
+
 	exec {fd}<&-
 }
diff --git a/scripts/global.bash b/scripts/global.bash
index 9076785..dfcf0fe 100644
--- a/scripts/global.bash
+++ b/scripts/global.bash
@@ -1 +1,14 @@
 : ${ut_default_log_file:=test.log}
+: ${ut_log_dir:=logs}
+# how many run queues for the unit tests
+: ${ut_run_queues:=1}
+
+function ut_in_parallel()
+{
+    [[ $ut_run_queues != 1 ]]
+}
+
+function is_number()
+{
+    [[ "$1" =~ ^[0-9]+$ ]]
+}
diff --git a/scripts/mkstandalone.sh b/scripts/mkstandalone.sh
index d2bae19..b6c23c6 100755
--- a/scripts/mkstandalone.sh
+++ b/scripts/mkstandalone.sh
@@ -5,6 +5,7 @@ if [ ! -f config.mak ]; then
 	exit 1
 fi
 source config.mak
+source scripts/global.bash
 source scripts/functions.bash
 
 escape ()
diff --git a/scripts/task.bash b/scripts/task.bash
new file mode 100644
index 0000000..4b74e0e
--- /dev/null
+++ b/scripts/task.bash
@@ -0,0 +1,156 @@
+###################################################################
+#
+# This is a bash library that allows running multiple tasks in the
+# background.
+#
+# Exported interface:
+#
+# - task_enqueue:     enqueue a command to run in the bg
+# - task_wait_all:    wait until all the tasks are finished
+#
+# A sample test code:
+#
+#   source task.bash
+#   for i in $(seq 10); do
+#       task_enqueue sleep $i
+#   done
+#   task_wait_all
+#
+# NOTE: SIGUSR1 is used to deliver task notifications.
+#
+# Author(s): Peter Xu <peterx@xxxxxxxxxx>
+#
+###################################################################
+
+task_debug=false                # debug flag
+task_max_n=5                    # concurrent task number
+
+# stores the main process that sourced this library
+task_main_pid=$$
+task_cur_n=0
+
+declare -a task_pid_list
+
+task_set_queue_num()
+{
+    task_max_n=$1
+}
+
+__task_print()
+{
+    echo "$@" >&2
+}
+
+__task_debug()
+{
+    if $task_debug; then
+        __task_print "$@"
+    fi
+}
+
+__task_sig_handler()
+{
+    local i pid
+
+    # wait for a short time to make sure the subprocess that sent
+    # this signal has fully exited. 200ms should be more than enough
+    # on most systems.
+    sleep 0.2
+
+    __task_debug "Detected child die"
+
+    for (( i=0; i<$task_max_n; i++ )); do
+        pid="${task_pid_list[$i]}"
+        if [[ -z "$pid" ]]; then
+            __task_debug "  Task slot $i empty"
+            continue;
+        fi
+        if ! kill -0 $pid &> /dev/null; then
+            __task_debug "  Child $pid died"
+            task_pid_list[$i]=""
+        else
+            __task_debug "  Child $pid still working"
+        fi
+    done
+}
+trap __task_sig_handler SIGUSR1
+
+__task_cur_move()
+{
+    task_cur_n=$(( $task_cur_n + 1 ))
+    if [[ $task_cur_n == $task_max_n ]]; then
+        task_cur_n=0
+    fi
+    __task_debug "Moving task pointer to $task_cur_n"
+}
+
+__task_run()
+{
+    "$@"
+    kill -USR1 $task_main_pid
+    __task_debug "Child $BASHPID quitting"
+}
+
+task_enqueue()
+{
+    local slot ret
+    local miss_cnt=0
+
+    # try to find an empty slot and run the task. If the queue is
+    # full, we wait until we get an empty slot.
+    while :; do
+        if [[ -z "${task_pid_list[$task_cur_n]}" ]]; then
+            __task_debug "Found avail slot $task_cur_n"
+            slot=$task_cur_n
+            __task_cur_move
+            break
+        fi
+        __task_cur_move
+        miss_cnt=$(( $miss_cnt + 1 ))
+        if [[ $miss_cnt == $task_max_n ]]; then
+            # we looped over the tasks, no free slot, then we wait for
+            # any of them to quit. Here "wait" can return 138 (128 +
+            # SIGUSR1, i.e. interrupted by our trap; not ECHILD) or 0
+            # (no child exists any more). Other retcodes are erroneous.
+            __task_debug "Failed to find empty slot, will wait"
+            wait
+            ret=$?
+            if [[ $ret != 0 && $ret != 138 ]]; then
+                __task_print "Error: wait retcode illegal: $ret"
+                exit 1
+            fi
+            # we should have at least one empty slot now, reset the
+            # miss counter and retry. Logically we will for sure have
+            # an empty slot in the next iteration.
+            miss_cnt=0
+        fi
+    done
+
+    __task_debug "Starting task at slot $slot: '$@'"
+    __task_run "$@" &
+
+    task_pid_list[$slot]=$!
+}
+
+task_wait_all()
+{
+    local ret=0
+
+    while :; do
+        wait
+        ret=$?
+        if [[ $ret == 0 ]]; then
+            # all children have quit
+            return 0
+        elif [[ $ret == 138 ]]; then
+            # one of the children may have quit, but we need to keep
+            # waiting
+            continue
+        else
+            # this should not happen; if it does, we report the error
+            # and stop the loop
+            __task_print "Error: wait() failed with ret: $ret"
+            return 1
+        fi
+    done
+}
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html