On Wed, Sep 15, 2021 at 8:26 PM Yucong Sun <fallentree@xxxxxx> wrote: > > From: Yucong Sun <sunyucong@xxxxxxxxx> > > This patch adds "-j" mode to test_progs, executing tests in multiple process. > "-j" mode is optional, and works with all existing test selection mechanism, as > well as "-v", "-l" etc. > > In "-j" mode, main process use UDS to communicate to each forked worker, > commanding it to run tests and collect logs. After all tests are > finished, a summary is printed. main process use multiple competing > threads to dispatch work to worker, trying to keep them all busy. > > The test status will be printed as soon as it is finished, if there are error > logs, it will be printed after the final summary line. > > By specifying "--debug", additional debug information on server/worker > communication will be printed. > > Example output: > > ./test_progs -n 15-20 -j > [ 12.801730] bpf_testmod: loading out-of-tree module taints kernel. > Launching 8 workers. > #20 btf_split:OK > #16 btf_endian:OK > #18 btf_module:OK > #17 btf_map_in_map:OK > #19 btf_skc_cls_ingress:OK > #15 btf_dump:OK > Summary: 6/20 PASSED, 0 SKIPPED, 0 FAILED > > Signed-off-by: Yucong Sun <sunyucong@xxxxxxxxx> > --- Love the error summary logic. Can we please have it in sequential mode as well?... :) Also didn't find anything obvious that would explain 20 second parallel run for few small and fast tests. > tools/testing/selftests/bpf/test_progs.c | 577 +++++++++++++++++++++-- > tools/testing/selftests/bpf/test_progs.h | 36 +- > 2 files changed, 581 insertions(+), 32 deletions(-) > [...] > @@ -661,6 +678,20 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) > case ARG_LIST_TEST_NAMES: > env->list_test_names = true; > break; > + case ARG_NUM_WORKERS: > + if (arg) { > + env->workers = atoi(arg); > + if (!env->workers) { > + fprintf(stderr, "Invalid number of worker: %s.", arg); > + return -1; I missed this problem when SELFTESTS_VERBOSE logic was added, but all the rest of the code returns -EINVAL, not -EPERM (-1). Let's keep it consistent. > + } > + } else { > + env->workers = get_nprocs(); > + } > + break; > + case ARG_DEBUG: > + env->debug = true; > + break; > case ARGP_KEY_ARG: > argp_usage(state); > break; > @@ -678,7 +709,7 @@ static void stdio_hijack(void) > env.stdout = stdout; > env.stderr = stderr; > > - if (env.verbosity > VERBOSE_NONE) { > + if (env.verbosity > VERBOSE_NONE && env.worker_id == -1) { > /* nothing to do, output to stdout by default */ > return; > } > @@ -704,10 +735,6 @@ static void stdio_restore(void) > return; > > fclose(stdout); > - free(env.log_buf); > - > - env.log_buf = NULL; > - env.log_cnt = 0; > > stdout = env.stdout; > stderr = env.stderr; > @@ -794,11 +821,444 @@ void crash_handler(int signum) > dump_test_log(env.test, true); > if (env.stdout) > stdio_restore(); > - > + if (env.worker_id != -1) > + fprintf(stderr, "[%d]: ", env.worker_id); > fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum); > backtrace_symbols_fd(bt, sz, STDERR_FILENO); > } > > +void sigint_handler(int signum) { { on new line, please run checkpatch.pl > + for (int i = 0; i < env.workers; i++) > + close(env.worker_socks[i]); this can race with env.worker_socks allocation, no? Better to install signal handler after we initialize env completely. Also it's a good idea to reset sigint handler after first ctrl-c just in case something got stuck. So that second ctrl-c will unconditionally kill the process. Nothing more annoying than application that got stuck and it doesn't die on ctrl-c > +} > + > +static int current_test_idx = 0; > +static pthread_mutex_t current_test_lock; > +static pthread_mutex_t stdout_output_lock; > + > +struct test_result { > + int error_cnt; > + int skip_cnt; > + int sub_succ_cnt; > + > + size_t log_cnt; > + char *log_buf; > +}; > + > +static struct test_result test_results[ARRAY_SIZE(prog_test_defs)]; > + > +static inline const char *str_msg(const struct msg *msg, char *buf) > +{ > + switch (msg->type) { > + case MSG_DO_TEST: > + sprintf(buf, "MSG_DO_TEST %d", msg->do_test.test_num); > + break; > + case MSG_TEST_DONE: > + sprintf(buf, "MSG_TEST_DONE %d (log: %d)", > + msg->test_done.test_num, > + msg->test_done.have_log); > + break; > + case MSG_TEST_LOG: > + sprintf(buf, "MSG_TEST_LOG (cnt: %ld, last: %d)", > + strlen(msg->test_log.log_buf), > + msg->test_log.is_last); > + break; > + case MSG_EXIT: > + sprintf(buf, "MSG_EXIT"); > + break; > + default: > + sprintf(buf, "UNKNOWN"); > + break; > + } > + > + return buf; > +} > + > +static int send_message(int sock, const struct msg *msg) > +{ > + char buf[256]; empty line after variable declaration block > + if (env.verbosity > VERBOSE_SUPER) > + fprintf(stderr, "Sending msg: %s\n", str_msg(msg, buf)); I thought you were going to make this depend on --debug? you can also combine check for --debug and verbosity levels, if necessry, so that you can have more control for debugging > + return send(sock, msg, sizeof(*msg), 0); > +} > + > +static int recv_message(int sock, struct msg *msg) > +{ > + int ret; > + char buf[256]; > + > + memset(msg, 0, sizeof(*msg)); > + ret = recv(sock, msg, sizeof(*msg), 0); > + if (ret >= 0) { > + if (env.debug) > + fprintf(stderr, "Received msg: %s\n", str_msg(msg, buf)); > + } > + return ret; > +} > + > +static void run_one_test(int test_num) { { on separate line > + struct prog_test_def *test = &prog_test_defs[test_num]; > + > + env.test = test; > + > + test->run_test(); > + > + /* ensure last sub-test is finalized properly */ > + if (test->subtest_name) > + test__end_subtest(); > + > + test->tested = true; > + > + dump_test_log(test, test->error_cnt); > + > + reset_affinity(); > + restore_netns(); > + if (test->need_cgroup_cleanup) > + cleanup_cgroup_environment(); > +} > + > +static const char *get_test_name(int idx) > +{ > + struct prog_test_def *test; > + > + test = &prog_test_defs[idx]; > + return test->test_name; > +} seems like a quite useless helper, prog_tests_defs[idx].test_name is just as short. If you are worried about prog_test_defs[] setup changing, then adding helper to return pointer to the test definition itself is more useful, because you fetch that test def pointer in more places > + > +struct dispatch_data { > + int worker_id; > + int sock_fd; > +}; > + > +void *dispatch_thread(void *ctx) static > +{ > + struct dispatch_data *data = ctx; > + int sock_fd; > + FILE *log_fd = NULL; not a FD, log_file/log_stream? > + > + sock_fd = data->sock_fd; > + > + while (true) { > + int test_to_run = -1; no need to initialize, it's always set if it's used. This -1 made me search the code needlessly to figure out where this -1 matters. > + struct prog_test_def *test; > + struct test_result *result; > + > + /* grab a test */ > + { > + pthread_mutex_lock(¤t_test_lock); > + > + if (current_test_idx >= prog_test_cnt) { > + pthread_mutex_unlock(¤t_test_lock); > + goto done; > + } > + > + test = &prog_test_defs[current_test_idx]; > + test_to_run = current_test_idx; > + current_test_idx++; > + > + pthread_mutex_unlock(¤t_test_lock); > + } > + > + if (!test->should_run) { > + continue; > + } styling: no {} around single-line if bodies. But instead of locking/unlocking and then checking, why not just find the next test that should_run above? > + > + > + /* run test through worker */ > + { > + struct msg msg_do_test; > + > + msg_do_test.type = MSG_DO_TEST; > + msg_do_test.do_test.test_num = test_to_run; > + if (send_message(sock_fd, &msg_do_test) < 0) { > + perror("Fail to send command"); > + goto done; > + } > + env.worker_current_test[data->worker_id] = test_to_run; > + } > + > + /* wait for test done */ > + { > + struct msg msg_test_done; > + > + if (errno = recv_message(sock_fd, &msg_test_done) < 0) is this errno = (recv_message(sock_fd, &msg_test_done) < 0) && errno != 0 or errno = recv_message(sock_fd, &msg_test_done); if (errno < 0) { ... } ? So please no. And also don't use errno (global thread-local variable declared by libc) for this, declare `int err`. > + goto error; > + if (msg_test_done.type != MSG_TEST_DONE) > + goto error; > + if (test_to_run != msg_test_done.test_done.test_num) > + goto error; > + > + test->tested = true; > + result = &test_results[test_to_run]; > + > + result->error_cnt = msg_test_done.test_done.error_cnt; > + result->skip_cnt = msg_test_done.test_done.skip_cnt; > + result->sub_succ_cnt = msg_test_done.test_done.sub_succ_cnt; s/msg_test_done/msg/, same above for msg_do_test. You have a local variable in extra {} lexical scope, so no conflict. And it will read non-redundantly: msg.test_done.sub_succ_cnt. > + > + /* collect all logs */ > + if (msg_test_done.test_done.have_log) { > + log_fd = open_memstream(&result->log_buf, &result->log_cnt); > + if (!log_fd) > + goto error; > + > + while (true) { > + struct msg msg_log; > + > + if (recv_message(sock_fd, &msg_log) < 0) > + goto error; > + if (msg_log.type != MSG_TEST_LOG) > + goto error; > + > + fprintf(log_fd, "%s", msg_log.test_log.log_buf); > + if (msg_log.test_log.is_last) > + break; > + } > + fclose(log_fd); > + log_fd = NULL; > + } > + /* output log */ > + { > + pthread_mutex_lock(&stdout_output_lock); > + > + if (result->log_cnt) { > + result->log_buf[result->log_cnt] = '\0'; > + fprintf(stdout, "%s", result->log_buf); > + if (result->log_buf[result->log_cnt - 1] != '\n') > + fprintf(stdout, "\n"); > + } > + > + fprintf(stdout, "#%d %s:%s\n", > + test->test_num, test->test_name, > + result->error_cnt ? "FAIL" : (result->skip_cnt ? "SKIP" : "OK")); > + > + pthread_mutex_unlock(&stdout_output_lock); > + } > + > + } /* wait for test done */ > + } /* while (true) */ > +error: > + if (env.debug) > + fprintf(stderr, "[%d]: Protocol/IO error: %s.\n", data->worker_id, strerror(errno)); you don't always set errno above when you do `goto error`. Please don't use errno directly, it's better to save it into int err, and do `err = -errno; goto error;` everywhere. It's way too easy to forget and clobber errno. > + > + if (log_fd) > + fclose(log_fd); > +done: > + { > + struct msg msg_exit; > + > + msg_exit.type = MSG_EXIT; > + if (send_message(sock_fd, &msg_exit) < 0) { > + if (env.debug) > + fprintf(stderr, "[%d]: send_message msg_exit: %s.\n", > + data->worker_id, strerror(errno)); > + } > + } > + return NULL; > +} > + > +static int server_main(void) > +{ > + pthread_t *dispatcher_threads; > + struct dispatch_data *data; > + > + dispatcher_threads = calloc(sizeof(pthread_t), env.workers); > + data = calloc(sizeof(struct dispatch_data), env.workers); > + > + env.worker_current_test = calloc(sizeof(int), env.workers); nit: this is backwards use of calloc(). First argument is the count, the second is the size of the element. I don't know if that matters, but it might matter for some alignment logic. So please swap the order. > + for (int i = 0; i < env.workers; i++) { int i inside for isn't C89, please declare outside. And we should specify whatever the compiler flags are specified for the kernel to make the compiler complain about this... > + int rc; > + > + data[i].worker_id = i; > + data[i].sock_fd = env.worker_socks[i]; > + rc = pthread_create(&dispatcher_threads[i], NULL, dispatch_thread, &data[i]); > + if (rc < 0) { > + perror("Failed to launch dispatcher thread"); > + return -1; > + } > + } > + > + /* wait for all dispatcher to finish */ > + for (int i = 0; i < env.workers; i++) { same > + while (true) { > + struct timespec timeout = { > + .tv_sec = time(NULL) + 5, > + .tv_nsec = 0 > + }; empty line goes here, checkpatch.pl goes everywhere :) > + if (pthread_timedjoin_np(dispatcher_threads[i], NULL, &timeout) != ETIMEDOUT) > + break; > + if (env.debug) > + fprintf(stderr, "Still waiting for thread %d (test %d).\n", > + i, env.worker_current_test[i] + 1); > + } > + } > + free(dispatcher_threads); > + free(env.worker_current_test); > + free(data); > + > + /* generate summary */ > + fflush(stderr); > + fflush(stdout); > + > + for (int i = 0; i < prog_test_cnt; i++) { > + struct prog_test_def *current_test; > + struct test_result *result; > + > + current_test = &prog_test_defs[i]; > + result = &test_results[i]; > + > + if (!current_test->tested) > + continue; > + > + env.succ_cnt += result->error_cnt ? 0 : 1; > + env.skip_cnt += result->skip_cnt; > + env.fail_cnt += result->error_cnt; > + env.sub_succ_cnt += result->sub_succ_cnt; > + } > + > + fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", > + env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); > + if (env.fail_cnt) > + fprintf(stdout, "\nAll error logs:\n"); > + > + /* print error logs again */ > + > + for (int i = 0; i < prog_test_cnt; i++) { > + struct prog_test_def *current_test; > + struct test_result *result; > + > + current_test = &prog_test_defs[i]; > + result = &test_results[i]; > + > + if (!current_test->tested || !result->error_cnt) > + continue; > + > + if (result->log_cnt) { > + result->log_buf[result->log_cnt] = '\0'; > + fprintf(stdout, "%s", result->log_buf); > + if (result->log_buf[result->log_cnt - 1] != '\n') > + fprintf(stdout, "\n"); > + } > + fprintf(stdout, "#%d %s:%s\n", > + current_test->test_num, current_test->test_name, > + result->error_cnt ? "FAIL" : (result->skip_cnt ? "SKIP" : "OK")); This if and fprintfs is duplicated, can you move it into a helper? > + } > + > + /* reap all workers */ > + for (int i = 0; i < env.workers; i++) { > + int wstatus, pid; > + > + pid = waitpid(env.worker_pids[i], &wstatus, 0); > + if (pid != env.worker_pids[i]) > + perror("Unable to reap worker"); > + } > + > + return 0; > +} > + > +static int worker_main(int sock) > +{ > + save_netns(); > + > + while (true) { > + /* receive command */ > + struct msg msg; > + > + if (recv_message(sock, &msg) < 0) > + goto out; > + > + switch (msg.type) { > + case MSG_EXIT: > + if (env.debug) > + fprintf(stderr, "[%d]: worker exit.\n", > + env.worker_id); > + goto out; > + case MSG_DO_TEST: { > + int test_to_run; > + struct prog_test_def *test; > + struct msg msg_done; > + > + test_to_run = msg.do_test.test_num; > + > + if (env.debug) > + fprintf(stderr, "[%d]: #%d:%s running.\n", > + env.worker_id, > + test_to_run + 1, > + get_test_name(test_to_run)); > + > + test = &prog_test_defs[test_to_run]; > + > + stdio_hijack(); > + > + run_one_test(test_to_run); > + > + stdio_restore(); > + > + memset(&msg_done, 0, sizeof(msg_done)); just reuse msg, it's not used anymore at this point > + msg_done.type = MSG_TEST_DONE; > + msg_done.test_done.test_num = test_to_run; > + msg_done.test_done.error_cnt = test->error_cnt; > + msg_done.test_done.skip_cnt = test->skip_cnt; > + msg_done.test_done.sub_succ_cnt = test->sub_succ_cnt; > + msg_done.test_done.have_log = false; > + > + if (env.verbosity > VERBOSE_NONE || test->force_log || test->error_cnt) { > + if (env.log_cnt) > + msg_done.test_done.have_log = true; > + } > + if (send_message(sock, &msg_done) < 0) { > + perror("Fail to send message done"); > + goto out; > + } > + > + /* send logs */ > + if (msg_done.test_done.have_log) { > + char *src; > + size_t slen; > + > + src = env.log_buf; > + slen = env.log_cnt; > + while (slen) { > + struct msg msg_log; same, msg can be just reused > + char *dest; > + size_t len; > + > + memset(&msg_log, 0, sizeof(msg_log)); > + msg_log.type = MSG_TEST_LOG; > + dest = msg_log.test_log.log_buf; > + len = slen >= MAX_LOG_TRUNK_SIZE ? MAX_LOG_TRUNK_SIZE : slen; > + memcpy(dest, src, len); > + > + src += len; > + slen -= len; > + if (!slen) > + msg_log.test_log.is_last = true; > + > + assert(send_message(sock, &msg_log) >= 0); missed assert here > + } > + } > + if (env.log_buf) { > + free(env.log_buf); > + env.log_buf = NULL; > + env.log_cnt = 0; > + } > + if (env.debug) > + fprintf(stderr, "[%d]: #%d:%s done.\n", > + env.worker_id, > + test_to_run + 1, > + get_test_name(test_to_run)); > + break; > + } /* case MSG_DO_TEST */ > + default: > + if (env.debug) > + fprintf(stderr, "[%d]: unknown message.\n", env.worker_id); > + return -1; > + } > + } > +out: > + restore_netns(); > + return 0; > +} > + > int main(int argc, char **argv) > { > static const struct argp argp = { > @@ -809,10 +1269,15 @@ int main(int argc, char **argv) > struct sigaction sigact = { > .sa_handler = crash_handler, > .sa_flags = SA_RESETHAND, > - }; > + }; > + struct sigaction sigact_int = { > + .sa_handler = sigint_handler, > + .sa_flags = SA_RESETHAND, > + }; > int err, i; > > sigaction(SIGSEGV, &sigact, NULL); > + sigaction(SIGINT, &sigact_int, NULL); as mentioned above, let's delay it until we setup environment completely? > > err = argp_parse(&argp, argc, argv, 0, NULL, &env); > if (err) > @@ -837,21 +1302,76 @@ int main(int argc, char **argv) > return -1; > } > > - save_netns(); > - stdio_hijack(); > + env.stdout = stdout; > + env.stderr = stderr; > + > env.has_testmod = true; > if (!env.list_test_names && load_bpf_testmod()) { > fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); > env.has_testmod = false; > } > + > + /* initializing tests */ > for (i = 0; i < prog_test_cnt; i++) { > struct prog_test_def *test = &prog_test_defs[i]; > > - env.test = test; > test->test_num = i + 1; > - > - if (!should_run(&env.test_selector, > + if (should_run(&env.test_selector, > test->test_num, test->test_name)) > + test->should_run = true; > + else > + test->should_run = false; test->should_run = should_run(...) > + } > + > + /* ignore workers if we are just listing */ > + if (env.get_test_cnt || env.list_test_names) > + env.workers = 0; > + > + /* launch workers if requested */ > + env.worker_id = -1; /* main process */ > + if (env.workers) { > + env.worker_pids = calloc(sizeof(__pid_t), env.workers); > + env.worker_socks = calloc(sizeof(int), env.workers); check for NULLs > + fprintf(stdout, "Launching %d workers.\n", env.workers); if (debug) > + for (int i = 0; i < env.workers; i++) { > + int sv[2]; > + pid_t pid; > + > + if (socketpair(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sv) < 0) { > + perror("Fail to create worker socket"); > + return -1; > + } > + pid = fork(); > + if (pid < 0) { > + perror("Failed to fork worker"); > + return -1; > + } else if (pid != 0) { /* main process */ > + close(sv[1]); > + env.worker_pids[i] = pid; > + env.worker_socks[i] = sv[0]; > + } else { /* inside each worker process */ > + close(sv[0]); > + env.worker_id = i; > + return worker_main(sv[1]); > + } > + } > + > + if (env.worker_id == -1) { you have return worker_main above, can't be anything else, drop unnecessary ifs > + server_main(); > + goto out; > + } > + } > + > + /* The rest of the main process */ > + > + /* on single mode */ > + save_netns(); > + stdio_hijack(); > + > + for (i = 0; i < prog_test_cnt; i++) { > + struct prog_test_def *test = &prog_test_defs[i]; > + > + if (!test->should_run) > continue; > > if (env.get_test_cnt) { > @@ -865,14 +1385,7 @@ int main(int argc, char **argv) > continue; > } > > - test->run_test(); > - /* ensure last sub-test is finalized properly */ > - if (test->subtest_name) > - test__end_subtest(); > - > - test->tested = true; > - > - dump_test_log(test, test->error_cnt); > + run_one_test(i); > > fprintf(env.stdout, "#%d %s:%s\n", > test->test_num, test->test_name, > @@ -882,16 +1395,16 @@ int main(int argc, char **argv) > env.fail_cnt++; > else > env.succ_cnt++; > - skip_account(); > > - reset_affinity(); > - restore_netns(); > - if (test->need_cgroup_cleanup) > - cleanup_cgroup_environment(); > + skip_account(); can you please double-check skip accounting in parallel case? I'm not sure it's there. > + env.sub_succ_cnt += test->sub_succ_cnt; > } > - if (!env.list_test_names && env.has_testmod) > - unload_bpf_testmod(); > stdio_restore(); > + if (env.log_buf) { > + free(env.log_buf); > + env.log_buf = NULL; > + env.log_cnt = 0; > + } [...]