Hi, we've got a bug report of git-log stall in below: https://bugzilla.opensuse.org/show_bug.cgi?id=942297 In short, git-log is left unterminated sometimes when pager is aborted. When this happens, git process can't be killed by SIGTERM, but only by SIGKILL. And, further investigation revealed the possible mutex deadlock used in glibc *alloc()/free(). I tried to reproduce this by adding a fault injection in glibc code, and actually got the similar stack trace as reported. The problem is that glibc malloc (in this case realloc() and free()) takes a private mutex. Thus calling any function does *alloc() or free() in a signal handler may deadlock. In the case of git, it was free() calls in various cleanup codes and the printf() / strerror() invocations. Also, basically it's not safe to call fflush() or raise(), either. But they seem to work practically on the current systems. Below is a band-aid patch I tested and confirmed to work in the fault-injection case. But, some unsafe calls mentioned in the above are left. If we want to be in a safer side, we should really limit the things to do in a signal handler, e.g. only calling close() and doing waitpid(), I suppose. Any better ideas? thanks, Takashi --- diff --git a/pager.c b/pager.c index 27d4c8a17aa1..57dda0142fa9 100644 --- a/pager.c +++ b/pager.c @@ -14,19 +14,25 @@ static const char *pager_argv[] = { NULL, NULL }; static struct child_process pager_process = CHILD_PROCESS_INIT; -static void wait_for_pager(void) +static void flush_pager(void) { fflush(stdout); fflush(stderr); /* signal EOF to pager */ close(1); close(2); +} + +static void wait_for_pager(void) +{ + flush_pager(); finish_command(&pager_process); } static void wait_for_pager_signal(int signo) { - wait_for_pager(); + flush_pager(); + finish_command_in_signal(&pager_process); sigchain_pop(signo); raise(signo); } diff --git a/run-command.c b/run-command.c index 3277cf797ed4..a8cdc8f32944 100644 --- a/run-command.c +++ b/run-command.c @@ -18,26 +18,27 @@ struct child_to_clean { static struct child_to_clean *children_to_clean; static int installed_child_cleanup_handler; -static void cleanup_children(int sig) +static void cleanup_children(int sig, int in_signal) { while (children_to_clean) { struct child_to_clean *p = children_to_clean; children_to_clean = p->next; kill(p->pid, sig); - free(p); + if (!in_signal) + free(p); } } static void cleanup_children_on_signal(int sig) { - cleanup_children(sig); + cleanup_children(sig, 1); sigchain_pop(sig); raise(sig); } static void cleanup_children_on_exit(void) { - cleanup_children(SIGTERM); + cleanup_children(SIGTERM, 0); } static void mark_child_for_cleanup(pid_t pid) @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); } -static int wait_or_whine(pid_t pid, const char *argv0) +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal) { int status, code = -1; pid_t waiting; @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0) if (waiting < 0) { failed_errno = errno; - error("waitpid for %s failed: %s", argv0, strerror(errno)); + if (!in_signal) + error("waitpid for %s failed: %s", argv0, + strerror(errno)); } else if (waiting != pid) { - error("waitpid is confused (%s)", argv0); + if (!in_signal) + error("waitpid is confused (%s)", argv0); } else if (WIFSIGNALED(status)) { code = WTERMSIG(status); if (code != SIGINT && code != SIGQUIT) - error("%s died of signal %d", argv0, code); + if (!in_signal) + error("%s died of signal %d", argv0, code); /* * This return value is chosen so that code & 0xff * mimics the exit code that a POSIX shell would report for @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0) failed_errno = ENOENT; } } else { - error("waitpid is confused (%s)", argv0); + if (!in_signal) + error("waitpid is confused (%s)", argv0); } - clear_child_for_cleanup(pid); + if (!in_signal) + clear_child_for_cleanup(pid); errno = failed_errno; return code; @@ -437,7 +444,7 @@ fail_pipe: * At this point we know that fork() succeeded, but execvp() * failed. Errors have been reported to our stderr. */ - wait_or_whine(cmd->pid, cmd->argv[0]); + wait_or_whine(cmd->pid, cmd->argv[0], 0); failed_errno = errno; cmd->pid = -1; } @@ -536,12 +543,18 @@ fail_pipe: int finish_command(struct child_process *cmd) { - int ret = wait_or_whine(cmd->pid, cmd->argv[0]); + int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0); argv_array_clear(&cmd->args); argv_array_clear(&cmd->env_array); return ret; } +int finish_command_in_signal(struct child_process *cmd) +{ + return wait_or_whine(cmd->pid, cmd->argv[0], 1); +} + + int run_command(struct child_process *cmd) { int code; @@ -772,7 +785,7 @@ error: int finish_async(struct async *async) { #ifdef NO_PTHREADS - return wait_or_whine(async->pid, "child process"); + return wait_or_whine(async->pid, "child process", 0); #else void *ret = (void *)(intptr_t)(-1); diff --git a/run-command.h b/run-command.h index 5b4425a3cbe1..275d35c442ac 100644 --- a/run-command.h +++ b/run-command.h @@ -50,6 +50,7 @@ void child_process_init(struct child_process *); int start_command(struct child_process *); int finish_command(struct child_process *); +int finish_command_in_signal(struct child_process *); int run_command(struct child_process *); /* -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html