On Thu, 03 Sep 2015 20:12:38 +0200, Junio C Hamano wrote: > > Takashi Iwai <tiwai@xxxxxxx> writes: > > > we've got a bug report of git-log stall in below: > > https://bugzilla.opensuse.org/show_bug.cgi?id=942297 > > > > In short, git-log is left unterminated sometimes when pager is > > aborted. When this happens, git process can't be killed by SIGTERM, > > but only by SIGKILL. And, further investigation revealed the possible > > mutex deadlock used in glibc *alloc()/free(). > > > > I tried to reproduce this by adding a fault injection in glibc code, > > and actually got the similar stack trace as reported. The problem is > > that glibc malloc (in this case realloc() and free()) takes a private > > mutex. Thus calling any function does *alloc() or free() in a signal > > handler may deadlock. In the case of git, it was free() calls in > > various cleanup codes and the printf() / strerror() invocations. > > > > Also, basically it's not safe to call fflush() or raise(), either. > > But they seem to work practically on the current systems. > > > > Below is a band-aid patch I tested and confirmed to work in the > > fault-injection case. But, some unsafe calls mentioned in the above > > are left. If we want to be in a safer side, we should really limit > > the things to do in a signal handler, e.g. only calling close() and > > doing waitpid(), I suppose. > > > > Any better ideas? > > Sorry, but I am not with better ideas (at least offhand right now). > > Essentially the idea seems to be to avoid calling allocation-related > functions in the signal handler codepath that is used for cleaning > things up. Not calling free() in the codepaths is perfectly fine as > we know we will be soon exiting anyway. Not being able to call > error() and strerror() to report funnies (other than the fact that > we were interrupted) is somewhat sad, though. Reading signal(7) again, I correct some of my statement: raise() is safe to use. But fflush() still isn't. Maybe fflush() should be avoided but just call only close(). Regarding the error message: write() is still safe to use, so it is possible in some level. strerror() seems invoking malloc(), judging from the stack trace in bugzilla, so this needs to be avoided. printf() is a blackbox, so it's hard to tell. That is, in the worst case, we can just call write() directly for serious errors, if any. Takashi > > > > > > diff --git a/pager.c b/pager.c > > index 27d4c8a17aa1..57dda0142fa9 100644 > > --- a/pager.c > > +++ b/pager.c > > @@ -14,19 +14,25 @@ > > static const char *pager_argv[] = { NULL, NULL }; > > static struct child_process pager_process = CHILD_PROCESS_INIT; > > > > -static void wait_for_pager(void) > > +static void flush_pager(void) > > { > > fflush(stdout); > > fflush(stderr); > > /* signal EOF to pager */ > > close(1); > > close(2); > > +} > > + > > +static void wait_for_pager(void) > > +{ > > + flush_pager(); > > finish_command(&pager_process); > > } > > > > static void wait_for_pager_signal(int signo) > > { > > - wait_for_pager(); > > + flush_pager(); > > + finish_command_in_signal(&pager_process); > > sigchain_pop(signo); > > raise(signo); > > } > > diff --git a/run-command.c b/run-command.c > > index 3277cf797ed4..a8cdc8f32944 100644 > > --- a/run-command.c > > +++ b/run-command.c > > @@ -18,26 +18,27 @@ struct child_to_clean { > > static struct child_to_clean *children_to_clean; > > static int installed_child_cleanup_handler; > > > > -static void cleanup_children(int sig) > > +static void cleanup_children(int sig, int in_signal) > > { > > while (children_to_clean) { > > struct child_to_clean *p = children_to_clean; > > children_to_clean = p->next; > > kill(p->pid, sig); > > - free(p); > > + if (!in_signal) > > + free(p); > > } > > } > > > > static void cleanup_children_on_signal(int sig) > > { > > - cleanup_children(sig); > > + cleanup_children(sig, 1); > > sigchain_pop(sig); > > raise(sig); > > } > > > > static void cleanup_children_on_exit(void) > > { > > - cleanup_children(SIGTERM); > > + cleanup_children(SIGTERM, 0); > > } > > > > static void mark_child_for_cleanup(pid_t pid) > > @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd) > > fcntl(fd, F_SETFD, flags | FD_CLOEXEC); > > } > > > > -static int wait_or_whine(pid_t pid, const char *argv0) > > +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal) > > { > > int status, code = -1; > > pid_t waiting; > > @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0) > > > > if (waiting < 0) { > > failed_errno = errno; > > - error("waitpid for %s failed: %s", argv0, strerror(errno)); > > + if (!in_signal) > > + error("waitpid for %s failed: %s", argv0, > > + strerror(errno)); > > } else if (waiting != pid) { > > - error("waitpid is confused (%s)", argv0); > > + if (!in_signal) > > + error("waitpid is confused (%s)", argv0); > > } else if (WIFSIGNALED(status)) { > > code = WTERMSIG(status); > > if (code != SIGINT && code != SIGQUIT) > > - error("%s died of signal %d", argv0, code); > > + if (!in_signal) > > + error("%s died of signal %d", argv0, code); > > /* > > * This return value is chosen so that code & 0xff > > * mimics the exit code that a POSIX shell would report for > > @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0) > > failed_errno = ENOENT; > > } > > } else { > > - error("waitpid is confused (%s)", argv0); > > + if (!in_signal) > > + error("waitpid is confused (%s)", argv0); > > } > > > > - clear_child_for_cleanup(pid); > > + if (!in_signal) > > + clear_child_for_cleanup(pid); > > > > errno = failed_errno; > > return code; > > @@ -437,7 +444,7 @@ fail_pipe: > > * At this point we know that fork() succeeded, but execvp() > > * failed. Errors have been reported to our stderr. > > */ > > - wait_or_whine(cmd->pid, cmd->argv[0]); > > + wait_or_whine(cmd->pid, cmd->argv[0], 0); > > failed_errno = errno; > > cmd->pid = -1; > > } > > @@ -536,12 +543,18 @@ fail_pipe: > > > > int finish_command(struct child_process *cmd) > > { > > - int ret = wait_or_whine(cmd->pid, cmd->argv[0]); > > + int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0); > > argv_array_clear(&cmd->args); > > argv_array_clear(&cmd->env_array); > > return ret; > > } > > > > +int finish_command_in_signal(struct child_process *cmd) > > +{ > > + return wait_or_whine(cmd->pid, cmd->argv[0], 1); > > +} > > + > > + > > int run_command(struct child_process *cmd) > > { > > int code; > > @@ -772,7 +785,7 @@ error: > > int finish_async(struct async *async) > > { > > #ifdef NO_PTHREADS > > - return wait_or_whine(async->pid, "child process"); > > + return wait_or_whine(async->pid, "child process", 0); > > #else > > void *ret = (void *)(intptr_t)(-1); > > > > diff --git a/run-command.h b/run-command.h > > index 5b4425a3cbe1..275d35c442ac 100644 > > --- a/run-command.h > > +++ b/run-command.h > > @@ -50,6 +50,7 @@ void child_process_init(struct child_process *); > > > > int start_command(struct child_process *); > > int finish_command(struct child_process *); > > +int finish_command_in_signal(struct child_process *); > > int run_command(struct child_process *); > > > > /* > -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html