Re: glibc mutex deadlock in signal handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 03 Sep 2015 20:12:38 +0200,
Junio C Hamano wrote:
> 
> Takashi Iwai <tiwai@xxxxxxx> writes:
> 
> > we've got a bug report of git-log stall in below:
> >   https://bugzilla.opensuse.org/show_bug.cgi?id=942297
> >
> > In short, git-log is left unterminated sometimes when pager is
> > aborted.  When this happens, git process can't be killed by SIGTERM,
> > but only by SIGKILL.  And, further investigation revealed the possible
> > mutex deadlock used in glibc *alloc()/free().
> >
> > I tried to reproduce this by adding a fault injection in glibc code,
> > and actually got the similar stack trace as reported.  The problem is
> > that glibc malloc (in this case realloc() and free()) takes a private
> > mutex.  Thus calling any function does *alloc() or free() in a signal
> > handler may deadlock.  In the case of git, it was free() calls in
> > various cleanup codes and the printf() / strerror() invocations.
> >
> > Also, basically it's not safe to call fflush() or raise(), either.
> > But they seem to work practically on the current systems.
> >
> > Below is a band-aid patch I tested and confirmed to work in the
> > fault-injection case.  But, some unsafe calls mentioned in the above
> > are left.  If we want to be in a safer side, we should really limit
> > the things to do in a signal handler, e.g. only calling close() and
> > doing waitpid(), I suppose.
> >
> > Any better ideas?
> 
> Sorry, but I am not with better ideas (at least offhand right now).
> 
> Essentially the idea seems to be to avoid calling allocation-related
> functions in the signal handler codepath that is used for cleaning
> things up.  Not calling free() in the codepaths is perfectly fine as
> we know we will be soon exiting anyway.  Not being able to call
> error() and strerror() to report funnies (other than the fact that
> we were interrupted) is somewhat sad, though.

Reading signal(7) again, I correct some of my statement: raise() is
safe to use.  But fflush() still isn't.  Maybe fflush() should be
avoided but just call only close().

Regarding the error message: write() is still safe to use, so it is
possible in some level.  strerror() seems invoking malloc(), judging
from the stack trace in bugzilla, so this needs to be avoided.
printf() is a blackbox, so it's hard to tell.  That is, in the worst
case, we can just call write() directly for serious errors, if any.


Takashi

> 
> 
> 
> 
> > diff --git a/pager.c b/pager.c
> > index 27d4c8a17aa1..57dda0142fa9 100644
> > --- a/pager.c
> > +++ b/pager.c
> > @@ -14,19 +14,25 @@
> >  static const char *pager_argv[] = { NULL, NULL };
> >  static struct child_process pager_process = CHILD_PROCESS_INIT;
> >  
> > -static void wait_for_pager(void)
> > +static void flush_pager(void)
> >  {
> >  	fflush(stdout);
> >  	fflush(stderr);
> >  	/* signal EOF to pager */
> >  	close(1);
> >  	close(2);
> > +}
> > +
> > +static void wait_for_pager(void)
> > +{
> > +	flush_pager();
> >  	finish_command(&pager_process);
> >  }
> >  
> >  static void wait_for_pager_signal(int signo)
> >  {
> > -	wait_for_pager();
> > +	flush_pager();
> > +	finish_command_in_signal(&pager_process);
> >  	sigchain_pop(signo);
> >  	raise(signo);
> >  }
> > diff --git a/run-command.c b/run-command.c
> > index 3277cf797ed4..a8cdc8f32944 100644
> > --- a/run-command.c
> > +++ b/run-command.c
> > @@ -18,26 +18,27 @@ struct child_to_clean {
> >  static struct child_to_clean *children_to_clean;
> >  static int installed_child_cleanup_handler;
> >  
> > -static void cleanup_children(int sig)
> > +static void cleanup_children(int sig, int in_signal)
> >  {
> >  	while (children_to_clean) {
> >  		struct child_to_clean *p = children_to_clean;
> >  		children_to_clean = p->next;
> >  		kill(p->pid, sig);
> > -		free(p);
> > +		if (!in_signal)
> > +			free(p);
> >  	}
> >  }
> >  
> >  static void cleanup_children_on_signal(int sig)
> >  {
> > -	cleanup_children(sig);
> > +	cleanup_children(sig, 1);
> >  	sigchain_pop(sig);
> >  	raise(sig);
> >  }
> >  
> >  static void cleanup_children_on_exit(void)
> >  {
> > -	cleanup_children(SIGTERM);
> > +	cleanup_children(SIGTERM, 0);
> >  }
> >  
> >  static void mark_child_for_cleanup(pid_t pid)
> > @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
> >  		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
> >  }
> >  
> > -static int wait_or_whine(pid_t pid, const char *argv0)
> > +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
> >  {
> >  	int status, code = -1;
> >  	pid_t waiting;
> > @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
> >  
> >  	if (waiting < 0) {
> >  		failed_errno = errno;
> > -		error("waitpid for %s failed: %s", argv0, strerror(errno));
> > +		if (!in_signal)
> > +			error("waitpid for %s failed: %s", argv0,
> > +			      strerror(errno));
> >  	} else if (waiting != pid) {
> > -		error("waitpid is confused (%s)", argv0);
> > +		if (!in_signal)
> > +			error("waitpid is confused (%s)", argv0);
> >  	} else if (WIFSIGNALED(status)) {
> >  		code = WTERMSIG(status);
> >  		if (code != SIGINT && code != SIGQUIT)
> > -			error("%s died of signal %d", argv0, code);
> > +			if (!in_signal)
> > +				error("%s died of signal %d", argv0, code);
> >  		/*
> >  		 * This return value is chosen so that code & 0xff
> >  		 * mimics the exit code that a POSIX shell would report for
> > @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
> >  			failed_errno = ENOENT;
> >  		}
> >  	} else {
> > -		error("waitpid is confused (%s)", argv0);
> > +		if (!in_signal)
> > +			error("waitpid is confused (%s)", argv0);
> >  	}
> >  
> > -	clear_child_for_cleanup(pid);
> > +	if (!in_signal)
> > +		clear_child_for_cleanup(pid);
> >  
> >  	errno = failed_errno;
> >  	return code;
> > @@ -437,7 +444,7 @@ fail_pipe:
> >  		 * At this point we know that fork() succeeded, but execvp()
> >  		 * failed. Errors have been reported to our stderr.
> >  		 */
> > -		wait_or_whine(cmd->pid, cmd->argv[0]);
> > +		wait_or_whine(cmd->pid, cmd->argv[0], 0);
> >  		failed_errno = errno;
> >  		cmd->pid = -1;
> >  	}
> > @@ -536,12 +543,18 @@ fail_pipe:
> >  
> >  int finish_command(struct child_process *cmd)
> >  {
> > -	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
> > +	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
> >  	argv_array_clear(&cmd->args);
> >  	argv_array_clear(&cmd->env_array);
> >  	return ret;
> >  }
> >  
> > +int finish_command_in_signal(struct child_process *cmd)
> > +{
> > +	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
> > +}
> > +
> > +
> >  int run_command(struct child_process *cmd)
> >  {
> >  	int code;
> > @@ -772,7 +785,7 @@ error:
> >  int finish_async(struct async *async)
> >  {
> >  #ifdef NO_PTHREADS
> > -	return wait_or_whine(async->pid, "child process");
> > +	return wait_or_whine(async->pid, "child process", 0);
> >  #else
> >  	void *ret = (void *)(intptr_t)(-1);
> >  
> > diff --git a/run-command.h b/run-command.h
> > index 5b4425a3cbe1..275d35c442ac 100644
> > --- a/run-command.h
> > +++ b/run-command.h
> > @@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
> >  
> >  int start_command(struct child_process *);
> >  int finish_command(struct child_process *);
> > +int finish_command_in_signal(struct child_process *);
> >  int run_command(struct child_process *);
> >  
> >  /*
> 
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]