Re: glibc mutex deadlock in signal handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Takashi Iwai <tiwai@xxxxxxx> writes:

> we've got a bug report of git-log stall in below:
>   https://bugzilla.opensuse.org/show_bug.cgi?id=942297
>
> In short, git-log is left unterminated sometimes when pager is
> aborted.  When this happens, git process can't be killed by SIGTERM,
> but only by SIGKILL.  And, further investigation revealed the possible
> mutex deadlock used in glibc *alloc()/free().
>
> I tried to reproduce this by adding a fault injection in glibc code,
> and actually got the similar stack trace as reported.  The problem is
> that glibc malloc (in this case realloc() and free()) takes a private
> mutex.  Thus calling any function does *alloc() or free() in a signal
> handler may deadlock.  In the case of git, it was free() calls in
> various cleanup codes and the printf() / strerror() invocations.
>
> Also, basically it's not safe to call fflush() or raise(), either.
> But they seem to work practically on the current systems.
>
> Below is a band-aid patch I tested and confirmed to work in the
> fault-injection case.  But, some unsafe calls mentioned in the above
> are left.  If we want to be in a safer side, we should really limit
> the things to do in a signal handler, e.g. only calling close() and
> doing waitpid(), I suppose.
>
> Any better ideas?

Sorry, but I am not with better ideas (at least offhand right now).

Essentially the idea seems to be to avoid calling allocation-related
functions in the signal handler codepath that is used for cleaning
things up.  Not calling free() in the codepaths is perfectly fine as
we know we will be soon exiting anyway.  Not being able to call
error() and strerror() to report funnies (other than the fact that
we were interrupted) is somewhat sad, though.




> diff --git a/pager.c b/pager.c
> index 27d4c8a17aa1..57dda0142fa9 100644
> --- a/pager.c
> +++ b/pager.c
> @@ -14,19 +14,25 @@
>  static const char *pager_argv[] = { NULL, NULL };
>  static struct child_process pager_process = CHILD_PROCESS_INIT;
>  
> -static void wait_for_pager(void)
> +static void flush_pager(void)
>  {
>  	fflush(stdout);
>  	fflush(stderr);
>  	/* signal EOF to pager */
>  	close(1);
>  	close(2);
> +}
> +
> +static void wait_for_pager(void)
> +{
> +	flush_pager();
>  	finish_command(&pager_process);
>  }
>  
>  static void wait_for_pager_signal(int signo)
>  {
> -	wait_for_pager();
> +	flush_pager();
> +	finish_command_in_signal(&pager_process);
>  	sigchain_pop(signo);
>  	raise(signo);
>  }
> diff --git a/run-command.c b/run-command.c
> index 3277cf797ed4..a8cdc8f32944 100644
> --- a/run-command.c
> +++ b/run-command.c
> @@ -18,26 +18,27 @@ struct child_to_clean {
>  static struct child_to_clean *children_to_clean;
>  static int installed_child_cleanup_handler;
>  
> -static void cleanup_children(int sig)
> +static void cleanup_children(int sig, int in_signal)
>  {
>  	while (children_to_clean) {
>  		struct child_to_clean *p = children_to_clean;
>  		children_to_clean = p->next;
>  		kill(p->pid, sig);
> -		free(p);
> +		if (!in_signal)
> +			free(p);
>  	}
>  }
>  
>  static void cleanup_children_on_signal(int sig)
>  {
> -	cleanup_children(sig);
> +	cleanup_children(sig, 1);
>  	sigchain_pop(sig);
>  	raise(sig);
>  }
>  
>  static void cleanup_children_on_exit(void)
>  {
> -	cleanup_children(SIGTERM);
> +	cleanup_children(SIGTERM, 0);
>  }
>  
>  static void mark_child_for_cleanup(pid_t pid)
> @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
>  		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
>  }
>  
> -static int wait_or_whine(pid_t pid, const char *argv0)
> +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
>  {
>  	int status, code = -1;
>  	pid_t waiting;
> @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
>  
>  	if (waiting < 0) {
>  		failed_errno = errno;
> -		error("waitpid for %s failed: %s", argv0, strerror(errno));
> +		if (!in_signal)
> +			error("waitpid for %s failed: %s", argv0,
> +			      strerror(errno));
>  	} else if (waiting != pid) {
> -		error("waitpid is confused (%s)", argv0);
> +		if (!in_signal)
> +			error("waitpid is confused (%s)", argv0);
>  	} else if (WIFSIGNALED(status)) {
>  		code = WTERMSIG(status);
>  		if (code != SIGINT && code != SIGQUIT)
> -			error("%s died of signal %d", argv0, code);
> +			if (!in_signal)
> +				error("%s died of signal %d", argv0, code);
>  		/*
>  		 * This return value is chosen so that code & 0xff
>  		 * mimics the exit code that a POSIX shell would report for
> @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
>  			failed_errno = ENOENT;
>  		}
>  	} else {
> -		error("waitpid is confused (%s)", argv0);
> +		if (!in_signal)
> +			error("waitpid is confused (%s)", argv0);
>  	}
>  
> -	clear_child_for_cleanup(pid);
> +	if (!in_signal)
> +		clear_child_for_cleanup(pid);
>  
>  	errno = failed_errno;
>  	return code;
> @@ -437,7 +444,7 @@ fail_pipe:
>  		 * At this point we know that fork() succeeded, but execvp()
>  		 * failed. Errors have been reported to our stderr.
>  		 */
> -		wait_or_whine(cmd->pid, cmd->argv[0]);
> +		wait_or_whine(cmd->pid, cmd->argv[0], 0);
>  		failed_errno = errno;
>  		cmd->pid = -1;
>  	}
> @@ -536,12 +543,18 @@ fail_pipe:
>  
>  int finish_command(struct child_process *cmd)
>  {
> -	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
> +	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
>  	argv_array_clear(&cmd->args);
>  	argv_array_clear(&cmd->env_array);
>  	return ret;
>  }
>  
> +int finish_command_in_signal(struct child_process *cmd)
> +{
> +	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
> +}
> +
> +
>  int run_command(struct child_process *cmd)
>  {
>  	int code;
> @@ -772,7 +785,7 @@ error:
>  int finish_async(struct async *async)
>  {
>  #ifdef NO_PTHREADS
> -	return wait_or_whine(async->pid, "child process");
> +	return wait_or_whine(async->pid, "child process", 0);
>  #else
>  	void *ret = (void *)(intptr_t)(-1);
>  
> diff --git a/run-command.h b/run-command.h
> index 5b4425a3cbe1..275d35c442ac 100644
> --- a/run-command.h
> +++ b/run-command.h
> @@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
>  
>  int start_command(struct child_process *);
>  int finish_command(struct child_process *);
> +int finish_command_in_signal(struct child_process *);
>  int run_command(struct child_process *);
>  
>  /*
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]