Re: [PATCH RFC 1/2] Add polling support to pidfd

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On April 11, 2019 10:00:59 PM GMT+02:00, Joel Fernandes <joel@xxxxxxxxxxxxxxxxx> wrote:
>On Thu, Apr 11, 2019 at 01:50:42PM -0400, Joel Fernandes (Google)
>wrote:
>> pidfd are /proc/pid directory file descriptors referring to a task
>group
>> leader. Android low memory killer (LMK) needs pidfd polling support
>to
>> replace code that currently checks for existence of /proc/pid for
>> knowing a process that is signalled to be killed has died, which is
>both
>> racy and slow. The pidfd poll approach is race-free, and also allows
>the
>> LMK to do other things (such as by polling on other fds) while
>awaiting
>> the process being killed to die.
>
>It appears to me that the "pidfd" now will be an anon inode fd, and not
>based
>on /proc/, based on discussions with Linus. So I'll rework the patches
>accordingly. However that is relatively independent of this patch so
>this
>version can also be reviewed before I send out the reworked version.

Thank you very much, Joel.
I'm off this week and traveling but I'll try to give it a look asap.

Christian

>
>thanks,
>
> - Joel
>
>> 
>> It prevents a situation where a PID is reused between when LMK sends
>a
>> kill signal and checks for existence of the PID, since the wrong PID
>is
>> now possibly checked for existence.
>> 
>> In this patch, we follow the same mechanism used when the parent of
>the
>> task group is to be notified, that is when the tasks waiting on a
>poll
>> of pidfd are also awakened.
>> 
>> We have decided to include the waitqueue in struct pid for the
>following
>> reasons:
>> 1. The wait queue has to survive for the lifetime of the poll.
>Including
>> it in task_struct would not be an option in this case because the task
>can
>> be reaped and destroyed before the poll returns.
>> 
>> 2. Including the waitqueue in struct pid means that during
>> de_exec, the thread doing de_thread() automatically gets the new
>> waitqueue/pid even though its task_struct is different.
>> 
>> Appropriate test cases are added in the second patch to provide
>coverage
>> of all the cases the patch is handling.
>> 
>> Andy had a similar patch [1] in the past which was a good reference
>> however this patch tries to handle different situations properly
>related
>> to thread group existence, and how/where it notifies. And also solves
>> other bugs (existence of task_struct).  Daniel had a similar patch
>[2]
>> recently which this patch supersedes.
>> 
>> [1] https://lore.kernel.org/patchwork/patch/345098/
>> [2]
>https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@xxxxxxxxxx/
>> 
>> Cc: luto@xxxxxxxxxxxxxx
>> Cc: rostedt@xxxxxxxxxxx
>> Cc: dancol@xxxxxxxxxx
>> Cc: christian@xxxxxxxxxx
>> Cc: jannh@xxxxxxxxxx
>> Cc: surenb@xxxxxxxxxx
>> Cc: torvalds@xxxxxxxxxxxxxxxxxxxx
>> Co-developed-by: Daniel Colascione <dancol@xxxxxxxxxx>
>> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
>> 
>> ---
>>  fs/proc/base.c      | 39 +++++++++++++++++++++++++++++++++++++++
>>  include/linux/pid.h |  3 +++
>>  kernel/exit.c       |  1 -
>>  kernel/pid.c        |  2 ++
>>  kernel/signal.c     | 14 ++++++++++++++
>>  5 files changed, 58 insertions(+), 1 deletion(-)
>> 
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 6a803a0b75df..879900082647 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file
>*file, struct dir_context *ctx)
>>  				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
>>  }
>>  
>> +static unsigned int proc_tgid_base_poll(struct file *file, struct
>poll_table_struct *pts)
>> +{
>> +	int poll_flags = 0;
>> +	struct task_struct *task;
>> +	struct pid *pid;
>> +
>> +	task = get_proc_task(file->f_path.dentry->d_inode);
>> +
>> +	WARN_ON_ONCE(task && !thread_group_leader(task));
>> +
>> +	/*
>> +	 * tasklist_lock must be held to avoid racing with
>> +	 * changes in exit_state and wake up. Basically to avoid:
>> +	 *
>> +	 * P0: read exit_state = 0
>> +	 * P1: write exit_state = EXIT_DEAD
>> +	 * P1: Do a wake up - wq is empty, so do nothing
>> +	 * P0: Queue for polling - wait forever.
>> +	 */
>> +	read_lock(&tasklist_lock);
>> +	if (!task)
>> +		poll_flags = POLLIN | POLLRDNORM | POLLERR;
>> +	else if (task->exit_state == EXIT_DEAD)
>> +		poll_flags = POLLIN | POLLRDNORM;
>> +	else if (task->exit_state == EXIT_ZOMBIE &&
>thread_group_empty(task))
>> +		poll_flags = POLLIN | POLLRDNORM;
>> +
>> +	if (!poll_flags) {
>> +		pid = proc_pid(file->f_path.dentry->d_inode);
>> +		poll_wait(file, &pid->wait_pidfd, pts);
>> +	}
>> +	read_unlock(&tasklist_lock);
>> +
>> +	if (task)
>> +		put_task_struct(task);
>> +	return poll_flags;
>> +}
>> +
>>  static const struct file_operations proc_tgid_base_operations = {
>>  	.read		= generic_read_dir,
>> +	.poll		= proc_tgid_base_poll,
>>  	.iterate_shared	= proc_tgid_base_readdir,
>>  	.llseek		= generic_file_llseek,
>>  };
>> diff --git a/include/linux/pid.h b/include/linux/pid.h
>> index b6f4ba16065a..2e0dcbc6d14e 100644
>> --- a/include/linux/pid.h
>> +++ b/include/linux/pid.h
>> @@ -3,6 +3,7 @@
>>  #define _LINUX_PID_H
>>  
>>  #include <linux/rculist.h>
>> +#include <linux/wait.h>
>>  
>>  enum pid_type
>>  {
>> @@ -60,6 +61,8 @@ struct pid
>>  	unsigned int level;
>>  	/* lists of tasks that use this pid */
>>  	struct hlist_head tasks[PIDTYPE_MAX];
>> +	/* wait queue for pidfd pollers */
>> +	wait_queue_head_t wait_pidfd;
>>  	struct rcu_head rcu;
>>  	struct upid numbers[1];
>>  };
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index 2166c2d92ddc..c386ec52687d 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct
>rcu_head *rhp)
>>  	put_task_struct(tsk);
>>  }
>>  
>> -
>>  void release_task(struct task_struct *p)
>>  {
>>  	struct task_struct *leader;
>> diff --git a/kernel/pid.c b/kernel/pid.c
>> index 20881598bdfa..5c90c239242f 100644
>> --- a/kernel/pid.c
>> +++ b/kernel/pid.c
>> @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>>  	for (type = 0; type < PIDTYPE_MAX; ++type)
>>  		INIT_HLIST_HEAD(&pid->tasks[type]);
>>  
>> +	init_waitqueue_head(&pid->wait_pidfd);
>> +
>>  	upid = pid->numbers + ns->level;
>>  	spin_lock_irq(&pidmap_lock);
>>  	if (!(ns->pid_allocated & PIDNS_ADDING))
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index f98448cf2def..e3781703ef7e 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct
>pid *pid, enum pid_type type)
>>  	return ret;
>>  }
>>  
>> +static void do_wakeup_pidfd_pollers(struct task_struct *task)
>> +{
>> +	struct pid *pid;
>> +
>> +	lockdep_assert_held(&tasklist_lock);
>> +
>> +	pid = get_task_pid(task, PIDTYPE_PID);
>> +	wake_up_all(&pid->wait_pidfd);
>> +	put_pid(pid);
>> +}
>> +
>>  /*
>>   * Let a parent know about the death of a child.
>>   * For a stopped/continued status change, use
>do_notify_parent_cldstop instead.
>> @@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk,
>int sig)
>>  	BUG_ON(!tsk->ptrace &&
>>  	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
>>  
>> +	/* Wake up all pidfd waiters */
>> +	do_wakeup_pidfd_pollers(tsk);
>> +
>>  	if (sig != SIGCHLD) {
>>  		/*
>>  		 * This is only possible if parent == real_parent.
>> -- 
>> 2.21.0.392.gf8f6787159e-goog
>> 





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux