> Please see the incomplete/untested patch below. > > - The change in exit_notify() is sub-optimal, we can do better > to avoid 2 do_notify_pidfd() calls from exit_notify(). But > so far this is only for discussion, let's keep it simple. > > - __pidfd_prepare() needs some minor cleanups regardless of > this change, I'll send the patch... > > What do you think? > > And why is thread_group_exited() exported? > > Oleg. > > diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h > index 5406fbc13074..2e6461459877 100644 > --- a/include/uapi/linux/pidfd.h > +++ b/include/uapi/linux/pidfd.h > @@ -7,6 +7,7 @@ > #include <linux/fcntl.h> > > /* Flags for pidfd_open(). */ > -#define PIDFD_NONBLOCK O_NONBLOCK > +#define PIDFD_NONBLOCK O_NONBLOCK > +#define PIDFD_THREAD O_EXCL // or anything else not used by anon_inode's I like it! The only request I would have is to not alias O_EXCL and PIDFD_THREAD, because it doesn't map as clearly as NONBLOCK did. > > #endif /* _UAPI_LINUX_PIDFD_H */ > diff --git a/kernel/exit.c b/kernel/exit.c > index dfb963d2f862..9f8526b7d717 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -752,6 +752,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) > autoreap = true; > } > > + /* unnecessary if do_notify_parent() was already called, > + we can do better */ > + do_notify_pidfd(tsk); > + > if (autoreap) { > tsk->exit_state = EXIT_DEAD; > list_add(&tsk->ptrace_entry, &dead); > diff --git a/kernel/fork.c b/kernel/fork.c > index c981fa6171c1..38f2c7423fb4 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -101,6 +101,7 @@ > #include <linux/user_events.h> > #include <linux/iommu.h> > #include <linux/rseq.h> > +#include <uapi/linux/pidfd.h> > > #include <asm/pgalloc.h> > #include <linux/uaccess.h> > @@ -2068,12 +2069,27 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) > } > #endif > > +static bool xxx_exited(struct pid *pid, int excl) > +{ > + struct task_struct *task; > + bool exited; > + > + 
rcu_read_lock(); > + task = pid_task(pid, PIDTYPE_PID); > + exited = !task || > + (READ_ONCE(task->exit_state) && (excl || thread_group_empty(task))); > + rcu_read_unlock(); > + > + return exited; > +} > + > /* > * Poll support for process exit notification. > */ > static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) > { > struct pid *pid = file->private_data; > + int excl = file->f_flags & PIDFD_THREAD; > __poll_t poll_flags = 0; > > poll_wait(file, &pid->wait_pidfd, pts); > @@ -2083,7 +2099,7 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) > * If the thread group leader exits before all other threads in the > * group, then poll(2) should block, similar to the wait(2) family. > */ > - if (thread_group_exited(pid)) > + if (xxx_exited(pid, excl)) > poll_flags = EPOLLIN | EPOLLRDNORM; > > return poll_flags; > @@ -2129,7 +2145,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re > { > int pidfd; > struct file *pidfd_file; > + unsigned excl = flags & PIDFD_THREAD; > > + flags &= ~PIDFD_THREAD; > if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) > return -EINVAL; > > @@ -2144,6 +2162,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re > return PTR_ERR(pidfd_file); > } > get_pid(pid); /* held by pidfd_file now */ > + pidfd_file->f_flags |= excl; > *ret = pidfd_file; > return pidfd; > } > @@ -2176,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re > */ > int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) > { > - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) > + unsigned excl = flags & PIDFD_THREAD; > + > + if (!pid || !pid_has_task(pid, excl ? 
PIDTYPE_PID : PIDTYPE_TGID)) > return -EINVAL; > > return __pidfd_prepare(pid, flags, ret); > diff --git a/kernel/pid.c b/kernel/pid.c > index b52b10865454..5257197f9493 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -629,7 +629,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) > int fd; > struct pid *p; > > - if (flags & ~PIDFD_NONBLOCK) > + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) > return -EINVAL; > > if (pid <= 0) >