On Wed, Mar 22, 2023 at 4:35 PM Christian Brauner <brauner@xxxxxxxxxx> wrote: > > On Tue, Mar 21, 2023 at 07:33:41PM +0100, Alexander Mikhalitsyn wrote: > > Add SO_PEERPIDFD which allows to get pidfd of peer socket holder pidfd. > > This thing is direct analog of SO_PEERCRED which allows to get plain PID. > > > > Cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > > Cc: Eric Dumazet <edumazet@xxxxxxxxxx> > > Cc: Jakub Kicinski <kuba@xxxxxxxxxx> > > Cc: Paolo Abeni <pabeni@xxxxxxxxxx> > > Cc: Leon Romanovsky <leon@xxxxxxxxxx> > > Cc: David Ahern <dsahern@xxxxxxxxxx> > > Cc: Arnd Bergmann <arnd@xxxxxxxx> > > Cc: Kees Cook <keescook@xxxxxxxxxxxx> > > Cc: Christian Brauner <brauner@xxxxxxxxxx> > > Cc: Kuniyuki Iwashima <kuniyu@xxxxxxxxxx> > > Cc: Lennart Poettering <mzxreary@xxxxxxxxxxx> > > Cc: linux-kernel@xxxxxxxxxxxxxxx > > Cc: netdev@xxxxxxxxxxxxxxx > > Cc: linux-arch@xxxxxxxxxxxxxxx > > Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@xxxxxxxxxxxxx> > > --- > > v2: > > According to review comments from Kuniyuki Iwashima and Christian Brauner: > > - use pidfd_create(..) 
retval as a result > > - whitespace change > > --- > > arch/alpha/include/uapi/asm/socket.h | 1 + > > arch/mips/include/uapi/asm/socket.h | 1 + > > arch/parisc/include/uapi/asm/socket.h | 1 + > > arch/sparc/include/uapi/asm/socket.h | 1 + > > include/uapi/asm-generic/socket.h | 1 + > > net/core/sock.c | 21 +++++++++++++++++++++ > > tools/include/uapi/asm-generic/socket.h | 1 + > > 7 files changed, 27 insertions(+) > > > > diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h > > index ff310613ae64..e94f621903fe 100644 > > --- a/arch/alpha/include/uapi/asm/socket.h > > +++ b/arch/alpha/include/uapi/asm/socket.h > > @@ -138,6 +138,7 @@ > > #define SO_RCVMARK 75 > > > > #define SO_PASSPIDFD 76 > > +#define SO_PEERPIDFD 77 > > > > #if !defined(__KERNEL__) > > > > diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h > > index 762dcb80e4ec..60ebaed28a4c 100644 > > --- a/arch/mips/include/uapi/asm/socket.h > > +++ b/arch/mips/include/uapi/asm/socket.h > > @@ -149,6 +149,7 @@ > > #define SO_RCVMARK 75 > > > > #define SO_PASSPIDFD 76 > > +#define SO_PEERPIDFD 77 > > > > #if !defined(__KERNEL__) > > > > diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h > > index df16a3e16d64..be264c2b1a11 100644 > > --- a/arch/parisc/include/uapi/asm/socket.h > > +++ b/arch/parisc/include/uapi/asm/socket.h > > @@ -130,6 +130,7 @@ > > #define SO_RCVMARK 0x4049 > > > > #define SO_PASSPIDFD 0x404A > > +#define SO_PEERPIDFD 0x404B > > > > #if !defined(__KERNEL__) > > > > diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h > > index 6e2847804fea..682da3714686 100644 > > --- a/arch/sparc/include/uapi/asm/socket.h > > +++ b/arch/sparc/include/uapi/asm/socket.h > > @@ -131,6 +131,7 @@ > > #define SO_RCVMARK 0x0054 > > > > #define SO_PASSPIDFD 0x0055 > > +#define SO_PEERPIDFD 0x0056 > > > > #if !defined(__KERNEL__) > > > > diff --git 
a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h > > index b76169fdb80b..8ce8a39a1e5f 100644 > > --- a/include/uapi/asm-generic/socket.h > > +++ b/include/uapi/asm-generic/socket.h > > @@ -133,6 +133,7 @@ > > #define SO_RCVMARK 75 > > > > #define SO_PASSPIDFD 76 > > +#define SO_PEERPIDFD 77 > > > > #if !defined(__KERNEL__) > > > > diff --git a/net/core/sock.c b/net/core/sock.c > > index 3f974246ba3e..85c269ca9d8a 100644 > > --- a/net/core/sock.c > > +++ b/net/core/sock.c > > @@ -1763,6 +1763,27 @@ int sk_getsockopt(struct sock *sk, int level, int optname, > > goto lenout; > > } > > > > + case SO_PEERPIDFD: > > + { > > + struct pid *peer_pid; > > + int pidfd; > > + > > + if (len > sizeof(pidfd)) > > + len = sizeof(pidfd); > > + > > + spin_lock(&sk->sk_peer_lock); > > + peer_pid = get_pid(sk->sk_peer_pid); > > + spin_unlock(&sk->sk_peer_lock); > > + > > + pidfd = pidfd_create(peer_pid, 0); > > + > > + put_pid(peer_pid); > > + > > + if (copy_to_sockptr(optval, &pidfd, len)) > > + return -EFAULT; > > This leaks the pidfd. We could do: > > if (copy_to_sockptr(optval, &pidfd, len)) { > close_fd(pidfd); > return -EFAULT; > } Ah, my bad. Thanks for pointing this out! > > but it's a nasty anti-pattern to install the fd in the caller's fdtable > and then close it again. So let's avoid it if we can. Since you can only > set one socket option per setsockopt() syscall we should be able to > reserve an fd and pidfd_file, do the stuff that might fail, and then > call fd_install. So that would roughly be: > > peer_pid = get_pid(sk->sk_peer_pid); > pidfd_file = pidfd_file_create(peer_pid, 0, &pidfd); > if (copy_to_sockptr(optval, &pidfd, len)) > return -EFAULT; > goto lenout; > > . > . > . > > lenout: > if (copy_to_sockptr(optlen, &len, sizeof(int))) > return -EFAULT; > > // Made it safely, install pidfd now. 
> fd_install(pidfd, pidfd_file) > > (See below for the associated api I'm going to publish independent of > this as kernel/fork.c and fanotify both could use it.) > > But now, let's look at net/socket.c there's another wrinkle. So let's say you > have successfully installed the pidfd then it seems you can still fail later: > > if (level == SOL_SOCKET) > err = sock_getsockopt(sock, level, optname, optval, optlen); > else if (unlikely(!sock->ops->getsockopt)) > err = -EOPNOTSUPP; > else > err = sock->ops->getsockopt(sock, level, optname, optval, > optlen); > > if (!in_compat_syscall()) > err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, > optval, optlen, max_optlen, > err); > > out_put: > fput_light(sock->file, fput_needed); > return err; > > If the bpf hook returns an error we've placed an fd into the caller's sockopt > buffer without their knowledge. Yes, so we need to postpone fd_install to the end of __sys_getsockopt. I'll think about that. > > From 4fee16f0920308bee2531fd3b08484f607eb5830 Mon Sep 17 00:00:00 2001 > From: Christian Brauner <brauner@xxxxxxxxxx> > Date: Wed, 22 Mar 2023 15:59:02 +0100 > Subject: [PATCH 1/3] [HERE BE DRAGONS - DRAFT - __UNTESTED__] pid: add > pidfd_file_create() > > Reserve an fd and pidfile, do stuff that might fail, install fd at the > point of no return. 
> > [HERE BE DRAGONS - DRAFT - __UNTESTED__] pid: add pidfd_file_create() > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > include/linux/pid.h | 1 + > kernel/pid.c | 45 +++++++++++++++++++++++++++++++++------------ > 2 files changed, 34 insertions(+), 12 deletions(-) > > diff --git a/include/linux/pid.h b/include/linux/pid.h > index 343abf22092e..c486dbc4d7b6 100644 > --- a/include/linux/pid.h > +++ b/include/linux/pid.h > @@ -80,6 +80,7 @@ extern struct pid *pidfd_pid(const struct file *file); > struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); > struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); > int pidfd_create(struct pid *pid, unsigned int flags); > +struct file *pidfd_file_create(struct pid *pid, unsigned int flags, int *pidfd); > > static inline struct pid *get_pid(struct pid *pid) > { > diff --git a/kernel/pid.c b/kernel/pid.c > index 3fbc5e46b721..8d0924f1dbf6 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -576,6 +576,32 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) > return task; > } > > +struct file *pidfd_file_create(struct pid *pid, unsigned int flags, int *pidfd) > +{ > + int fd; > + struct file *pidfile; > + > + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) > + return ERR_PTR(-EINVAL); > + > + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) > + return ERR_PTR(-EINVAL); > + > + fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); > + if (fd < 0) > + return ERR_PTR(fd); > + > + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, > + flags | O_RDWR | O_CLOEXEC); > + if (IS_ERR(pidfile)) { > + put_unused_fd(fd); > + return pidfile; > + } > + get_pid(pid); /* held by pidfile now */ > + *pidfd = fd; > + return pidfile; > +} > + > /** > * pidfd_create() - Create a new pid file descriptor. 
> * > @@ -594,20 +620,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) > */ > int pidfd_create(struct pid *pid, unsigned int flags) > { > - int fd; > + int pidfd; > + struct file *pidfile; > > - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) > - return -EINVAL; > + pidfile = pidfd_file_create(pid, flags, &pidfd); > + if (IS_ERR(pidfile)) > + return PTR_ERR(pidfile); > > - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) > - return -EINVAL; > - > - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), > - flags | O_RDWR | O_CLOEXEC); > - if (fd < 0) > - put_pid(pid); > - > - return fd; > + fd_install(pidfd, pidfile); > + return pidfd; > } > > /** > -- > 2.34.1 > > From c336f1c6cc39faa5aef4fbedd3c4f8eca51d8436 Mon Sep 17 00:00:00 2001 > From: Christian Brauner <brauner@xxxxxxxxxx> > Date: Wed, 22 Mar 2023 15:59:54 +0100 > Subject: [PATCH 2/3] [HERE BE DRAGONS - DRAFT - __UNTESTED__] fork: use > pidfd_file_create() > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > kernel/fork.c | 11 +---------- > 1 file changed, 1 insertion(+), 10 deletions(-) > > diff --git a/kernel/fork.c b/kernel/fork.c > index f68954d05e89..c8dc78ee0a74 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -2296,20 +2296,11 @@ static __latent_entropy struct task_struct *copy_process( > * if the fd table isn't shared). 
> */ > if (clone_flags & CLONE_PIDFD) { > - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); > - if (retval < 0) > - goto bad_fork_free_pid; > - > - pidfd = retval; > - > - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, > - O_RDWR | O_CLOEXEC); > + pidfile = pidfd_file_create(pid, O_RDWR | O_CLOEXEC, &pidfd); > if (IS_ERR(pidfile)) { > - put_unused_fd(pidfd); > retval = PTR_ERR(pidfile); > goto bad_fork_free_pid; > } > - get_pid(pid); /* held by pidfile now */ > > retval = put_user(pidfd, args->pidfd); > if (retval) > -- > 2.34.1 > > From 0897f68fe06a8777d8ec600fdc719143f76095b1 Mon Sep 17 00:00:00 2001 > From: Christian Brauner <brauner@xxxxxxxxxx> > Date: Wed, 22 Mar 2023 16:02:50 +0100 > Subject: [PATCH 3/3] [HERE BE DRAGONS - DRAFT - __UNTESTED__] fanotify: use > pidfd_file_create() > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > fs/notify/fanotify/fanotify_user.c | 15 +++++++++++---- > 1 file changed, 11 insertions(+), 4 deletions(-) > > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > index 8f430bfad487..4a8db6b5f690 100644 > --- a/fs/notify/fanotify/fanotify_user.c > +++ b/fs/notify/fanotify/fanotify_user.c > @@ -665,6 +665,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; > struct file *f = NULL; > int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; > + struct file *pidfd_file = NULL; > > pr_debug("%s: group=%p event=%p\n", __func__, group, event); > > @@ -718,9 +719,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > !pid_has_task(event->pid, PIDTYPE_TGID)) { > pidfd = FAN_NOPIDFD; > } else { > - pidfd = pidfd_create(event->pid, 0); > - if (pidfd < 0) > + pidfd_file = pidfd_file_create(event->pid, 0, &pidfd); > + if (IS_ERR(pidfd_file)) { > pidfd = FAN_EPIDFD; > + pidfd_file = NULL; > + } > } > } > > @@ -750,6 +753,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > > if (f) > 
fd_install(fd, f); > + if (pidfd_file) > + fd_install(pidfd, pidfd_file); > > return metadata.event_len; > > @@ -759,8 +764,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > fput(f); > } > > - if (pidfd >= 0) > - close_fd(pidfd); > + if (pidfd >= 0) { > + put_unused_fd(pidfd); > + fput(pidfd_file); > + } > > return ret; > } > -- > 2.34.1 >