This is an RFC for the implementation of pidfds as /proc/<pid> file descriptors. They can be retrieved through the clone() with the addition of the CLONE_PIDFD flag. The tricky part here is that we need to retrieve a file descriptor for /proc/<pid> before clone's point of no return. Otherwise, we need to fail the creation of a process that has already passed all barriers and is visible in userspace. Getting that file descriptor then becomes a rather intricate dance including allocating a detached dentry that we need to commit once attach_pid() has been called. Note that this RFC only includes the logic we think is needed to return /proc/<pid> file descriptors from clone. It does *not* yet include the even more complex logic needed to restrict procfs itself. And the additional logic needed to prevent attacks such as openat(pidfd, "..", ...) and access to /proc/<pid>/net/. There are a couple of reasons why we stopped short of this and decided to sent out an RFC first. - Even the initial part of getting file descriptors from /proc/<pid> out out clone() required rather complex code that struck us as very inelegant and heavy (which granted, might partially caused by not seeing a cleaner way to implement this). Thus, it felt like we needed to see whether this is even remotely considered acceptable. - While discussion further aspects of this approach with Al we received rather substantiated opposition to exposing even more codepaths to procfs. - Restricting access to procfs properly requires a lot of invasive work even touching core vfs functions such as follow_dotdot()/follow_dotdot_rcu() which also caused 2. Jann and I are providing a second RFC alongside this one that shows an alternative and rather much simpler approach we think might be preferable. Signed-off-by: Christian Brauner <christian@xxxxxxxxxx> Signed-off-by: Jann Horn <jann@xxxxxxxxx> Cc: Arnd Bergmann <arnd@xxxxxxxx> Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> Cc: Kees Cook <keescook@xxxxxxxxxxxx> Cc: Alexey Dobriyan <adobriyan@xxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: David Howells <dhowells@xxxxxxxxxx> Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@xxxxxxxxx> Cc: Jonathan Kowalski <bl0pbl33p@xxxxxxxxx> Cc: "Dmitry V. Levin" <ldv@xxxxxxxxxxxx> Cc: Andy Lutomirsky <luto@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: Aleksa Sarai <cyphar@xxxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> --- fs/proc/base.c | 130 ++++++++++++++++++++++++++++++++++--- fs/proc/fd.c | 4 +- fs/proc/internal.h | 2 +- fs/proc/namespaces.c | 2 +- include/linux/proc_fs.h | 19 ++++++ include/uapi/linux/sched.h | 1 + kernel/fork.c | 40 ++++++++++-- 7 files changed, 180 insertions(+), 18 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..2f5d7bd5d047 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1731,7 +1731,7 @@ void task_dump_owner(struct task_struct *task, umode_t mode, *rgid = gid; } -struct inode *proc_pid_make_inode(struct super_block * sb, +struct inode *proc_pid_make_inode(struct super_block * sb, struct pid *pid, struct task_struct *task, umode_t mode) { struct inode * inode; @@ -1753,7 +1753,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* * grab the reference to task. */ - ei->pid = get_task_pid(task, PIDTYPE_PID); + ei->pid = pid ? get_pid(pid) : get_task_pid(task, PIDTYPE_PID); if (!ei->pid) goto out_unlock; @@ -2070,7 +2070,7 @@ proc_map_files_instantiate(struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) @@ -2428,7 +2428,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); @@ -3184,12 +3184,13 @@ void proc_flush_task(struct task_struct *task) } } -static struct dentry *proc_pid_instantiate(struct dentry * dentry, - struct task_struct *task, const void *ptr) +static struct inode *proc_pid_dentry_init(struct dentry *dentry, + struct task_struct *task) { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); @@ -3201,9 +3202,122 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); + return inode; +} + +static struct dentry *proc_pid_instantiate(struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct inode *inode = proc_pid_dentry_init(dentry, task); + if (IS_ERR(inode)) + return ERR_CAST(inode); return d_splice_alias(inode, dentry); } +/* + * Open /proc/$pid before clone()'s point of no return, i.e. before + * attach_pid() has been called. + */ +int proc_pid_open_early(struct task_struct *task, struct pid *pid, + struct early_proc_pid *info) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct dentry *root = ns->proc_mnt->mnt_root; + pid_t vpid = pid_nr_ns(pid, ns); + struct inode *inode; + char pid_str[12]; + int res; + + if (WARN_ON(vpid == 0)) + return -ESRCH; + + /* + * We can't use lookup_one_len() here. When this function is called + * attach_pid() will not have been called which means that + * proc_pid_lookup() will fail with ENOENT as it can't successfully + * find_task_by_pid_ns(). + * We can just use d_alloc_name() though. + */ + snprintf(pid_str, sizeof(pid_str), "%d", vpid); + info->dentry = d_alloc_name(root, pid_str); + if (IS_ERR(info->dentry)) + return PTR_ERR(info->dentry); + + info->fd = __alloc_fd(current->files, 1, rlimit(RLIMIT_NOFILE), + O_CLOEXEC); + if (info->fd < 0) { + res = info->fd; + goto out_put_dentry; + } + + info->tmp_dentry = d_alloc_anon(root->d_sb); + if (!info->tmp_dentry) { + res = -ENOMEM; + goto out_put_fd; + } + + inode = proc_pid_dentry_init(info->tmp_dentry, task); + if (IS_ERR(inode)) { + res = PTR_ERR(inode); + goto out_put_tmp_dentry; + } + + d_instantiate(info->tmp_dentry, inode); + info->file = file_open_root(info->tmp_dentry, ns->proc_mnt, "/", + O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0); + if (IS_ERR(info->file)) { + res = PTR_ERR(info->file); + goto out_put_tmp_dentry; + } + + return 0; + +out_put_tmp_dentry: + dput(info->tmp_dentry); + +out_put_fd: + put_unused_fd(info->fd); + +out_put_dentry: + dput(info->dentry); + + return res; +} + +void proc_pid_dentry_commit_lock(struct early_proc_pid *info) +{ + lock_rename(info->tmp_dentry, info->dentry->d_parent); +} + +/* + * Commit /proc/$pid after clone()'s point of no return, and install the file + * descriptor. + * Drops the locks acquired by proc_pid_dentry_commit_lock(). + * Returns the file descriptor. + */ +int proc_pid_dentry_commit_unlock(struct early_proc_pid *info) +{ + /* commit the dentry */ + d_move(info->tmp_dentry, info->dentry); + unlock_rename(info->tmp_dentry, info->dentry->d_parent); + + /* release extra references */ + dput(info->tmp_dentry); + dput(info->dentry); + + /* install fd */ + fd_install(info->fd, info->file); + return info->fd; +} + +void proc_pid_dentry_abort(struct early_proc_pid *info) +{ + fput(info->file); + dput(info->tmp_dentry); + put_unused_fd(info->fd); + dput(info->dentry); +} + struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; @@ -3480,7 +3594,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 81882a13212d..7e624695db5a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -186,7 +186,7 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK); if (!inode) return ERR_PTR(-ENOENT); @@ -325,7 +325,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFREG | S_IRUSR); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1671e97f7fe..9b4cb85b96be 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -159,7 +159,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); -extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); +extern struct inode *proc_pid_make_inode(struct super_block *, struct pid *pid, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..b77e4234a892 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -94,7 +94,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); + inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK | S_IRWXUGO); if (!inode) return ERR_PTR(-ENOENT); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 52a283ba0465..e801481a8a24 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -75,6 +75,18 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +struct early_proc_pid { + struct dentry *dentry; + struct dentry *tmp_dentry; + struct file *file; + int fd; +}; +int proc_pid_open_early(struct task_struct *task, struct pid *pid, + struct early_proc_pid *info); +void proc_pid_dentry_commit_lock(struct early_proc_pid *info); +int proc_pid_dentry_commit_unlock(struct early_proc_pid *info); +void proc_pid_dentry_abort(struct early_proc_pid *info); + #else /* CONFIG_PROC_FS */ static inline void proc_root_init(void) @@ -120,6 +132,13 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file) return ERR_PTR(-EBADF); } +struct early_proc_pid {}; +static inline int proc_pid_open_early(struct task_struct *task, + struct pid *pid, struct early_proc_pid *info) { return -EINVAL; } +void proc_pid_dentry_commit_lock(struct early_proc_pid *info) { } +static inline int proc_pid_dentry_commit_unlock(struct early_proc_pid *info) { return -EINVAL; } +static inline void proc_pid_dentry_abort(struct early_proc_pid *info) { } + #endif /* CONFIG_PROC_FS */ struct net; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 22627f80063e..cd9bd14ce56d 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -10,6 +10,7 @@ #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PIDFD 0x00001000 /* create new pid file descriptor */ #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..31b405eee020 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -32,6 +32,7 @@ #include <linux/sem.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fsnotify.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/binfmts.h> @@ -1678,11 +1679,13 @@ static __latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, unsigned long tls, - int node) + int node, + int *pidfd) { int retval; struct task_struct *p; struct multiprocess_signals delayed; + struct early_proc_pid proc_pid_info; /* * Don't allow sharing the root directory with processes in a different @@ -1936,6 +1939,17 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child if + * the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = proc_pid_open_early(p, pid, &proc_pid_info); + if (retval) + goto bad_fork_free_pid; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1996,7 +2010,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_abort_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2009,6 +2023,9 @@ static __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); + if (clone_flags & CLONE_PIDFD) + proc_pid_dentry_commit_lock(&proc_pid_info); + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! @@ -2091,12 +2108,16 @@ static __latent_entropy struct task_struct *copy_process( attach_pid(p, PIDTYPE_PID); nr_threads++; } + total_forks++; hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (clone_flags & CLONE_PIDFD) + *pidfd = proc_pid_dentry_commit_unlock(&proc_pid_info); + proc_fork_connector(p); cgroup_post_fork(p); cgroup_threadgroup_change_end(current); @@ -2111,8 +2132,11 @@ static __latent_entropy struct task_struct *copy_process( spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); -bad_fork_free_pid: +bad_fork_abort_cgroup: cgroup_threadgroup_change_end(current); + if (clone_flags & CLONE_PIDFD) + proc_pid_dentry_abort(&proc_pid_info); +bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -2177,7 +2201,7 @@ struct task_struct *fork_idle(int cpu) { struct task_struct *task; task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, - cpu_to_node(cpu)); + cpu_to_node(cpu), NULL); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); @@ -2202,7 +2226,7 @@ long _do_fork(unsigned long clone_flags, struct completion vfork; struct pid *pid; struct task_struct *p; - int trace = 0; + int trace = 0, pidfd; long nr; /* @@ -2224,7 +2248,7 @@ long _do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + child_tidptr, NULL, trace, tls, NUMA_NO_NODE, &pidfd); add_latent_entropy(); if (IS_ERR(p)) @@ -2260,6 +2284,10 @@ long _do_fork(unsigned long clone_flags, } put_pid(pid); + + if (clone_flags & CLONE_PIDFD) + nr = pidfd; + return nr; } -- 2.21.0