[RFC-1 PATCH 1/1] fork: add CLONE_PIDFD via /proc/<pid>

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is an RFC for the implementation of pidfds as /proc/<pid> file
descriptors. They can be retrieved through the clone() with the addition of
the CLONE_PIDFD flag.
The tricky part here is that we need to retrieve a file descriptor for
/proc/<pid> before clone's point of no return. Otherwise, we need to fail
the creation of a process that has already passed all barriers and is
visible in userspace. Getting that file descriptor then becomes a rather
intricate dance including allocating a detached dentry that we need to
commit once attach_pid() has been called.
Note that this RFC only includes the logic we think is needed to return
/proc/<pid> file descriptors from clone. It does *not* yet include the even
more complex logic needed to restrict procfs itself. And the additional
logic needed to prevent attacks such as openat(pidfd, "..", ...) and access
to /proc/<pid>/net/.

There are a couple of reasons why we stopped short of this and decided to
sent out an RFC first.
- Even the initial part of getting file descriptors from /proc/<pid> out
  out clone() required rather complex code that struck us as very
  inelegant and heavy (which granted, might partially caused by not seeing
  a cleaner way to implement this). Thus, it felt like we needed to see
  whether this is even remotely considered acceptable.
- While discussion further aspects of this approach with Al we received
  rather substantiated opposition to exposing even more codepaths to
  procfs.
- Restricting access to procfs properly requires a lot of invasive work
  even touching core vfs functions such as
  follow_dotdot()/follow_dotdot_rcu() which also caused 2.

Jann and I are providing a second RFC alongside this one that shows an
alternative and rather much simpler approach we think might be preferable.

Signed-off-by: Christian Brauner <christian@xxxxxxxxxx>
Signed-off-by: Jann Horn <jann@xxxxxxxxx>
Cc: Arnd Bergmann <arnd@xxxxxxxx>
Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Alexey Dobriyan <adobriyan@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: David Howells <dhowells@xxxxxxxxxx>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@xxxxxxxxx>
Cc: Jonathan Kowalski <bl0pbl33p@xxxxxxxxx>
Cc: "Dmitry V. Levin" <ldv@xxxxxxxxxxxx>
Cc: Andy Lutomirsky <luto@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: Aleksa Sarai <cyphar@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
---
 fs/proc/base.c             | 130 ++++++++++++++++++++++++++++++++++---
 fs/proc/fd.c               |   4 +-
 fs/proc/internal.h         |   2 +-
 fs/proc/namespaces.c       |   2 +-
 include/linux/proc_fs.h    |  19 ++++++
 include/uapi/linux/sched.h |   1 +
 kernel/fork.c              |  40 ++++++++++--
 7 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..2f5d7bd5d047 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1731,7 +1731,7 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	*rgid = gid;
 }
 
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block * sb, struct pid *pid,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
@@ -1753,7 +1753,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	/*
 	 * grab the reference to task.
 	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
+	ei->pid = pid ? get_pid(pid) : get_task_pid(task, PIDTYPE_PID);
 	if (!ei->pid)
 		goto out_unlock;
 
@@ -2070,7 +2070,7 @@ proc_map_files_instantiate(struct dentry *dentry,
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK |
 				    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
 				    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
 	if (!inode)
@@ -2428,7 +2428,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, p->mode);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -3184,12 +3184,13 @@ void proc_flush_task(struct task_struct *task)
 	}
 }
 
-static struct dentry *proc_pid_instantiate(struct dentry * dentry,
-				   struct task_struct *task, const void *ptr)
+static struct inode *proc_pid_dentry_init(struct dentry *dentry,
+					  struct task_struct *task)
 {
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task,
+				    S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -3201,9 +3202,122 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 	pid_update_inode(task, inode);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
+	return inode;
+}
+
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
+				   struct task_struct *task, const void *ptr)
+{
+	struct inode *inode = proc_pid_dentry_init(dentry, task);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 	return d_splice_alias(inode, dentry);
 }
 
+/*
+ * Open /proc/$pid before clone()'s point of no return, i.e. before
+ * attach_pid() has been called.
+ */
+int proc_pid_open_early(struct task_struct *task, struct pid *pid,
+			struct early_proc_pid *info)
+{
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct dentry *root = ns->proc_mnt->mnt_root;
+	pid_t vpid = pid_nr_ns(pid, ns);
+	struct inode *inode;
+	char pid_str[12];
+	int res;
+
+	if (WARN_ON(vpid == 0))
+		return -ESRCH;
+
+	/*
+	 * We can't use lookup_one_len() here. When this function is called
+	 * attach_pid() will not have been called which means that
+	 * proc_pid_lookup() will fail with ENOENT as it can't successfully
+	 * find_task_by_pid_ns().
+	 * We can just use d_alloc_name() though.
+	 */
+	snprintf(pid_str, sizeof(pid_str), "%d", vpid);
+	info->dentry = d_alloc_name(root, pid_str);
+	if (IS_ERR(info->dentry))
+		return PTR_ERR(info->dentry);
+
+	info->fd = __alloc_fd(current->files, 1, rlimit(RLIMIT_NOFILE),
+			      O_CLOEXEC);
+	if (info->fd < 0) {
+		res = info->fd;
+		goto out_put_dentry;
+	}
+
+	info->tmp_dentry = d_alloc_anon(root->d_sb);
+	if (!info->tmp_dentry) {
+		res = -ENOMEM;
+		goto out_put_fd;
+	}
+
+	inode = proc_pid_dentry_init(info->tmp_dentry, task);
+	if (IS_ERR(inode)) {
+		res = PTR_ERR(inode);
+		goto out_put_tmp_dentry;
+	}
+
+	d_instantiate(info->tmp_dentry, inode);
+	info->file = file_open_root(info->tmp_dentry, ns->proc_mnt, "/",
+				    O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0);
+	if (IS_ERR(info->file)) {
+		res = PTR_ERR(info->file);
+		goto out_put_tmp_dentry;
+	}
+
+	return 0;
+
+out_put_tmp_dentry:
+	dput(info->tmp_dentry);
+
+out_put_fd:
+	put_unused_fd(info->fd);
+
+out_put_dentry:
+	dput(info->dentry);
+
+	return res;
+}
+
+void proc_pid_dentry_commit_lock(struct early_proc_pid *info)
+{
+	lock_rename(info->tmp_dentry, info->dentry->d_parent);
+}
+
+/*
+ * Commit /proc/$pid after clone()'s point of no return, and install the file
+ * descriptor.
+ * Drops the locks acquired by proc_pid_dentry_commit_lock().
+ * Returns the file descriptor.
+ */
+int proc_pid_dentry_commit_unlock(struct early_proc_pid *info)
+{
+	/* commit the dentry */
+	d_move(info->tmp_dentry, info->dentry);
+	unlock_rename(info->tmp_dentry, info->dentry->d_parent);
+
+	/* release extra references */
+	dput(info->tmp_dentry);
+	dput(info->dentry);
+
+	/* install fd */
+	fd_install(info->fd, info->file);
+	return info->fd;
+}
+
+void proc_pid_dentry_abort(struct early_proc_pid *info)
+{
+	fput(info->file);
+	dput(info->tmp_dentry);
+	put_unused_fd(info->fd);
+	dput(info->dentry);
+}
+
 struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
 {
 	struct task_struct *task;
@@ -3480,7 +3594,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
 	struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..7e624695db5a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -186,7 +186,7 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry,
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -325,7 +325,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFREG | S_IRUSR);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d1671e97f7fe..9b4cb85b96be 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -159,7 +159,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
-extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct pid *pid, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..b77e4234a892 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -94,7 +94,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, NULL, task, S_IFLNK | S_IRWXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 52a283ba0465..e801481a8a24 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -75,6 +75,18 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+struct early_proc_pid {
+	struct dentry *dentry;
+	struct dentry *tmp_dentry;
+	struct file *file;
+	int fd;
+};
+int proc_pid_open_early(struct task_struct *task, struct pid *pid,
+			struct early_proc_pid *info);
+void proc_pid_dentry_commit_lock(struct early_proc_pid *info);
+int proc_pid_dentry_commit_unlock(struct early_proc_pid *info);
+void proc_pid_dentry_abort(struct early_proc_pid *info);
+
 #else /* CONFIG_PROC_FS */
 
 static inline void proc_root_init(void)
@@ -120,6 +132,13 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
 	return ERR_PTR(-EBADF);
 }
 
+struct early_proc_pid {};
+static inline int proc_pid_open_early(struct task_struct *task,
+		struct pid *pid, struct early_proc_pid *info) { return -EINVAL; }
+void proc_pid_dentry_commit_lock(struct early_proc_pid *info) { }
+static inline int proc_pid_dentry_commit_unlock(struct early_proc_pid *info) { return -EINVAL; }
+static inline void proc_pid_dentry_abort(struct early_proc_pid *info) { }
+
 #endif /* CONFIG_PROC_FS */
 
 struct net;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 22627f80063e..cd9bd14ce56d 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -10,6 +10,7 @@
 #define CLONE_FS	0x00000200	/* set if fs info shared between processes */
 #define CLONE_FILES	0x00000400	/* set if open files shared between processes */
 #define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PIDFD	0x00001000	/* create new pid file descriptor */
 #define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
 #define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
 #define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..31b405eee020 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/fsnotify.h>
 #include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
@@ -1678,11 +1679,13 @@ static __latent_entropy struct task_struct *copy_process(
 					struct pid *pid,
 					int trace,
 					unsigned long tls,
-					int node)
+					int node,
+					int *pidfd)
 {
 	int retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+	struct early_proc_pid proc_pid_info;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1936,6 +1939,17 @@ static __latent_entropy struct task_struct *copy_process(
 		}
 	}
 
+	/*
+	 * This has to happen after we've potentially unshared the file
+	 * descriptor table (so that the pidfd doesn't leak into the child if
+	 * the fd table isn't shared).
+	 */
+	if (clone_flags & CLONE_PIDFD) {
+		retval = proc_pid_open_early(p, pid, &proc_pid_info);
+		if (retval)
+			goto bad_fork_free_pid;
+	}
+
 #ifdef CONFIG_BLOCK
 	p->plug = NULL;
 #endif
@@ -1996,7 +2010,7 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	retval = cgroup_can_fork(p);
 	if (retval)
-		goto bad_fork_free_pid;
+		goto bad_fork_abort_cgroup;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space
@@ -2009,6 +2023,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p->start_time = ktime_get_ns();
 	p->real_start_time = ktime_get_boot_ns();
 
+	if (clone_flags & CLONE_PIDFD)
+		proc_pid_dentry_commit_lock(&proc_pid_info);
+
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
@@ -2091,12 +2108,16 @@ static __latent_entropy struct task_struct *copy_process(
 		attach_pid(p, PIDTYPE_PID);
 		nr_threads++;
 	}
+
 	total_forks++;
 	hlist_del_init(&delayed.node);
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
 	write_unlock_irq(&tasklist_lock);
 
+	if (clone_flags & CLONE_PIDFD)
+		*pidfd = proc_pid_dentry_commit_unlock(&proc_pid_info);
+
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	cgroup_threadgroup_change_end(current);
@@ -2111,8 +2132,11 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p);
-bad_fork_free_pid:
+bad_fork_abort_cgroup:
 	cgroup_threadgroup_change_end(current);
+	if (clone_flags & CLONE_PIDFD)
+		proc_pid_dentry_abort(&proc_pid_info);
+bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_thread:
@@ -2177,7 +2201,7 @@ struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
 	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+			    cpu_to_node(cpu), NULL);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2202,7 +2226,7 @@ long _do_fork(unsigned long clone_flags,
 	struct completion vfork;
 	struct pid *pid;
 	struct task_struct *p;
-	int trace = 0;
+	int trace = 0, pidfd;
 	long nr;
 
 	/*
@@ -2224,7 +2248,7 @@ long _do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE, &pidfd);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2260,6 +2284,10 @@ long _do_fork(unsigned long clone_flags,
 	}
 
 	put_pid(pid);
+
+	if (clone_flags & CLONE_PIDFD)
+		nr = pidfd;
+
 	return nr;
 }
 
-- 
2.21.0




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux