Hi Eric, Thanks a lot for looking into this problem. On Sat, Jun 16, 2018 at 7:55 PM Eric W. Biederman <ebiederm@xxxxxxxxxxxx> wrote: > > > Today there are three users of proc_mnt. > - The legacy sysctl system call implementation. > - The uml mconsole driver. > - The process cleanup function proc_flush_task. > > The first two are slow path and essentially unused. I expect soon we > will be able to remove the legacy sysctl system call entirely. To > keep them working for now a new wrapper file_open_proc_is added to > mount and unmount proc around file_open_root. Which nicely removes > the need for a always mounted proc instance for these cases. > > Handling proc_flush_task which is regularly used requires a little more > work. First I optimize proc_flush_task to do nothing where there is > evidence that there are no entries in proc, by looking at pid->count. > Then I carefully update proc_fill_super and proc_kill_sb to maintain a > ns->proc_super pointer to the super block for proc. This allows > proc_flush_task to find the appropriate instance of proc via rcu. > > Once the appropriate instance of proc is found in proc_flush_task > atomic_inc_not_zero is used to increase the s_active count ensuring > proc_kill_sb will not be called, until the superblock is deactivated. > This makes it safe to inspect the instance of proc and invalidate any > dentries that mention the exiting task. > > The two extra atomics operations in exit are not my favorite but given > that exit is already almost completely serialized with the task lock I > do not expect this change will be measurable. > > The benefit for all of this change is that one of the most error prone > and tricky parts of the pid namespace implementation, maintaining > kernel mounts of proc is removed. > > In addition removing the unnecessary complexity of the kernel mount > fixes a regression that caused the proc mount options to be ignored. > Now that the initial mount of proc comes from userspace, those mount > options are again honored. This fixes Android's usage of the proc > hidepid option. > > Reported-by: Alistair Strachan <astrachan@xxxxxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx > Fixes: e94591d0d90c ("proc: Convert proc_mount to use mount_ns.") > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > --- > > Alistair if you can test and confirm this fixes your issue I will add > your tested by and send the fix to Linus. I tested v2 with both UML and qemu-system-x86_64 / ARCH=x86_64 against 4.18-rc1, 4.14 and 4.9 and I couldn't break it. The hidepid problem is resolved, and the mount flags can now only be specified on the first userspace mount for that pid namespace. Tested-by: Alistair Strachan <astrachan@xxxxxxxxxx> > Since my earlier posting I have spot tested this. Fixed a few bugs that > showed up and verified my changes work. So I think this is ready to go > unless someone looks at this and in testing or code review spots a bug. Agreed! > Eric > > arch/um/drivers/mconsole_kern.c | 4 ++-- > fs/proc/base.c | 36 ++++++++++++++++++++++++++------- > fs/proc/inode.c | 5 ++++- > fs/proc/root.c | 28 ++++++++++--------------- > include/linux/pid_namespace.h | 3 +-- > include/linux/proc_ns.h | 7 ++----- > kernel/pid.c | 8 -------- > kernel/pid_namespace.c | 7 ------- > kernel/sysctl_binary.c | 5 ++--- > 9 files changed, 51 insertions(+), 52 deletions(-) > > diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c > index d5f9a2d1da1b..36af0e02d56b 100644 > --- a/arch/um/drivers/mconsole_kern.c > +++ b/arch/um/drivers/mconsole_kern.c > @@ -27,6 +27,7 @@ > #include <linux/file.h> > #include <linux/uaccess.h> > #include <asm/switch_to.h> > +#include <linux/proc_ns.h> > > #include <init.h> > #include <irq_kern.h> > @@ -124,7 +125,6 @@ void mconsole_log(struct mc_request *req) > > void mconsole_proc(struct mc_request *req) > { > - struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; > char *buf; > int len; > struct file *file; > @@ -135,7 +135,7 @@ void mconsole_proc(struct mc_request *req) > ptr += strlen("proc"); > ptr = skip_spaces(ptr); > > - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); > + file = file_open_proc(ptr, O_RDONLY, 0); > if (IS_ERR(file)) { > mconsole_reply(req, "Failed to open file", 1, 0); > printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); > diff --git a/fs/proc/base.c b/fs/proc/base.c > index 1b2ede6abcdf..cd7b68a64ed1 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -3052,7 +3052,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { > .permission = proc_pid_permission, > }; > > -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) > +static void proc_flush_task_root(struct dentry *proc_root, pid_t pid, pid_t tgid) > { > struct dentry *dentry, *leader, *dir; > char buf[10 + 1]; > @@ -3061,7 +3061,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) > name.name = buf; > name.len = snprintf(buf, sizeof(buf), "%u", pid); > /* no ->d_hash() rejects on procfs */ > - dentry = d_hash_and_lookup(mnt->mnt_root, &name); > + dentry = d_hash_and_lookup(proc_root, &name); > if (dentry) { > d_invalidate(dentry); > dput(dentry); > @@ -3072,7 +3072,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) > > name.name = buf; > name.len = snprintf(buf, sizeof(buf), "%u", tgid); > - leader = d_hash_and_lookup(mnt->mnt_root, &name); > + leader = d_hash_and_lookup(proc_root, &name); > if (!leader) > goto out; > > @@ -3102,8 +3102,8 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) > * @task: task that should be flushed. > * > * When flushing dentries from proc, one needs to flush them from global > - * proc (proc_mnt) and from all the namespaces' procs this task was seen > - * in. This call is supposed to do all of this job. > + * proc and from all the namespaces' procs this task was seen in. This call > + * is supposed to do all of this job. > * > * Looks in the dcache for > * /proc/@pid > @@ -3127,15 +3127,37 @@ void proc_flush_task(struct task_struct *task) > int i; > struct pid *pid, *tgid; > struct upid *upid; > + int expected = 1; > > pid = task_pid(task); > tgid = task_tgid(task); > + if (thread_group_leader(task)) { > + if (task_pgrp(task) == pid) > + expected++; > + if (task_session(task) == pid) > + expected++; > + } > + > + /* Nothing to do if proc inodes have not take a reference to pid */ > + if (atomic_read(&pid->count) == expected) > + return; > > + rcu_read_lock(); > for (i = 0; i <= pid->level; i++) { > + struct super_block *sb; > upid = &pid->numbers[i]; > - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, > - tgid->numbers[i].nr); > + > + sb = rcu_dereference(upid->ns->proc_super); > + if (!sb || !atomic_inc_not_zero(&sb->s_active)) > + continue; > + rcu_read_unlock(); > + > + proc_flush_task_root(sb->s_root, upid->nr, tgid->numbers[i].nr); > + deactivate_super(sb); > + > + rcu_read_lock(); > } > + rcu_read_unlock(); > } > > static int proc_pid_instantiate(struct inode *dir, > diff --git a/fs/proc/inode.c b/fs/proc/inode.c > index 2cf3b74391ca..1dd9514fa068 100644 > --- a/fs/proc/inode.c > +++ b/fs/proc/inode.c > @@ -532,5 +532,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent) > if (ret) { > return ret; > } > - return proc_setup_thread_self(s); > + ret = proc_setup_thread_self(s); > + > + rcu_assign_pointer(ns->proc_super, s); > + return ret; > } > diff --git a/fs/proc/root.c b/fs/proc/root.c > index 61b7340b357a..59ca06c386a0 100644 > --- a/fs/proc/root.c > +++ b/fs/proc/root.c > @@ -89,14 +89,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data) > static struct dentry *proc_mount(struct file_system_type *fs_type, > int flags, const char *dev_name, void *data) > { > - struct pid_namespace *ns; > - > - if (flags & SB_KERNMOUNT) { > - ns = data; > - data = NULL; > - } else { > - ns = task_active_pid_ns(current); > - } > + struct pid_namespace *ns = task_active_pid_ns(current); > > return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); > } > @@ -106,6 +99,7 @@ static void proc_kill_sb(struct super_block *sb) > struct pid_namespace *ns; > > ns = (struct pid_namespace *)sb->s_fs_info; > + rcu_assign_pointer(ns->proc_super, NULL); > if (ns->proc_self) > dput(ns->proc_self); > if (ns->proc_thread_self) > @@ -208,19 +202,19 @@ struct proc_dir_entry proc_root = { > .inline_name = "/proc", > }; > > -int pid_ns_prepare_proc(struct pid_namespace *ns) > +#if defined(CONFIG_SYSCTL_SYSCALL) || defined(CONFIG_MCONSOLE) > +struct file *file_open_proc(const char *pathname, int flags, umode_t mode) > { > struct vfsmount *mnt; > + struct file *file; > > - mnt = kern_mount_data(&proc_fs_type, ns); > + mnt = kern_mount(&proc_fs_type); > if (IS_ERR(mnt)) > - return PTR_ERR(mnt); > + return ERR_CAST(mnt); > > - ns->proc_mnt = mnt; > - return 0; > -} > + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, mode); > + kern_unmount(mnt); > > -void pid_ns_release_proc(struct pid_namespace *ns) > -{ > - kern_unmount(ns->proc_mnt); > + return file; > } > +#endif > diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h > index 49538b172483..dfa70858b19a 100644 > --- a/include/linux/pid_namespace.h > +++ b/include/linux/pid_namespace.h > @@ -31,7 +31,7 @@ struct pid_namespace { > unsigned int level; > struct pid_namespace *parent; > #ifdef CONFIG_PROC_FS > - struct vfsmount *proc_mnt; > + struct super_block __rcu *proc_super; > struct dentry *proc_self; > struct dentry *proc_thread_self; > #endif > @@ -40,7 +40,6 @@ struct pid_namespace { > #endif > struct user_namespace *user_ns; > struct ucounts *ucounts; > - struct work_struct proc_work; > kgid_t pid_gid; > int hide_pid; > int reboot; /* group exit code if this pidns was rebooted */ > diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h > index d31cb6215905..8f1b9edf40ba 100644 > --- a/include/linux/proc_ns.h > +++ b/include/linux/proc_ns.h > @@ -47,16 +47,11 @@ enum { > > #ifdef CONFIG_PROC_FS > > -extern int pid_ns_prepare_proc(struct pid_namespace *ns); > -extern void pid_ns_release_proc(struct pid_namespace *ns); > extern int proc_alloc_inum(unsigned int *pino); > extern void proc_free_inum(unsigned int inum); > > #else /* CONFIG_PROC_FS */ > > -static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } > -static inline void pid_ns_release_proc(struct pid_namespace *ns) {} > - > static inline int proc_alloc_inum(unsigned int *inum) > { > *inum = 1; > @@ -86,4 +81,6 @@ extern int ns_get_name(char *buf, size_t size, struct task_struct *task, > const struct proc_ns_operations *ns_ops); > extern void nsfs_init(void); > > +extern struct file *file_open_proc(const char *pathname, int flags, umode_t mode); > + > #endif /* _LINUX_PROC_NS_H */ > diff --git a/kernel/pid.c b/kernel/pid.c > index 157fe4b19971..7a1a4f39e527 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -143,9 +143,6 @@ void free_pid(struct pid *pid) > /* Handle a fork failure of the first process */ > WARN_ON(ns->child_reaper); > ns->pid_allocated = 0; > - /* fall through */ > - case 0: > - schedule_work(&ns->proc_work); > break; > } > > @@ -204,11 +201,6 @@ struct pid *alloc_pid(struct pid_namespace *ns) > tmp = tmp->parent; > } > > - if (unlikely(is_child_reaper(pid))) { > - if (pid_ns_prepare_proc(ns)) > - goto out_free; > - } > - > get_pid_ns(ns); > atomic_set(&pid->count, 1); > for (type = 0; type < PIDTYPE_MAX; ++type) > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index 2a2ac53d8b8b..3018cc18ac38 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -58,12 +58,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) > return READ_ONCE(*pkc); > } > > -static void proc_cleanup_work(struct work_struct *work) > -{ > - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); > - pid_ns_release_proc(ns); > -} > - > static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) > { > return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); > @@ -115,7 +109,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->user_ns = get_user_ns(user_ns); > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > - INIT_WORK(&ns->proc_work, proc_cleanup_work); > > return ns; > > diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c > index 07148b497451..b655410fa05a 100644 > --- a/kernel/sysctl_binary.c > +++ b/kernel/sysctl_binary.c > @@ -17,6 +17,7 @@ > #include <linux/uuid.h> > #include <linux/slab.h> > #include <linux/compat.h> > +#include <linux/proc_ns.h> > > #ifdef CONFIG_SYSCTL_SYSCALL > > @@ -1278,7 +1279,6 @@ static ssize_t binary_sysctl(const int *name, int nlen, > void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) > { > const struct bin_table *table = NULL; > - struct vfsmount *mnt; > struct file *file; > ssize_t result; > char *pathname; > @@ -1301,8 +1301,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, > goto out_putname; > } > > - mnt = task_active_pid_ns(current)->proc_mnt; > - file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); > + file = file_open_proc(pathname, flags, 0); > result = PTR_ERR(file); > if (IS_ERR(file)) > goto out_putname; > -- > 2.17.1