On 30.07.2020 16:26, Christian Brauner wrote: > On Thu, Jul 30, 2020 at 03:00:19PM +0300, Kirill Tkhai wrote: >> This is a new directory to show all namespaces that can be >> accessed with the credentials of this /proc's tasks. >> >> Every /proc is related to a pid_namespace, and the pid_namespace >> is related to a user_namespace. The items we show in this >> /proc/namespaces/ directory are the namespaces >> whose user_namespaces are the same as /proc's user_namespace, >> or its descendants. >> >> Say /proc has pid_ns->user_ns; then in /proc/namespaces we show >> only a ns for which in_userns(pid_ns->user_ns, ns->user_ns) holds. >> >> The final result looks like this: >> >> # ls /proc/namespaces/ -l >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'cgroup:[4026531835]' -> 'cgroup:[4026531835]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'ipc:[4026531839]' -> 'ipc:[4026531839]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026531840]' -> 'mnt:[4026531840]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026531861]' -> 'mnt:[4026531861]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532133]' -> 'mnt:[4026532133]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532134]' -> 'mnt:[4026532134]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532135]' -> 'mnt:[4026532135]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532136]' -> 'mnt:[4026532136]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'net:[4026531993]' -> 'net:[4026531993]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'pid:[4026531836]' -> 'pid:[4026531836]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'time:[4026531834]' -> 'time:[4026531834]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'user:[4026531837]' -> 'user:[4026531837]' >> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'uts:[4026531838]' -> 'uts:[4026531838]' > > So usually, the /proc/<pid>/ns entries are guarded by > ptrace_may_access() but from skimming the patch it seems that > /proc/namespaces/ would be accessible by any user. > > I think we should guard /proc/namespaces/. 
Either by restricting it to > userns CAP_SYS_ADMIN or - to make it work with unprivileged CRIU - by > ns_capable(proc's_pid_ns->user_ns, CAP_SYS_PTRACE). I agree with you: the restrictions have to be strict. In suggesting this, do you mean guarding only open() on the /proc/namespaces/* files? I'm not sure we should prohibit a simple readdir of this directory. What do you think? > This should probably also be a mount option on procfs given that we now > allow a restricted view of procfs. > > Christian > >> >> Every namespace may be opened like an ordinary file in /proc/[pid]/ns. >> >> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx> >> --- >> fs/nsfs.c | 2 >> fs/proc/Makefile | 1 >> fs/proc/internal.h | 16 ++ >> fs/proc/namespaces.c | 314 +++++++++++++++++++++++++++++++++++++++++++++++ >> fs/proc/root.c | 17 ++- >> include/linux/proc_fs.h | 1 >> 6 files changed, 345 insertions(+), 6 deletions(-) >> create mode 100644 fs/proc/namespaces.c >> >> diff --git a/fs/nsfs.c b/fs/nsfs.c >> index ee4be67d3a0b..61b789d2089c 100644 >> --- a/fs/nsfs.c >> +++ b/fs/nsfs.c >> @@ -58,7 +58,7 @@ static void nsfs_evict(struct inode *inode) >> ns->ops->put(ns); >> } >> >> -static int __ns_get_path(struct path *path, struct ns_common *ns) >> +int __ns_get_path(struct path *path, struct ns_common *ns) >> { >> struct vfsmount *mnt = nsfs_mnt; >> struct dentry *dentry; >> diff --git a/fs/proc/Makefile b/fs/proc/Makefile >> index dc2d51f42905..34ff671c6d59 100644 >> --- a/fs/proc/Makefile >> +++ b/fs/proc/Makefile >> @@ -25,6 +25,7 @@ proc-y += util.o >> proc-y += version.o >> proc-y += softirqs.o >> proc-y += task_namespaces.o >> +proc-y += namespaces.o >> proc-y += self.o >> proc-y += thread_self.o >> proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o >> diff --git a/fs/proc/internal.h b/fs/proc/internal.h >> index 572757ff97be..d19fe5574799 100644 >> --- a/fs/proc/internal.h >> +++ b/fs/proc/internal.h >> @@ -134,10 +134,11 @@ void task_dump_owner(struct task_struct *task, umode_t mode, >> kuid_t *ruid, kgid_t 
*rgid); >> >> unsigned name_to_int(const struct qstr *qstr); >> -/* >> - * Offset of the first process in the /proc root directory.. >> - */ >> -#define FIRST_PROCESS_ENTRY 256 >> + >> +/* Offset of "namespaces" entry in /proc root directory */ >> +#define NAMESPACES_ENTRY 256 >> +/* Offset of the first process in the /proc root directory */ >> +#define FIRST_PROCESS_ENTRY (NAMESPACES_ENTRY + 1) >> >> /* Worst case buffer size needed for holding an integer. */ >> #define PROC_NUMBUF 13 >> @@ -168,6 +169,7 @@ extern void proc_pid_evict_inode(struct proc_inode *); >> extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); >> extern void pid_update_inode(struct task_struct *, struct inode *); >> extern int pid_delete_dentry(const struct dentry *); >> +extern int proc_emit_namespaces(struct file *, struct dir_context *); >> extern int proc_pid_readdir(struct file *, struct dir_context *); >> struct dentry *proc_pid_lookup(struct dentry *, unsigned int); >> extern loff_t mem_lseek(struct file *, loff_t, int); >> @@ -222,6 +224,12 @@ void set_proc_pid_nlink(void); >> extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); >> extern void proc_entry_rundown(struct proc_dir_entry *); >> >> +/* >> + * namespaces.c >> + */ >> +extern int proc_setup_namespaces(struct super_block *); >> +extern void proc_namespaces_init(void); >> + >> /* >> * task_namespaces.c >> */ >> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c >> new file mode 100644 >> index 000000000000..ab47e1555619 >> --- /dev/null >> +++ b/fs/proc/namespaces.c >> @@ -0,0 +1,314 @@ >> +#include <linux/pid_namespace.h> >> +#include <linux/user_namespace.h> >> +#include <linux/namei.h> >> +#include "internal.h" >> + >> +static unsigned namespaces_inum __ro_after_init; >> + >> +int proc_emit_namespaces(struct file *file, struct dir_context *ctx) >> +{ >> + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); >> + struct inode 
*inode = d_inode(fs_info->proc_namespaces); >> + >> + return dir_emit(ctx, "namespaces", 10, inode->i_ino, DT_DIR); >> +} >> + >> +static int parse_namespace_dentry_name(const struct dentry *dentry, >> + const char **type, unsigned int *type_len, unsigned int *inum) >> +{ >> + const char *p, *name; >> + int count; >> + >> + *type = name = dentry->d_name.name; >> + p = strchr(name, ':'); >> + *type_len = p - name; >> + if (!p || p == name) >> + return -ENOENT; > > Hm, rather: > > p = strchr(name, ':'); > if (!p || p == name) > return -ENOENT; > *type_len = p - name; > >> + >> + p += 1; >> + if (sscanf(p, "[%u]%n", inum, &count) != 1 || *(p + count) != '\0' || >> + *inum < PROC_NS_MIN_INO) >> + return -ENOENT; >> + >> + return 0; >> +} >> + >> +static struct ns_common *get_namespace_by_dentry(struct pid_namespace *pid_ns, >> + const struct dentry *dentry) >> +{ >> + unsigned int type_len, inum, p_inum; >> + struct user_namespace *user_ns; >> + struct ns_common *ns; >> + const char *type; >> + >> + if (parse_namespace_dentry_name(dentry, &type, &type_len, &inum) < 0) >> + return NULL; >> + >> + p_inum = inum - 1; >> + ns = ns_get_next(&p_inum); >> + if (!ns) >> + return NULL; >> + >> + if (ns->inum != inum || strncmp(type, ns->ops->name, type_len) != 0 || >> + ns->ops->name[type_len] != '\0') { >> + ns->ops->put(ns); >> + return NULL; >> + } >> + >> + if (ns->ops != &userns_operations) >> + user_ns = ns->ops->owner(ns); >> + else >> + user_ns = container_of(ns, struct user_namespace, ns); >> + >> + if (!in_userns(pid_ns->user_ns, user_ns)) { >> + ns->ops->put(ns); >> + return NULL; >> + } >> + >> + return ns; >> +} >> + >> +static struct dentry *proc_namespace_instantiate(struct dentry *dentry, >> + struct task_struct *task, const void *ptr); >> + >> +static struct dentry *proc_namespaces_lookup(struct inode *dir, struct dentry *dentry, >> + unsigned int flags) >> +{ >> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb); >> + struct task_struct *task; >> + struct 
ns_common *ns; >> + >> + ns = get_namespace_by_dentry(pid_ns, dentry); >> + if (!ns) >> + return ERR_PTR(-ENOENT); >> + >> + read_lock(&tasklist_lock); >> + task = get_task_struct(pid_ns->child_reaper); >> + read_unlock(&tasklist_lock); >> + >> + dentry = proc_namespace_instantiate(dentry, task, ns); >> + put_task_struct(task); >> + ns->ops->put(ns); >> + >> + return dentry; >> +} >> + >> +static int proc_namespaces_permission(struct inode *inode, int mask) >> +{ >> + if ((mask & MAY_EXEC) && S_ISLNK(inode->i_mode)) >> + return -EACCES; >> + >> + return 0; >> +} >> + >> +static int proc_namespaces_getattr(const struct path *path, struct kstat *stat, >> + u32 request_mask, unsigned int query_flags) >> +{ >> + struct inode *inode = d_inode(path->dentry); >> + >> + generic_fillattr(inode, stat); >> + return 0; >> +} >> + >> +static const struct inode_operations proc_namespaces_inode_operations = { >> + .lookup = proc_namespaces_lookup, >> + .permission = proc_namespaces_permission, >> + .getattr = proc_namespaces_getattr, >> +}; >> + >> +static int proc_namespaces_readlink(struct dentry *dentry, char __user *buffer, int buflen) >> +{ >> + struct inode *dir = dentry->d_parent->d_inode; >> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb); >> + struct ns_common *ns; >> + >> + ns = get_namespace_by_dentry(pid_ns, dentry); >> + if (!ns) >> + return -ENOENT; >> + ns->ops->put(ns); >> + >> + /* proc_namespaces_readdir() creates dentry names in namespace format */ >> + return readlink_copy(buffer, buflen, dentry->d_iname); >> +} >> + >> +int __ns_get_path(struct path *path, struct ns_common *ns); >> + >> +static const char *proc_namespaces_getlink(struct dentry *dentry, >> + struct inode *inode, struct delayed_call *done) >> +{ >> + struct pid_namespace *pid_ns = proc_pid_ns(inode->i_sb); >> + struct ns_common *ns; >> + struct path path; >> + int ret; >> + >> + if (!dentry) >> + return ERR_PTR(-ECHILD); >> + >> + while (1) { >> + ret = -ENOENT; >> + ns = 
get_namespace_by_dentry(pid_ns, dentry); >> + if (!ns) >> + goto out; >> + >> + ret = __ns_get_path(&path, ns); >> + if (ret == -EAGAIN) >> + continue; >> + if (ret) >> + goto out; >> + break; >> + } >> + >> + ret = nd_jump_link(&path); >> +out: >> + return ERR_PTR(ret); >> +} >> + >> +static const struct inode_operations proc_namespaces_link_inode_operations = { >> + .readlink = proc_namespaces_readlink, >> + .get_link = proc_namespaces_getlink, >> +}; >> + >> +static int namespace_delete_dentry(const struct dentry *dentry) >> +{ >> + struct inode *dir = dentry->d_parent->d_inode; >> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb); >> + struct ns_common *ns; >> + >> + ns = get_namespace_by_dentry(pid_ns, dentry); >> + if (!ns) >> + return 1; >> + >> + ns->ops->put(ns); >> + return 0; >> +} >> + >> +const struct dentry_operations namespaces_dentry_operations = { >> + .d_delete = namespace_delete_dentry, >> +}; >> + >> +static void namespace_update_inode(struct inode *inode) >> +{ >> + struct user_namespace *user_ns = proc_pid_ns(inode->i_sb)->user_ns; >> + >> + inode->i_uid = make_kuid(user_ns, 0); >> + if (!uid_valid(inode->i_uid)) >> + inode->i_uid = GLOBAL_ROOT_UID; >> + >> + inode->i_gid = make_kgid(user_ns, 0); >> + if (!gid_valid(inode->i_gid)) >> + inode->i_gid = GLOBAL_ROOT_GID; >> +} >> + >> +static struct dentry *proc_namespace_instantiate(struct dentry *dentry, >> + struct task_struct *task, const void *ptr) >> +{ >> + const struct ns_common *ns = ptr; >> + struct inode *inode; >> + struct proc_inode *ei; >> + >> + /* >> + * Create inode with credentials of @task, and add it to @task's >> + * quick removal list. 
>> + */ >> + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); >> + if (!inode) >> + return ERR_PTR(-ENOENT); >> + >> + ei = PROC_I(inode); >> + inode->i_op = &proc_namespaces_link_inode_operations; >> + ei->ns_ops = ns->ops; >> + namespace_update_inode(inode); >> + >> + d_set_d_op(dentry, &namespaces_dentry_operations); >> + return d_splice_alias(inode, dentry); >> +} >> + >> +static int proc_namespaces_readdir(struct file *file, struct dir_context *ctx) >> +{ >> + struct pid_namespace *pid_ns = proc_pid_ns(file_inode(file)->i_sb); >> + struct user_namespace *user_ns; >> + struct task_struct *task; >> + struct ns_common *ns; >> + unsigned int inum; >> + >> + read_lock(&tasklist_lock); >> + task = get_task_struct(pid_ns->child_reaper); >> + read_unlock(&tasklist_lock); >> + >> + if (!dir_emit_dots(file, ctx)) >> + goto out; >> + >> + inum = ctx->pos - 2; >> + while ((ns = ns_get_next(&inum)) != NULL) { >> + unsigned int len; >> + char name[32]; >> + >> + if (ns->ops != &userns_operations) >> + user_ns = ns->ops->owner(ns); >> + else >> + user_ns = container_of(ns, struct user_namespace, ns); >> + >> + if (!in_userns(pid_ns->user_ns, user_ns)) >> + goto next; >> + >> + len = snprintf(name, sizeof(name), "%s:[%u]", ns->ops->name, inum); >> + >> + if (!proc_fill_cache(file, ctx, name, len, >> + proc_namespace_instantiate, task, ns)) { >> + ns->ops->put(ns); >> + break; >> + } >> +next: >> + ns->ops->put(ns); >> + ctx->pos = inum + 2; >> + } >> +out: >> + put_task_struct(task); >> + return 0; >> +} >> + >> +static const struct file_operations proc_namespaces_file_operations = { >> + .read = generic_read_dir, >> + .iterate_shared = proc_namespaces_readdir, >> + .llseek = generic_file_llseek, >> +}; >> + >> +int proc_setup_namespaces(struct super_block *s) >> +{ >> + struct proc_fs_info *fs_info = proc_sb_info(s); >> + struct inode *root_inode = d_inode(s->s_root); >> + struct dentry *namespaces; >> + int ret = -ENOMEM; >> + >> + 
inode_lock(root_inode); >> + namespaces = d_alloc_name(s->s_root, "namespaces"); >> + if (namespaces) { >> + struct inode *inode = new_inode_pseudo(s); >> + if (inode) { >> + inode->i_ino = namespaces_inum; >> + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); >> + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; >> + inode->i_uid = GLOBAL_ROOT_UID; >> + inode->i_gid = GLOBAL_ROOT_GID; >> + inode->i_op = &proc_namespaces_inode_operations; >> + inode->i_fop = &proc_namespaces_file_operations; >> + d_add(namespaces, inode); >> + ret = 0; >> + } else { >> + dput(namespaces); >> + } >> + } >> + inode_unlock(root_inode); >> + >> + if (ret) >> + pr_err("proc_setup_namespaces: can't allocate /proc/namespaces\n"); >> + else >> + fs_info->proc_namespaces = namespaces; >> + >> + return ret; >> +} >> + >> +void __init proc_namespaces_init(void) >> +{ >> + proc_alloc_inum(&namespaces_inum); >> +} >> diff --git a/fs/proc/root.c b/fs/proc/root.c >> index 5e444d4f9717..e4e4f90fca3d 100644 >> --- a/fs/proc/root.c >> +++ b/fs/proc/root.c >> @@ -206,6 +206,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) >> return -ENOMEM; >> } >> >> + ret = proc_setup_namespaces(s); >> + if (ret) >> + return ret; >> + >> ret = proc_setup_self(s); >> if (ret) { >> return ret; >> @@ -272,6 +276,9 @@ static void proc_kill_sb(struct super_block *sb) >> dput(fs_info->proc_self); >> dput(fs_info->proc_thread_self); >> >> + if (fs_info->proc_namespaces) >> + dput(fs_info->proc_namespaces); >> + >> kill_anon_super(sb); >> put_pid_ns(fs_info->pid_ns); >> kfree(fs_info); >> @@ -289,6 +296,7 @@ void __init proc_root_init(void) >> { >> proc_init_kmemcache(); >> set_proc_pid_nlink(); >> + proc_namespaces_init(); >> proc_self_init(); >> proc_thread_self_init(); >> proc_symlink("mounts", NULL, "self/mounts"); >> @@ -326,8 +334,15 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr >> >> static int proc_root_readdir(struct file *file, 
struct dir_context *ctx) >> { >> - if (ctx->pos < FIRST_PROCESS_ENTRY) { >> + if (ctx->pos < NAMESPACES_ENTRY) { >> int error = proc_readdir(file, ctx); >> + if (unlikely(error <= 0)) >> + return error; >> + ctx->pos = NAMESPACES_ENTRY; >> + } >> + >> + if (ctx->pos == NAMESPACES_ENTRY) { >> + int error = proc_emit_namespaces(file, ctx); >> if (unlikely(error <= 0)) >> return error; >> ctx->pos = FIRST_PROCESS_ENTRY; >> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h >> index 97b3f5f06db9..8b0002a6cacf 100644 >> --- a/include/linux/proc_fs.h >> +++ b/include/linux/proc_fs.h >> @@ -61,6 +61,7 @@ struct proc_fs_info { >> struct pid_namespace *pid_ns; >> struct dentry *proc_self; /* For /proc/self */ >> struct dentry *proc_thread_self; /* For /proc/thread-self */ >> + struct dentry *proc_namespaces; /* For /proc/namespaces */ >> kgid_t pid_gid; >> enum proc_hidepid hide_pid; >> enum proc_pidonly pidonly; >> >>