On Thu, Nov 07, 2019 at 03:03:04PM +0100, Laurent Vivier wrote: > This patch allows to have a different binfmt_misc configuration > for each new user namespace. By default, the binfmt_misc configuration > is the one of the previous level, but if the binfmt_misc filesystem is > mounted in the new namespace a new empty binfmt instance is created and > used in this namespace. > > For instance, using "unshare" we can start a chroot of another > architecture and configure the binfmt_misc interpreter without being root > to run the binaries in this chroot. > > Signed-off-by: Laurent Vivier <laurent@xxxxxxxxx> > Acked-by: Andrei Vagin <avagin@xxxxxxxxx> > --- > fs/binfmt_misc.c | 115 +++++++++++++++++++++++++-------- > include/linux/user_namespace.h | 15 +++++ > kernel/user.c | 14 ++++ > kernel/user_namespace.c | 3 + > 4 files changed, 119 insertions(+), 28 deletions(-) > > diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c > index cdb45829354d..ba5f0d2ade96 100644 > --- a/fs/binfmt_misc.c > +++ b/fs/binfmt_misc.c > @@ -40,9 +40,6 @@ enum { > VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ > }; > > -static LIST_HEAD(entries); > -static int enabled = 1; > - > enum {Enabled, Magic}; > #define MISC_FMT_PRESERVE_ARGV0 (1 << 31) > #define MISC_FMT_OPEN_BINARY (1 << 30) > @@ -62,10 +59,7 @@ typedef struct { > struct file *interp_file; > } Node; > > -static DEFINE_RWLOCK(entries_lock); > static struct file_system_type bm_fs_type; > -static struct vfsmount *bm_mnt; > -static int entry_count; > > /* > * Max length of the register string. 
Determined by: > @@ -82,18 +76,37 @@ static int entry_count; > */ > #define MAX_REGISTER_LENGTH 1920 > > +static struct binfmt_namespace *binfmt_ns(struct user_namespace *ns) > +{ > + struct binfmt_namespace *b_ns; > + > + while (ns) { > + b_ns = READ_ONCE(ns->binfmt_ns); > + if (b_ns) > + return b_ns; > + ns = ns->parent; > + } > + /* as the first user namespace is initialized with > + * &init_binfmt_ns we should never come here > + * but we try to stay safe by logging a warning > + * and returning a sane value > + */ > + WARN_ON_ONCE(1); > + return &init_binfmt_ns; > +} > + > /* > * Check if we support the binfmt > * if we do, return the node, else NULL > * locking is done in load_misc_binary > */ > -static Node *check_file(struct linux_binprm *bprm) > +static Node *check_file(struct binfmt_namespace *ns, struct linux_binprm *bprm) > { > char *p = strrchr(bprm->interp, '.'); > struct list_head *l; > > /* Walk all the registered handlers. */ > - list_for_each(l, &entries) { > + list_for_each(l, &ns->entries) { > Node *e = list_entry(l, Node, list); > char *s; > int j; > @@ -135,17 +148,18 @@ static int load_misc_binary(struct linux_binprm *bprm) > struct file *interp_file = NULL; > int retval; > int fd_binary = -1; > + struct binfmt_namespace *ns = binfmt_ns(current_user_ns()); > > retval = -ENOEXEC; > - if (!enabled) > + if (!ns->enabled) > return retval; > > /* to keep locking time low, we copy the interpreter string */ > - read_lock(&entries_lock); > - fmt = check_file(bprm); > + read_lock(&ns->entries_lock); > + fmt = check_file(ns, bprm); > if (fmt) > dget(fmt->dentry); > - read_unlock(&entries_lock); > + read_unlock(&ns->entries_lock); > if (!fmt) > return retval; > > @@ -611,19 +625,19 @@ static void bm_evict_inode(struct inode *inode) > kfree(e); > } > > -static void kill_node(Node *e) > +static void kill_node(struct binfmt_namespace *ns, Node *e) > { > struct dentry *dentry; > > - write_lock(&entries_lock); > + write_lock(&ns->entries_lock); > 
list_del_init(&e->list); > - write_unlock(&entries_lock); > + write_unlock(&ns->entries_lock); > > dentry = e->dentry; > drop_nlink(d_inode(dentry)); > d_drop(dentry); > dput(dentry); > - simple_release_fs(&bm_mnt, &entry_count); > + simple_release_fs(&ns->bm_mnt, &ns->entry_count); > } > > /* /<entry> */ > @@ -653,6 +667,9 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, > struct dentry *root; > Node *e = file_inode(file)->i_private; > int res = parse_command(buffer, count); > + struct binfmt_namespace *ns; > + > + ns = binfmt_ns(file->f_path.dentry->d_sb->s_user_ns); Sorry for being so late to the party. The naked dereferences here are not very pretty, and they also likely mean you access the wrong dentry when you do (weird but possible) things like: mount -t overlay overlay -o lowerdir=/proc/sys/fs/binfmt_misc,workdir=/work,upperdir=/upper /merged (which might already cause trouble in other parts of this code). So I think it's better if you replace file->f_path.dentry->d_sb->s_user_ns with file_dentry(file)->d_sb->s_user_ns in the places where you do naked dereferences now.