Quoting Eric W. Biederman (ebiederm@xxxxxxxxxxxx): > Limit per userns sysctls to only be opened for write by a holder > of CAP_SYS_RESOURCE. > > Add all of the necessary boilerplate for having per user namespace > sysctls. > > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > --- > include/linux/user_namespace.h | 4 ++ > kernel/user_namespace.c | 92 +++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 94 insertions(+), 2 deletions(-) > > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h > index 9217169c64cb..7d59af1f08f1 100644 > --- a/include/linux/user_namespace.h > +++ b/include/linux/user_namespace.h > @@ -39,6 +39,10 @@ struct user_namespace { > struct key *persistent_keyring_register; > struct rw_semaphore persistent_keyring_register_sem; > #endif > +#ifdef CONFIG_SYSCTL > + struct ctl_table_set set; > + struct ctl_table_header *sysctls; > +#endif > }; > > extern struct user_namespace init_user_ns; > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c > index 68f594212759..b5b5cbb8fe92 100644 > --- a/kernel/user_namespace.c > +++ b/kernel/user_namespace.c > @@ -30,6 +30,70 @@ static bool new_idmap_permitted(const struct file *file, > struct user_namespace *ns, int cap_setid, > struct uid_gid_map *map); > > +#ifdef CONFIG_SYSCTL > +static struct ctl_table_set * > +set_lookup(struct ctl_table_root *root) > +{ > + return ¤t->cred->user_ns->set; > +} > + > +static int set_is_seen(struct ctl_table_set *set) > +{ > + return ¤t->cred->user_ns->set == set; > +} > + > +static int set_permissions(struct ctl_table_header *head, > + struct ctl_table *table) > +{ > + struct user_namespace *user_ns = > + container_of(head->set, struct user_namespace, set); > + int mode; > + > + /* Allow users with CAP_SYS_ADMIN unrestrainted access */ nit 1 - s/unrestrainted/unrestrained/ nit 2 - you say CAP_SYS_ADMIN but check for CAP_SYS_RESOURCE > + if (ns_capable(user_ns, CAP_SYS_RESOURCE)) > + mode = (table->mode & S_IRWXU) >> 6; > + else > + /* Allow all others at most read-only access */ > + mode = table->mode & S_IROTH; > + return (mode << 6) | (mode << 3) | mode; > +} > + > +static struct ctl_table_root set_root = { > + .lookup = set_lookup, > + .permissions = set_permissions, > +}; > + > +static struct ctl_table userns_table[] = { > + { } > +}; > +#endif /* CONFIG_SYSCTL */ > + > +static bool setup_userns_sysctls(struct user_namespace *ns) > +{ > +#ifdef CONFIG_SYSCTL > + struct ctl_table *tbl; > + setup_sysctl_set(&ns->set, &set_root, set_is_seen); > + tbl = kmemdup(userns_table, sizeof(userns_table), GFP_KERNEL); > + if (tbl) { > + ns->sysctls = __register_sysctl_table(&ns->set, "userns", tbl); > + } What happens if tbl is null due to oom? Would it be better to just return false in that case here? > + if (!ns->sysctls) { > + kfree(tbl); > + retire_sysctl_set(&ns->set); > + return false; > + } > +#endif > + return true; > +} > + > +static void retire_userns_sysctls(struct user_namespace *ns) > +{ > +#ifdef CONFIG_SYSCTL > + unregister_sysctl_table(ns->sysctls); > + retire_sysctl_set(&ns->set); > +#endif > +} > + > static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) > { > /* Start with the same capabilities as init but useless for doing > @@ -107,12 +171,22 @@ int create_user_ns(struct cred *new) > ns->flags = parent_ns->flags; > mutex_unlock(&userns_state_mutex); > > - set_cred_user_ns(new, ns); > - > #ifdef CONFIG_PERSISTENT_KEYRINGS > init_rwsem(&ns->persistent_keyring_register_sem); > #endif > + ret = -ENOMEM; > + if (!setup_userns_sysctls(ns)) > + goto fail_keyring; > + > + set_cred_user_ns(new, ns); > return 0; > +fail_keyring: > +#ifdef CONFIG_PERSISTENT_KEYRINGS > + key_put(ns->persistent_keyring_register); > +#endif > + ns_free_inum(&ns->ns); > + kmem_cache_free(user_ns_cachep, ns); > + return ret; > } > > int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) > @@ -141,6 +215,7 @@ void free_user_ns(struct user_namespace *ns) > > do { > parent = ns->parent; > + retire_userns_sysctls(ns); > #ifdef CONFIG_PERSISTENT_KEYRINGS > key_put(ns->persistent_keyring_register); > #endif > @@ -1012,9 +1087,22 @@ const struct proc_ns_operations userns_operations = { > .install = userns_install, > }; > > +static struct ctl_table_header *userns_header; > static __init int user_namespaces_init(void) > { > + static struct ctl_table empty[1]; > user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); > + > +#ifdef CONFIG_SYSCTL > + /* > + * Avoid limits in the sysctl limitation by registering > + * "/proc/sys/userns" as an empty directory not in a network > + * namespace. > + */ > + userns_header = register_sysctl("userns", empty); > + BUG_ON(!userns_header); > +#endif > + BUG_ON(!setup_userns_sysctls(&init_user_ns)); > return 0; > } > subsys_initcall(user_namespaces_init); > -- > 2.8.3 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers