On 2014/12/5 9:55, Aditya Kali wrote: > Introduce the ability to create new cgroup namespace. The newly created > cgroup namespace remembers the cgroup of the process at the point > of creation of the cgroup namespace (referred as cgroupns-root). > The main purpose of cgroup namespace is to virtualize the contents > of /proc/self/cgroup file. Processes inside a cgroup namespace > are only able to see paths relative to their namespace root > (unless they are moved outside of their cgroupns-root, at which point > they will see a relative path from their cgroupns-root). > For a correctly setup container this enables container-tools > (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized > containers without leaking system level cgroup hierarchy to the task. > This patch only implements the 'unshare' part of the cgroupns. > > Signed-off-by: Aditya Kali <adityakali@xxxxxxxxxx> > --- > fs/proc/namespaces.c | 1 + > include/linux/cgroup.h | 29 ++++++++- > include/linux/cgroup_namespace.h | 36 +++++++++++ > include/linux/nsproxy.h | 2 + > include/linux/proc_ns.h | 4 ++ > kernel/Makefile | 2 +- > kernel/cgroup.c | 13 ++++ > kernel/cgroup_namespace.c | 127 +++++++++++++++++++++++++++++++++++++++ > kernel/fork.c | 2 +- > kernel/nsproxy.c | 19 +++++- > 10 files changed, 230 insertions(+), 5 deletions(-) > > diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c > index 8902609..55bc5da 100644 > --- a/fs/proc/namespaces.c > +++ b/fs/proc/namespaces.c > @@ -32,6 +32,7 @@ static const struct proc_ns_operations *ns_entries[] = { > &userns_operations, > #endif > &mntns_operations, > + &cgroupns_operations, Should be guarded with CONFIG_CGROUPS ? There are other changes that break compile if !CONFIG_CGROUPS. > }; > > static const struct file_operations ns_file_operations = { > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index 6e7533b..94a5a0c 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -22,6 +22,8 @@ > #include <linux/seq_file.h> > #include <linux/kernfs.h> > #include <linux/wait.h> > +#include <linux/nsproxy.h> > +#include <linux/types.h> > > #ifdef CONFIG_CGROUPS > > @@ -460,6 +462,13 @@ struct cftype { > #endif > }; > > +struct cgroup_namespace { > + atomic_t count; > + unsigned int proc_inum; > + struct user_namespace *user_ns; > + struct cgroup *root_cgrp; > +}; > + > extern struct cgroup_root cgrp_dfl_root; > extern struct css_set init_css_set; > > @@ -584,10 +593,28 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) > return kernfs_name(cgrp->kn, buf, buflen); > } > > +static inline char * __must_check cgroup_path_ns(struct cgroup_namespace *ns, > + struct cgroup *cgrp, char *buf, > + size_t buflen) > +{ > + if (ns) { > + BUG_ON(!cgroup_on_dfl(cgrp)); > + return kernfs_path_from_node(ns->root_cgrp->kn, cgrp->kn, buf, > + buflen); > + } else { > + return kernfs_path(cgrp->kn, buf, buflen); > + } > +} > + > static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, > size_t buflen) > { > - return kernfs_path(cgrp->kn, buf, buflen); > + if (cgroup_on_dfl(cgrp)) { > + return cgroup_path_ns(current->nsproxy->cgroup_ns, cgrp, buf, > + buflen); > + } else { > + return cgroup_path_ns(NULL, cgrp, buf, buflen); > + } > } > > static inline void pr_cont_cgroup_name(struct cgroup *cgrp) > diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h > new file mode 100644 > index 0000000..0b97b8d > --- /dev/null > +++ b/include/linux/cgroup_namespace.h > @@ -0,0 +1,36 @@ > +#ifndef _LINUX_CGROUP_NAMESPACE_H > +#define _LINUX_CGROUP_NAMESPACE_H > + > +#include <linux/nsproxy.h> > +#include <linux/cgroup.h> > +#include <linux/types.h> > +#include <linux/user_namespace.h> > + > +extern struct cgroup_namespace init_cgroup_ns; > + > +static inline struct cgroup *current_cgroupns_root(void) > +{ > + return current->nsproxy->cgroup_ns->root_cgrp; > +} > + > +extern void free_cgroup_ns(struct cgroup_namespace *ns); > + > +static inline struct cgroup_namespace *get_cgroup_ns( > + struct cgroup_namespace *ns) > +{ > + if (ns) > + atomic_inc(&ns->count); > + return ns; > +} > + > +static inline void put_cgroup_ns(struct cgroup_namespace *ns) > +{ > + if (ns && atomic_dec_and_test(&ns->count)) > + free_cgroup_ns(ns); > +} > + > +extern struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, > + struct user_namespace *user_ns, > + struct cgroup_namespace *old_ns); > + > +#endif /* _LINUX_CGROUP_NAMESPACE_H */ > diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h > index 35fa08f..ac0d65b 100644 > --- a/include/linux/nsproxy.h > +++ b/include/linux/nsproxy.h > @@ -8,6 +8,7 @@ struct mnt_namespace; > struct uts_namespace; > struct ipc_namespace; > struct pid_namespace; > +struct cgroup_namespace; > struct fs_struct; > > /* > @@ -33,6 +34,7 @@ struct nsproxy { > struct mnt_namespace *mnt_ns; > struct pid_namespace *pid_ns_for_children; > struct net *net_ns; > + struct cgroup_namespace *cgroup_ns; > }; > extern struct nsproxy init_nsproxy; > > diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h > index 34a1e10..e56dd73 100644 > --- a/include/linux/proc_ns.h > +++ b/include/linux/proc_ns.h > @@ -6,6 +6,8 @@ > > struct pid_namespace; > struct nsproxy; > +struct task_struct; > +struct inode; These two lines seems unnecessary. > > struct proc_ns_operations { > const char *name; > @@ -27,6 +29,7 @@ extern const struct proc_ns_operations ipcns_operations; > extern const struct proc_ns_operations pidns_operations; > extern const struct proc_ns_operations userns_operations; > extern const struct proc_ns_operations mntns_operations; > +extern const struct proc_ns_operations cgroupns_operations; > > /* > * We always define these enumerators > @@ -37,6 +40,7 @@ enum { > PROC_UTS_INIT_INO = 0xEFFFFFFEU, > PROC_USER_INIT_INO = 0xEFFFFFFDU, > PROC_PID_INIT_INO = 0xEFFFFFFCU, > + PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, > }; > > #ifdef CONFIG_PROC_FS > diff --git a/kernel/Makefile b/kernel/Makefile > index dc5c775..d9731e2 100644 > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -50,7 +50,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o > obj-$(CONFIG_KEXEC) += kexec.o > obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o > obj-$(CONFIG_COMPAT) += compat.o > -obj-$(CONFIG_CGROUPS) += cgroup.o > +obj-$(CONFIG_CGROUPS) += cgroup.o cgroup_namespace.o > obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o > obj-$(CONFIG_CPUSETS) += cpuset.o > obj-$(CONFIG_UTS_NS) += utsname.o > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index e12d36e..b1ae6d9 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -57,6 +57,8 @@ > #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ > #include <linux/kthread.h> > #include <linux/delay.h> > +#include <linux/proc_ns.h> > +#include <linux/cgroup_namespace.h> > > #include <linux/atomic.h> > > @@ -195,6 +197,15 @@ static void kill_css(struct cgroup_subsys_state *css); > static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], > bool is_add); > > +struct cgroup_namespace init_cgroup_ns = { > + .count = { > + .counter = 1, > + }, .count = ATOMIC_INIT(1) > + .proc_inum = PROC_CGROUP_INIT_INO, > + .user_ns = &init_user_ns, > + .root_cgrp = &cgrp_dfl_root.cgrp, > +}; > + > /* IDR wrappers which synchronize using cgroup_idr_lock */ > static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, > gfp_t gfp_mask) > @@ -4989,6 +5000,8 @@ int __init cgroup_init(void) > unsigned long key; > int ssid, err; > > + get_user_ns(init_cgroup_ns.user_ns); > + > BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); > BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); > > diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c > new file mode 100644 > index 0000000..0e0ef3a > --- /dev/null > +++ b/kernel/cgroup_namespace.c > @@ -0,0 +1,127 @@ > +/* > + * Copyright (C) 2014 Google Inc. > + * > + * Author: Aditya Kali (adityakali@xxxxxxxxxx) > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the Free > + * Software Foundation, version 2 of the License. > + */ > + > +#include <linux/cgroup.h> > +#include <linux/cgroup_namespace.h> > +#include <linux/sched.h> > +#include <linux/slab.h> > +#include <linux/nsproxy.h> > +#include <linux/proc_ns.h> > + > +static struct cgroup_namespace *alloc_cgroup_ns(void) > +{ > + struct cgroup_namespace *new_ns; > + > + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); > + if (new_ns) > + atomic_set(&new_ns->count, 1); > + return new_ns; > +} Better fold this function into copy_cgroup_ns(). > + > +void free_cgroup_ns(struct cgroup_namespace *ns) > +{ > + cgroup_put(ns->root_cgrp); > + put_user_ns(ns->user_ns); > + proc_free_inum(ns->proc_inum); > + kfree(ns); > +} > +EXPORT_SYMBOL(free_cgroup_ns); This should be a static inline function. > + > +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, > + struct user_namespace *user_ns, > + struct cgroup_namespace *old_ns) > +{ > + struct cgroup_namespace *new_ns = NULL; > + struct cgroup *cgrp = NULL; > + int err; > + > + BUG_ON(!old_ns); > + > + if (!(flags & CLONE_NEWCGROUP)) > + return get_cgroup_ns(old_ns); > + > + /* Allow only sysadmin to create cgroup namespace. */ > + err = -EPERM; > + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) > + goto err_out; > + > + /* CGROUPNS only virtualizes the cgroup path on the unified hierarchy. > + */ The comment style should be /* * ... */ > + cgrp = get_task_cgroup(current); > + > + err = -ENOMEM; > + new_ns = alloc_cgroup_ns(); > + if (!new_ns) > + goto err_out; > + > + err = proc_alloc_inum(&new_ns->proc_inum); > + if (err) > + goto err_out; > + > + new_ns->user_ns = get_user_ns(user_ns); > + new_ns->root_cgrp = cgrp; > + > + return new_ns; > + > +err_out: > + if (cgrp) > + cgroup_put(cgrp); > + kfree(new_ns); > + return ERR_PTR(err); > +} > + > +static int cgroupns_install(struct nsproxy *nsproxy, void *ns) > +{ > + pr_info("setns not supported for cgroup namespace"); > + return -EINVAL; > +} > + > +static void *cgroupns_get(struct task_struct *task) > +{ > + struct cgroup_namespace *ns = NULL; > + struct nsproxy *nsproxy; > + > + task_lock(task); > + nsproxy = task->nsproxy; > + if (nsproxy) { > + ns = nsproxy->cgroup_ns; > + get_cgroup_ns(ns); > + } > + task_unlock(task); > + > + return ns; > +} > + > +static void cgroupns_put(void *ns) > +{ > + put_cgroup_ns(ns); > +} > + > +static unsigned int cgroupns_inum(void *ns) > +{ > + struct cgroup_namespace *cgroup_ns = ns; > + > + return cgroup_ns->proc_inum; > +} > + > +const struct proc_ns_operations cgroupns_operations = { > + .name = "cgroup", > + .type = CLONE_NEWCGROUP, > + .get = cgroupns_get, > + .put = cgroupns_put, > + .install = cgroupns_install, > + .inum = cgroupns_inum, > +}; > + > +static __init int cgroup_namespaces_init(void) > +{ > + return 0; > +} > +subsys_initcall(cgroup_namespaces_init); Why provide this unused init function? > diff --git a/kernel/fork.c b/kernel/fork.c > index 9b7d746..d22d793 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1797,7 +1797,7 @@ static int check_unshare_flags(unsigned long unshare_flags) > if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| > CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| > CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| > - CLONE_NEWUSER|CLONE_NEWPID)) > + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) > return -EINVAL; > /* > * Not implemented, but pretend it works if there is nothing to > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c > index ef42d0a..a8b1970 100644 > --- a/kernel/nsproxy.c > +++ b/kernel/nsproxy.c > @@ -25,6 +25,7 @@ > #include <linux/proc_ns.h> > #include <linux/file.h> > #include <linux/syscalls.h> > +#include <linux/cgroup_namespace.h> > > static struct kmem_cache *nsproxy_cachep; > > @@ -39,6 +40,7 @@ struct nsproxy init_nsproxy = { > #ifdef CONFIG_NET > .net_ns = &init_net, > #endif > + .cgroup_ns = &init_cgroup_ns, > }; > > static inline struct nsproxy *create_nsproxy(void) > @@ -92,6 +94,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, > goto out_pid; > } > > + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, > + tsk->nsproxy->cgroup_ns); > + if (IS_ERR(new_nsp->cgroup_ns)) { > + err = PTR_ERR(new_nsp->cgroup_ns); > + goto out_cgroup; > + } > + > new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); > if (IS_ERR(new_nsp->net_ns)) { > err = PTR_ERR(new_nsp->net_ns); > @@ -101,6 +110,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, > return new_nsp; > > out_net: > + if (new_nsp->cgroup_ns) > + put_cgroup_ns(new_nsp->cgroup_ns); > +out_cgroup: > if (new_nsp->pid_ns_for_children) > put_pid_ns(new_nsp->pid_ns_for_children); > out_pid: > @@ -128,7 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) > struct nsproxy *new_ns; > > if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | > - CLONE_NEWPID | CLONE_NEWNET)))) { > + CLONE_NEWPID | CLONE_NEWNET | > + CLONE_NEWCGROUP)))) { > get_nsproxy(old_ns); > return 0; > } > @@ -165,6 +178,8 @@ void free_nsproxy(struct nsproxy *ns) > put_ipc_ns(ns->ipc_ns); > if (ns->pid_ns_for_children) > put_pid_ns(ns->pid_ns_for_children); > + if (ns->cgroup_ns) > + put_cgroup_ns(ns->cgroup_ns); > put_net(ns->net_ns); > kmem_cache_free(nsproxy_cachep, ns); > } > @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, > int err = 0; > > if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | > - CLONE_NEWNET | CLONE_NEWPID))) > + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) > return 0; > > user_ns = new_cred ? new_cred->user_ns : current_user_ns(); > -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html