On Thu, Jan 03, 2019 at 11:44:16PM +0000, David Howells wrote: > Fix cgroup refcounting by the following means: > > (1) Don't use PERCPU_REF_INIT_DEAD and percpu_ref_reinit(). Using this > causes a problem should kernfs_get_tree() create a superblock and then > fail to create a root inode. The superblock destructor will be invoked > before kernfs_get_tree() returns - and the refcount isn't reinit'd till > after that. > > To this end, cgroup_setup_root() no longer needs a ref_flags argument. > > (2) Provide a flag, CSS_CREATING, that is used to prevent concurrent access > to a cgroup root that is being set up and for which the superblock is > still being set up. This appears to be necessary to hide the fact that > the cgroup is accessible before the superblock is fully initialised. > > (3) Set CSS_CREATING in cgroup1_get_tree() on a new cgroup_root. This is > then cleared in cgroup_do_get_tree(). > > (4) cgroup_get_tree() is made to call cgroup_get() on the root it sets for a > v2 cgroup. Admittedly, this doesn't do anything because CSS_NO_REF is > set, but it future proofs it in case this is changed in future. > > (5) cgroup_fill_super() is created and passed to kernfs_get_tree() in the > kernfs_fs_context struct. This takes an extra ref on the root for the > superblock in the event that a superblock is created. > > struct cgroup_fs_context::root then holds a single ref on the root right > through till it is freed. > > Note that new_root is transferred into the cgroup_fs_context as > is_new_root, though this is probably unnecessary as it's only used to clear > CSS_CREATING - and no one else can gain access to the root until we've > cleared the flag. > I haven't read the patch yet, but all my tests passed without any visible problems with this series. Thanks! https://travis-ci.org/avagin/linux/builds/475101517 Tested-by: Andrei Vagin <avagin@xxxxxxxxx> Reported-by: Andrei Vagin <avagin@xxxxxxxxx> > Fixes: b3678086951a ("kernfs, sysfs, cgroup, intel_rdt: Support fs_context") > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > --- > > include/linux/cgroup-defs.h | 1 + > include/linux/cgroup.h | 4 ++++ > kernel/cgroup/cgroup-internal.h | 3 ++- > kernel/cgroup/cgroup-v1.c | 25 +++++++++---------------- > kernel/cgroup/cgroup.c | 24 +++++++++++++++++++++--- > 5 files changed, 37 insertions(+), 20 deletions(-) > > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 5e1694fe035b..f5da2396a809 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -52,6 +52,7 @@ enum { > CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ > CSS_VISIBLE = (1 << 3), /* css is visible to userland */ > CSS_DYING = (1 << 4), /* css is dying */ > + CSS_CREATING = (1 << 5), /* Root css is being constructed */ > }; > > /* bits in struct cgroup flags field */ > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index bb0c7da50ed2..5708ad663572 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -333,6 +333,8 @@ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) > */ > static inline bool css_tryget(struct cgroup_subsys_state *css) > { > + if (css->flags & CSS_CREATING) > + return false; > if (!(css->flags & CSS_NO_REF)) > return percpu_ref_tryget(&css->refcnt); > return true; > @@ -350,6 +352,8 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) > */ > static inline bool css_tryget_online(struct cgroup_subsys_state *css) > { > + if (css->flags & CSS_CREATING) > + return false; > if (!(css->flags & CSS_NO_REF)) > return percpu_ref_tryget_live(&css->refcnt); > return true; > diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h > index ff86943ea1c8..b22c3d95d8eb 100644 > --- a/kernel/cgroup/cgroup-internal.h > +++ b/kernel/cgroup/cgroup-internal.h > @@ -46,6 +46,7 @@ struct cgroup_fs_context { > unsigned int flags; /* CGRP_ROOT_* flags */ > > /* cgroup1 bits */ > + bool is_new_root; /* ->root is new and needs refcnt init */ > bool cpuset_clone_children; > bool none; /* User explicitly requested empty subsystem */ > bool all_ss; /* Seen 'all' option */ > @@ -214,7 +215,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, > > void cgroup_free_root(struct cgroup_root *root); > void init_cgroup_root(struct cgroup_fs_context *ctx); > -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); > +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); > int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); > int cgroup_do_get_tree(struct fs_context *fc); > > diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c > index 4b189e821cad..0fbbde86a64d 100644 > --- a/kernel/cgroup/cgroup-v1.c > +++ b/kernel/cgroup/cgroup-v1.c > @@ -1156,7 +1156,6 @@ int cgroup1_get_tree(struct fs_context *fc) > struct cgroup_root *root; > struct cgroup_subsys *ss; > int i, ret = cgroup1_validate(fc); > - bool new_root = false; > > if (ret) > return ret; > @@ -1261,12 +1260,19 @@ int cgroup1_get_tree(struct fs_context *fc) > ret = -ENOMEM; > goto err_unlock; > } > - new_root = true; > ctx->root = root; > + ctx->is_new_root = true; > > init_cgroup_root(ctx); > > - ret = cgroup_setup_root(root, ctx->subsys_mask, PERCPU_REF_INIT_DEAD); > + /* > + * There's a race window after we release cgroup_mutex and before > + * allocating a superblock. Make sure a concurrent process won't be > + * able to re-use the root during this window by setting CSS_CREATING. > + */ > + root->cgrp.self.flags |= CSS_CREATING; > + > + ret = cgroup_setup_root(root, ctx->subsys_mask); > if (ret) > goto err_unlock; > > @@ -1275,19 +1281,6 @@ int cgroup1_get_tree(struct fs_context *fc) > > ret = cgroup_do_get_tree(fc); > > - /* > - * There's a race window after we release cgroup_mutex and before > - * allocating a superblock. Make sure a concurrent process won't > - * be able to re-use the root during this window by delaying the > - * initialization of root refcnt. > - */ > - if (new_root) { > - mutex_lock(&cgroup_mutex); > - percpu_ref_reinit(&root->cgrp.self.refcnt); > - mutex_unlock(&cgroup_mutex); > - cgroup_get(&root->cgrp); > - } > - > /* > * If @pinned_sb, we're reusing an existing root and holding an > * extra ref on its sb. Mount is complete. Put the extra ref. > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 2e5150412ae0..091e7eca3661 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -1909,7 +1909,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) > set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); > } > > -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) > +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) > { > LIST_HEAD(tmp_links); > struct cgroup *root_cgrp = &root->cgrp; > @@ -1926,7 +1926,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) > root_cgrp->ancestor_ids[0] = ret; > > ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, > - ref_flags, GFP_KERNEL); > + 0, GFP_KERNEL); > if (ret) > goto out; > > @@ -2010,12 +2010,23 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) > return ret; > } > > +static int cgroup_fill_super(struct super_block *sb, struct fs_context *fc) > +{ > + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); > + > + mutex_lock(&cgroup_mutex); > + cgroup_get(&ctx->root->cgrp); > + mutex_unlock(&cgroup_mutex); > + return 0; > +} > + > int cgroup_do_get_tree(struct fs_context *fc) > { > struct cgroup_fs_context *ctx = cgroup_fc2context(fc); > int ret; > > ctx->kfc.root = ctx->root->kf_root; > + ctx->kfc.fill_super = cgroup_fill_super; > > ret = kernfs_get_tree(fc); > if (ret < 0) > @@ -2046,6 +2057,12 @@ int cgroup_do_get_tree(struct fs_context *fc) > > if (ctx->version == 2) > apply_cgroup_root_flags(ctx->flags); > + > + if (ctx->is_new_root) { > + mutex_lock(&cgroup_mutex); > + ctx->root->cgrp.self.flags &= ~CSS_CREATING; > + mutex_unlock(&cgroup_mutex); > + } > ret = 0; > out_cgrp: > return ret; > @@ -2071,6 +2088,7 @@ static int cgroup_get_tree(struct fs_context *fc) > cgroup_get_live(&cgrp_dfl_root.cgrp); > > ctx->root = &cgrp_dfl_root; > + cgroup_get(&ctx->root->cgrp); > return cgroup_do_get_tree(fc); > > default: > @@ -5420,7 +5438,7 @@ int __init cgroup_init(void) > hash_add(css_set_table, &init_css_set.hlist, > css_set_hash(init_css_set.subsys)); > > - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); > + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); > > mutex_unlock(&cgroup_mutex); > >