Fix cgroup refcounting by the following means: (1) Don't use PERCPU_REF_INIT_DEAD and percpu_ref_reinit(). Using this causes a problem should kernfs_get_tree() create a superblock and then fail to create a root inode. The superblock destructor will be invoked before kernfs_get_tree() returns - and the refcount isn't reinit'd till after that. To this end, cgroup_setup_root() no longer needs a ref_flags argument. (2) Provide a flag, CSS_CREATING, that is used to prevent concurrent access to a cgroup root that is being set up and for which the superblock is still being set up. This appears to be necessary to hide the fact that the cgroup is accessible before the superblock is fully initialised. (3) Set CSS_CREATING in cgroup1_get_tree() on a new cgroup_root. This is then cleared in cgroup_do_get_tree(). (4) cgroup_get_tree() is made to call cgroup_get() on the root it sets for a v2 cgroup. Admittedly, this doesn't do anything because CSS_NO_REF is set, but it future proofs it in case this is changed in future. (5) cgroup_fill_super() is created and passed to kernfs_get_tree() in the kernfs_fs_context struct. This takes an extra ref on the root for the superblock in the event that a superblock is created. struct cgroup_fs_context::root then holds a single ref on the root right through till it is freed. Note that new_root is transferred into the cgroup_fs_context as is_new_root, though this is probably unnecessary as it's only used to clear CSS_CREATING - and no one else can gain access to the root until we've cleared the flag. Fixes: b3678086951a ("kernfs, sysfs, cgroup, intel_rdt: Support fs_context") Signed-off-by: David Howells <dhowells@xxxxxxxxxx> --- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 4 ++++ kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup-v1.c | 25 +++++++++---------------- kernel/cgroup/cgroup.c | 24 +++++++++++++++++++++--- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5e1694fe035b..f5da2396a809 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -52,6 +52,7 @@ enum { CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ CSS_DYING = (1 << 4), /* css is dying */ + CSS_CREATING = (1 << 5), /* Root css is being constructed */ }; /* bits in struct cgroup flags field */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb0c7da50ed2..5708ad663572 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -333,6 +333,8 @@ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) */ static inline bool css_tryget(struct cgroup_subsys_state *css) { + if (css->flags & CSS_CREATING) + return false; if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; @@ -350,6 +352,8 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { + if (css->flags & CSS_CREATING) + return false; if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index ff86943ea1c8..b22c3d95d8eb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -46,6 +46,7 @@ struct cgroup_fs_context { unsigned int flags; /* CGRP_ROOT_* flags */ /* cgroup1 bits */ + bool is_new_root; /* ->root is new and needs refcnt init */ bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ @@ -214,7 +215,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 4b189e821cad..0fbbde86a64d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1156,7 +1156,6 @@ int cgroup1_get_tree(struct fs_context *fc) struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret = cgroup1_validate(fc); - bool new_root = false; if (ret) return ret; @@ -1261,12 +1260,19 @@ int cgroup1_get_tree(struct fs_context *fc) ret = -ENOMEM; goto err_unlock; } - new_root = true; ctx->root = root; + ctx->is_new_root = true; init_cgroup_root(ctx); - ret = cgroup_setup_root(root, ctx->subsys_mask, PERCPU_REF_INIT_DEAD); + /* + * There's a race window after we release cgroup_mutex and before + * allocating a superblock. Make sure a concurrent process won't be + * able to re-use the root during this window by setting CSS_CREATING. + */ + root->cgrp.self.flags |= CSS_CREATING; + + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) goto err_unlock; @@ -1275,19 +1281,6 @@ int cgroup1_get_tree(struct fs_context *fc) ret = cgroup_do_get_tree(fc); - /* - * There's a race window after we release cgroup_mutex and before - * allocating a superblock. Make sure a concurrent process won't - * be able to re-use the root during this window by delaying the - * initialization of root refcnt. - */ - if (new_root) { - mutex_lock(&cgroup_mutex); - percpu_ref_reinit(&root->cgrp.self.refcnt); - mutex_unlock(&cgroup_mutex); - cgroup_get(&root->cgrp); - } - /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2e5150412ae0..091e7eca3661 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1909,7 +1909,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1926,7 +1926,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - ref_flags, GFP_KERNEL); + 0, GFP_KERNEL); if (ret) goto out; @@ -2010,12 +2010,23 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) return ret; } +static int cgroup_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + + mutex_lock(&cgroup_mutex); + cgroup_get(&ctx->root->cgrp); + mutex_unlock(&cgroup_mutex); + return 0; +} + int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; + ctx->kfc.fill_super = cgroup_fill_super; ret = kernfs_get_tree(fc); if (ret < 0) @@ -2046,6 +2057,12 @@ int cgroup_do_get_tree(struct fs_context *fc) if (ctx->version == 2) apply_cgroup_root_flags(ctx->flags); + + if (ctx->is_new_root) { + mutex_lock(&cgroup_mutex); + ctx->root->cgrp.self.flags &= ~CSS_CREATING; + mutex_unlock(&cgroup_mutex); + } ret = 0; out_cgrp: return ret; @@ -2071,6 +2088,7 @@ static int cgroup_get_tree(struct fs_context *fc) cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; + cgroup_get(&ctx->root->cgrp); return cgroup_do_get_tree(fc); default: @@ -5420,7 +5438,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex);