[PATCH 3/5] cgroup: Fix refcounting

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Fix cgroup refcounting by the following means:

 (1) Don't use PERCPU_REF_INIT_DEAD and percpu_ref_reinit().  Using this
     causes a problem should kernfs_get_tree() create a superblock and then
     fail to create a root inode.  The superblock destructor will be invoked
     before kernfs_get_tree() returns - and the refcount isn't reinit'd till
     after that.

     To this end, cgroup_setup_root() no longer needs a ref_flags argument.

 (2) Provide a flag, CSS_CREATING, that is used to prevent concurrent access
     to a cgroup root that is being set up and for which the superblock is
     still being set up.  This appears to be necessary to hide the fact that
     the cgroup is accessible before the superblock is fully initialised.

 (3) Set CSS_CREATING in cgroup1_get_tree() on a new cgroup_root.  This is
     then cleared in cgroup_do_get_tree().

 (4) cgroup_get_tree() is made to call cgroup_get() on the root it sets for a
     v2 cgroup.  Admittedly, this doesn't do anything because CSS_NO_REF is
     set, but it future proofs it in case this is changed in future.

 (5) cgroup_fill_super() is created and passed to kernfs_get_tree() in the
     kernfs_fs_context struct.  This takes an extra ref on the root for the
     superblock in the event that a superblock is created.

     struct cgroup_fs_context::root then holds a single ref on the root right
     through till it is freed.

Note that new_root is transferred into the cgroup_fs_context as
is_new_root, though this is probably unnecessary as it's only used to clear
CSS_CREATING - and no one else can gain access to the root until we've
cleared the flag.

Fixes: b3678086951a ("kernfs, sysfs, cgroup, intel_rdt: Support fs_context")
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---

 include/linux/cgroup-defs.h     |    1 +
 include/linux/cgroup.h          |    4 ++++
 kernel/cgroup/cgroup-internal.h |    3 ++-
 kernel/cgroup/cgroup-v1.c       |   25 +++++++++----------------
 kernel/cgroup/cgroup.c          |   24 +++++++++++++++++++++---
 5 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5e1694fe035b..f5da2396a809 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -52,6 +52,7 @@ enum {
 	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
 	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
 	CSS_DYING	= (1 << 4), /* css is dying */
+	CSS_CREATING	= (1 << 5), /* Root css is being constructed */
 };
 
 /* bits in struct cgroup flags field */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb0c7da50ed2..5708ad663572 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -333,6 +333,8 @@ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
  */
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
+	if (css->flags & CSS_CREATING)
+		return false;
 	if (!(css->flags & CSS_NO_REF))
 		return percpu_ref_tryget(&css->refcnt);
 	return true;
@@ -350,6 +352,8 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  */
 static inline bool css_tryget_online(struct cgroup_subsys_state *css)
 {
+	if (css->flags & CSS_CREATING)
+		return false;
 	if (!(css->flags & CSS_NO_REF))
 		return percpu_ref_tryget_live(&css->refcnt);
 	return true;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index ff86943ea1c8..b22c3d95d8eb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -46,6 +46,7 @@ struct cgroup_fs_context {
 	unsigned int	flags;			/* CGRP_ROOT_* flags */
 
 	/* cgroup1 bits */
+	bool		is_new_root;		/* ->root is new and needs refcnt init */
 	bool		cpuset_clone_children;
 	bool		none;			/* User explicitly requested empty subsystem */
 	bool		all_ss;			/* Seen 'all' option */
@@ -214,7 +215,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
 
 void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_fs_context *ctx);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
 int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 int cgroup_do_get_tree(struct fs_context *fc);
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 4b189e821cad..0fbbde86a64d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1156,7 +1156,6 @@ int cgroup1_get_tree(struct fs_context *fc)
 	struct cgroup_root *root;
 	struct cgroup_subsys *ss;
 	int i, ret = cgroup1_validate(fc);
-	bool new_root = false;
 
 	if (ret)
 		return ret;
@@ -1261,12 +1260,19 @@ int cgroup1_get_tree(struct fs_context *fc)
 		ret = -ENOMEM;
 		goto err_unlock;
 	}
-	new_root = true;
 	ctx->root = root;
+	ctx->is_new_root = true;
 
 	init_cgroup_root(ctx);
 
-	ret = cgroup_setup_root(root, ctx->subsys_mask, PERCPU_REF_INIT_DEAD);
+	/*
+	 * There's a race window after we release cgroup_mutex and before
+	 * allocating a superblock. Make sure a concurrent process won't be
+	 * able to re-use the root during this window by setting CSS_CREATING.
+	 */
+	root->cgrp.self.flags |= CSS_CREATING;
+
+	ret = cgroup_setup_root(root, ctx->subsys_mask);
 	if (ret)
 		goto err_unlock;
 
@@ -1275,19 +1281,6 @@ int cgroup1_get_tree(struct fs_context *fc)
 
 	ret = cgroup_do_get_tree(fc);
 
-	/*
-	 * There's a race window after we release cgroup_mutex and before
-	 * allocating a superblock. Make sure a concurrent process won't
-	 * be able to re-use the root during this window by delaying the
-	 * initialization of root refcnt.
-	 */
-	if (new_root) {
-		mutex_lock(&cgroup_mutex);
-		percpu_ref_reinit(&root->cgrp.self.refcnt);
-		mutex_unlock(&cgroup_mutex);
-		cgroup_get(&root->cgrp);
-	}
-
 	/*
 	 * If @pinned_sb, we're reusing an existing root and holding an
 	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2e5150412ae0..091e7eca3661 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1909,7 +1909,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
@@ -1926,7 +1926,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	root_cgrp->ancestor_ids[0] = ret;
 
 	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
-			      ref_flags, GFP_KERNEL);
+			      0, GFP_KERNEL);
 	if (ret)
 		goto out;
 
@@ -2010,12 +2010,23 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	return ret;
 }
 
+static int cgroup_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+	mutex_lock(&cgroup_mutex);
+	cgroup_get(&ctx->root->cgrp);
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
 int cgroup_do_get_tree(struct fs_context *fc)
 {
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
 	int ret;
 
 	ctx->kfc.root = ctx->root->kf_root;
+	ctx->kfc.fill_super = cgroup_fill_super;
 
 	ret = kernfs_get_tree(fc);
 	if (ret < 0)
@@ -2046,6 +2057,12 @@ int cgroup_do_get_tree(struct fs_context *fc)
 
 	if (ctx->version == 2)
 		apply_cgroup_root_flags(ctx->flags);
+
+	if (ctx->is_new_root) {
+		mutex_lock(&cgroup_mutex);
+		ctx->root->cgrp.self.flags &= ~CSS_CREATING;
+		mutex_unlock(&cgroup_mutex);
+	}
 	ret = 0;
 out_cgrp:
 	return ret;
@@ -2071,6 +2088,7 @@ static int cgroup_get_tree(struct fs_context *fc)
 		cgroup_get_live(&cgrp_dfl_root.cgrp);
 
 		ctx->root = &cgrp_dfl_root;
+		cgroup_get(&ctx->root->cgrp);
 		return cgroup_do_get_tree(fc);
 
 	default:
@@ -5420,7 +5438,7 @@ int __init cgroup_init(void)
 	hash_add(css_set_table, &init_css_set.hlist,
 		 css_set_hash(init_css_set.subsys));
 
-	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
 
 	mutex_unlock(&cgroup_mutex);
 




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux