Re: [PATCH 3/5] cgroup: Fix refcounting

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Jan 03, 2019 at 11:44:16PM +0000, David Howells wrote:
> Fix cgroup refcounting by the following means:
> 
>  (1) Don't use PERCPU_REF_INIT_DEAD and percpu_ref_reinit().  Using this
>      causes a problem should kernfs_get_tree() create a superblock and then
>      fail to create a root inode.  The superblock destructor will be invoked
>      before kernfs_get_tree() returns - and the refcount isn't reinit'd till
>      after that.
> 
>      To this end, cgroup_setup_root() no longer needs a ref_flags argument.
> 
>  (2) Provide a flag, CSS_CREATING, that is used to prevent concurrent access
>      to a cgroup root that is being set up and for which the superblock is
>      still being set up.  This appears to be necessary to hide the fact that
>      the cgroup is accessible before the superblock is fully initialised.
> 
>  (3) Set CSS_CREATING in cgroup1_get_tree() on a new cgroup_root.  This is
>      then cleared in cgroup_do_get_tree().
> 
>  (4) cgroup_get_tree() is made to call cgroup_get() on the root it sets for a
>      v2 cgroup.  Admittedly, this doesn't do anything because CSS_NO_REF is
>      set, but it future proofs it in case this is changed in future.
> 
>  (5) cgroup_fill_super() is created and passed to kernfs_get_tree() in the
>      kernfs_fs_context struct.  This takes an extra ref on the root for the
>      superblock in the event that a superblock is created.
> 
>      struct cgroup_fs_context::root then holds a single ref on the root right
>      through till it is freed.
> 
> Note that new_root is transferred into the cgroup_fs_context as
> is_new_root, though this is probably unnecessary as it's only used to clear
> CSS_CREATING - and no one else can gain access to the root until we've
> cleared the flag.
> 

I haven't read the patch yet, but all my tests passed without any
visible problems with this series. Thanks!

https://travis-ci.org/avagin/linux/builds/475101517

Tested-by: Andrei Vagin <avagin@xxxxxxxxx>
Reported-by: Andrei Vagin <avagin@xxxxxxxxx>

> Fixes: b3678086951a ("kernfs, sysfs, cgroup, intel_rdt: Support fs_context")
> Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
> ---
> 
>  include/linux/cgroup-defs.h     |    1 +
>  include/linux/cgroup.h          |    4 ++++
>  kernel/cgroup/cgroup-internal.h |    3 ++-
>  kernel/cgroup/cgroup-v1.c       |   25 +++++++++----------------
>  kernel/cgroup/cgroup.c          |   24 +++++++++++++++++++++---
>  5 files changed, 37 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 5e1694fe035b..f5da2396a809 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -52,6 +52,7 @@ enum {
>  	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
>  	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
>  	CSS_DYING	= (1 << 4), /* css is dying */
> +	CSS_CREATING	= (1 << 5), /* Root css is being constructed */
>  };
>  
>  /* bits in struct cgroup flags field */
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bb0c7da50ed2..5708ad663572 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -333,6 +333,8 @@ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
>   */
>  static inline bool css_tryget(struct cgroup_subsys_state *css)
>  {
> +	if (css->flags & CSS_CREATING)
> +		return false;
>  	if (!(css->flags & CSS_NO_REF))
>  		return percpu_ref_tryget(&css->refcnt);
>  	return true;
> @@ -350,6 +352,8 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
>   */
>  static inline bool css_tryget_online(struct cgroup_subsys_state *css)
>  {
> +	if (css->flags & CSS_CREATING)
> +		return false;
>  	if (!(css->flags & CSS_NO_REF))
>  		return percpu_ref_tryget_live(&css->refcnt);
>  	return true;
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index ff86943ea1c8..b22c3d95d8eb 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -46,6 +46,7 @@ struct cgroup_fs_context {
>  	unsigned int	flags;			/* CGRP_ROOT_* flags */
>  
>  	/* cgroup1 bits */
> +	bool		is_new_root;		/* ->root is new and needs refcnt init */
>  	bool		cpuset_clone_children;
>  	bool		none;			/* User explicitly requested empty subsystem */
>  	bool		all_ss;			/* Seen 'all' option */
> @@ -214,7 +215,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
>  
>  void cgroup_free_root(struct cgroup_root *root);
>  void init_cgroup_root(struct cgroup_fs_context *ctx);
> -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
> +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
>  int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
>  int cgroup_do_get_tree(struct fs_context *fc);
>  
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 4b189e821cad..0fbbde86a64d 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -1156,7 +1156,6 @@ int cgroup1_get_tree(struct fs_context *fc)
>  	struct cgroup_root *root;
>  	struct cgroup_subsys *ss;
>  	int i, ret = cgroup1_validate(fc);
> -	bool new_root = false;
>  
>  	if (ret)
>  		return ret;
> @@ -1261,12 +1260,19 @@ int cgroup1_get_tree(struct fs_context *fc)
>  		ret = -ENOMEM;
>  		goto err_unlock;
>  	}
> -	new_root = true;
>  	ctx->root = root;
> +	ctx->is_new_root = true;
>  
>  	init_cgroup_root(ctx);
>  
> -	ret = cgroup_setup_root(root, ctx->subsys_mask, PERCPU_REF_INIT_DEAD);
> +	/*
> +	 * There's a race window after we release cgroup_mutex and before
> +	 * allocating a superblock. Make sure a concurrent process won't be
> +	 * able to re-use the root during this window by setting CSS_CREATING.
> +	 */
> +	root->cgrp.self.flags |= CSS_CREATING;
> +
> +	ret = cgroup_setup_root(root, ctx->subsys_mask);
>  	if (ret)
>  		goto err_unlock;
>  
> @@ -1275,19 +1281,6 @@ int cgroup1_get_tree(struct fs_context *fc)
>  
>  	ret = cgroup_do_get_tree(fc);
>  
> -	/*
> -	 * There's a race window after we release cgroup_mutex and before
> -	 * allocating a superblock. Make sure a concurrent process won't
> -	 * be able to re-use the root during this window by delaying the
> -	 * initialization of root refcnt.
> -	 */
> -	if (new_root) {
> -		mutex_lock(&cgroup_mutex);
> -		percpu_ref_reinit(&root->cgrp.self.refcnt);
> -		mutex_unlock(&cgroup_mutex);
> -		cgroup_get(&root->cgrp);
> -	}
> -
>  	/*
>  	 * If @pinned_sb, we're reusing an existing root and holding an
>  	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 2e5150412ae0..091e7eca3661 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1909,7 +1909,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
>  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
>  }
>  
> -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
> +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
>  {
>  	LIST_HEAD(tmp_links);
>  	struct cgroup *root_cgrp = &root->cgrp;
> @@ -1926,7 +1926,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
>  	root_cgrp->ancestor_ids[0] = ret;
>  
>  	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
> -			      ref_flags, GFP_KERNEL);
> +			      0, GFP_KERNEL);
>  	if (ret)
>  		goto out;
>  
> @@ -2010,12 +2010,23 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
>  	return ret;
>  }
>  
> +static int cgroup_fill_super(struct super_block *sb, struct fs_context *fc)
> +{
> +	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> +
> +	mutex_lock(&cgroup_mutex);
> +	cgroup_get(&ctx->root->cgrp);
> +	mutex_unlock(&cgroup_mutex);
> +	return 0;
> +}
> +
>  int cgroup_do_get_tree(struct fs_context *fc)
>  {
>  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
>  	int ret;
>  
>  	ctx->kfc.root = ctx->root->kf_root;
> +	ctx->kfc.fill_super = cgroup_fill_super;
>  
>  	ret = kernfs_get_tree(fc);
>  	if (ret < 0)
> @@ -2046,6 +2057,12 @@ int cgroup_do_get_tree(struct fs_context *fc)
>  
>  	if (ctx->version == 2)
>  		apply_cgroup_root_flags(ctx->flags);
> +
> +	if (ctx->is_new_root) {
> +		mutex_lock(&cgroup_mutex);
> +		ctx->root->cgrp.self.flags &= ~CSS_CREATING;
> +		mutex_unlock(&cgroup_mutex);
> +	}
>  	ret = 0;
>  out_cgrp:
>  	return ret;
> @@ -2071,6 +2088,7 @@ static int cgroup_get_tree(struct fs_context *fc)
>  		cgroup_get_live(&cgrp_dfl_root.cgrp);
>  
>  		ctx->root = &cgrp_dfl_root;
> +		cgroup_get(&ctx->root->cgrp);
>  		return cgroup_do_get_tree(fc);
>  
>  	default:
> @@ -5420,7 +5438,7 @@ int __init cgroup_init(void)
>  	hash_add(css_set_table, &init_css_set.hlist,
>  		 css_set_hash(init_css_set.subsys));
>  
> -	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
> +	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
>  
>  	mutex_unlock(&cgroup_mutex);
>  
> 



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux