On Thu, Jul 16, 2020 at 07:16:27PM -0500, YiFei Zhu wrote:
> Fourth, on attach, we reuse the old storage if the key already
> exists in the map. Because the rbtree traversal holds the spinlock
> of the map, during which we can't allocate a new storage if we
> don't find an old storage, instead we preallocate the storage
> unconditionally, and free the preallocated storage if we find an
> old storage in the map. This results in a change of semantics in
> bpf_cgroup_storage{,s}_link, and rename cgroup_storage_insert to
> cgroup_storage_lookup_insert that does both lookup and conditionally
> insert or free. bpf_cgroup_storage{,s}_link also tracks exactly
> which storages are reused in an array of bools, so it can unlink
> and free the new storages in the event that attachment failed
> later than link. bpf_cgroup_storages_{free,unlink} accepts the
> bool array in order to facilitate that.
[ ... ]
> ---
>  include/linux/bpf-cgroup.h     | 15 +++---
>  include/uapi/linux/bpf.h       |  2 +-
>  kernel/bpf/cgroup.c            | 69 +++++++++++++++------------
>  kernel/bpf/core.c              | 12 -----
>  kernel/bpf/local_storage.c     | 85 ++++++++++++++++------------------
>  tools/include/uapi/linux/bpf.h |  2 +-
>  6 files changed, 91 insertions(+), 94 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 2c6f26670acc..c83cd8862298 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -46,7 +46,8 @@ struct bpf_cgroup_storage {
>         };
>         struct bpf_cgroup_storage_map *map;
>         struct bpf_cgroup_storage_key key;
> -       struct list_head list;
> +       struct list_head list_map;
> +       struct list_head list_cg;
>         struct rb_node node;
>         struct rcu_head rcu;
>  };
> @@ -78,6 +79,9 @@ struct cgroup_bpf {
>         struct list_head progs[MAX_BPF_ATTACH_TYPE];
>         u32 flags[MAX_BPF_ATTACH_TYPE];
>
> +       /* list of cgroup shared storages */
> +       struct list_head storages;
> +
>         /* temp storage for effective prog array used by prog_attach/detach */
>         struct bpf_prog_array *inactive;
>
> @@ -164,12 +168,11 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
>  struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
>                                         enum bpf_cgroup_storage_type stype);
>  void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
> -void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
> -                            struct cgroup *cgroup,
> -                            enum bpf_attach_type type);
> +struct bpf_cgroup_storage *
> +bpf_cgroup_storage_link(struct bpf_cgroup_storage *new_storage,
> +                       struct cgroup *cgroup, bool *storage_reused);
>  void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
>  int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map);
> -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map);
>
>  int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
>  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> @@ -383,8 +386,6 @@ static inline void bpf_cgroup_storage_set(
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
>  static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
>                                             struct bpf_map *map) { return 0; }
> -static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux,
> -                                             struct bpf_map *map) {}
>  static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
>         struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; }
>  static inline void bpf_cgroup_storage_free(
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 7ac3992dacfe..b14f008ad028 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -78,7 +78,7 @@ struct bpf_lpm_trie_key {
>
>  struct bpf_cgroup_storage_key {
>         __u64   cgroup_inode_id;        /* cgroup inode id */
> -       __u32   attach_type;            /* program attach type */
> +       __u32   attach_type;            /* program attach type, unused */
>  };
>
>  /* BPF syscall commands, see bpf(2) man-page for details. */
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index ac53102e244a..6b1ef4a809bb 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -28,12 +28,14 @@ void cgroup_bpf_offline(struct cgroup *cgrp)
>         percpu_ref_kill(&cgrp->bpf.refcnt);
>  }
>
> -static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
> +static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[],
> +                                     bool *storage_reused)
>  {
>         enum bpf_cgroup_storage_type stype;
>
>         for_each_cgroup_storage_type(stype)
> -               bpf_cgroup_storage_free(storages[stype]);
> +               if (!storage_reused || !storage_reused[stype])
> +                       bpf_cgroup_storage_free(storages[stype]);
>  }
>
>  static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
> @@ -45,7 +47,7 @@ static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
>                 storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
>                 if (IS_ERR(storages[stype])) {
>                         storages[stype] = NULL;
> -                       bpf_cgroup_storages_free(storages);
> +                       bpf_cgroup_storages_free(storages, NULL);
>                         return -ENOMEM;
>                 }
>         }
> @@ -63,21 +65,24 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
>  }
>
>  static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
> -                                    struct cgroup* cgrp,
> -                                    enum bpf_attach_type attach_type)
> +                                    struct cgroup *cgrp, bool *storage_reused)
>  {
>         enum bpf_cgroup_storage_type stype;
>
>         for_each_cgroup_storage_type(stype)
> -               bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
> +               storages[stype] =
> +                       bpf_cgroup_storage_link(storages[stype], cgrp,
> +                                               &storage_reused[stype]);
>  }
>
> -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
> +static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[],
> +                                       bool *storage_reused)
>  {
>         enum bpf_cgroup_storage_type stype;
>
>         for_each_cgroup_storage_type(stype)
> -               bpf_cgroup_storage_unlink(storages[stype]);
> +               if (!storage_reused || !storage_reused[stype])
> +                       bpf_cgroup_storage_unlink(storages[stype]);
>  }
>
>  /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
> @@ -101,22 +106,23 @@ static void cgroup_bpf_release(struct work_struct *work)
>         struct cgroup *p, *cgrp = container_of(work, struct cgroup,
>                                                bpf.release_work);
>         struct bpf_prog_array *old_array;
> +       struct list_head *storages = &cgrp->bpf.storages;
> +       struct bpf_cgroup_storage *storage, *stmp;
> +
>         unsigned int type;
>
>         mutex_lock(&cgroup_mutex);
>
>         for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
>                 struct list_head *progs = &cgrp->bpf.progs[type];
> -               struct bpf_prog_list *pl, *tmp;
> +               struct bpf_prog_list *pl, *pltmp;
>
> -               list_for_each_entry_safe(pl, tmp, progs, node) {
> +               list_for_each_entry_safe(pl, pltmp, progs, node) {
>                         list_del(&pl->node);
>                         if (pl->prog)
>                                 bpf_prog_put(pl->prog);
>                         if (pl->link)
>                                 bpf_cgroup_link_auto_detach(pl->link);
> -                       bpf_cgroup_storages_unlink(pl->storage);
> -                       bpf_cgroup_storages_free(pl->storage);
>                         kfree(pl);
>                         static_branch_dec(&cgroup_bpf_enabled_key);
>                 }
> @@ -126,6 +132,11 @@ static void cgroup_bpf_release(struct work_struct *work)
>                 bpf_prog_array_free(old_array);
>         }
>
> +       list_for_each_entry_safe(storage, stmp, storages, list_cg) {
> +               bpf_cgroup_storage_unlink(storage);
> +               bpf_cgroup_storage_free(storage);
> +       }
> +
>         mutex_unlock(&cgroup_mutex);
>
>         for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
> @@ -290,6 +301,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
>         for (i = 0; i < NR; i++)
>                 INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
>
> +       INIT_LIST_HEAD(&cgrp->bpf.storages);
> +
>         for (i = 0; i < NR; i++)
>                 if (compute_effective_progs(cgrp, i, &arrays[i]))
>                         goto cleanup;
> @@ -422,7 +435,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>         struct list_head *progs = &cgrp->bpf.progs[type];
>         struct bpf_prog *old_prog = NULL;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
> -       struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
> +       bool storage_reused[MAX_BPF_CGROUP_STORAGE_TYPE];
>         struct bpf_prog_list *pl;
>         int err;
>
> @@ -455,22 +468,22 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>         if (IS_ERR(pl))
>                 return PTR_ERR(pl);
>
> -       if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
> -               return -ENOMEM;
> -
>         if (pl) {
>                 old_prog = pl->prog;
> -               bpf_cgroup_storages_unlink(pl->storage);
> -               bpf_cgroup_storages_assign(old_storage, pl->storage);
>         } else {
>                 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
> -               if (!pl) {
> -                       bpf_cgroup_storages_free(storage);
> +               if (!pl)
>                         return -ENOMEM;
> -               }
> +
>                 list_add_tail(&pl->node, progs);
>         }
>
> +       err = bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog);
> +       if (err)
> +               goto cleanup;
> +
> +       bpf_cgroup_storages_link(storage, cgrp, storage_reused);
> +
>         pl->prog = prog;
>         pl->link = link;
>         bpf_cgroup_storages_assign(pl->storage, storage);
> @@ -478,24 +491,24 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
>
>         err = update_effective_progs(cgrp, type);
>         if (err)
> -               goto cleanup;
> +               goto cleanup_unlink;
>
> -       bpf_cgroup_storages_free(old_storage);
>         if (old_prog)
>                 bpf_prog_put(old_prog);
>         else
>                 static_branch_inc(&cgroup_bpf_enabled_key);
> -       bpf_cgroup_storages_link(pl->storage, cgrp, type);
>         return 0;
>
> +cleanup_unlink:
> +       bpf_cgroup_storages_unlink(storage, storage_reused);
> +

I still fail to understand why there is a need for this dance of always
linking/publishing first and then unlinking/unpublishing on failure.  It
forces all these changes to add and track the "storage_reused" params in
a few functions just to handle this one failure path, and it also
requires introducing cgroup_storage_lookup_insert().
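To spell out the dance I mean (a paraphrase of the diff above, not the
literal code), the attach path always publishes first and then has to
remember what it published so it can take it back on failure:

        /* paraphrased attach flow in this patch */
        err = bpf_cgroup_storages_alloc(storage, prog);   /* always allocate */
        ...
        /* cgroup_storage_lookup_insert(): under the map lock, either insert
         * the new storage (storage_reused = false) or free it and return
         * the existing one (storage_reused = true)
         */
        bpf_cgroup_storages_link(storage, cgrp, storage_reused);
        ...
        err = update_effective_progs(cgrp, type);
        if (err)
                /* needs storage_reused[] to know what can be unlinked/freed */
                bpf_cgroup_storages_unlink(storage, storage_reused);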
Going back to my earlier comment on v2, which I didn't hear any feedback
on:

**** snippet ****
>> lookup old, found=>reuse, not-found=>alloc.
>>
>> Only publish the new storage after the attach has succeeded.
**** snippet ****

I have tried to put it in code here (uncompiled). wdyt?

static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
                                     struct bpf_cgroup_storage *new_storages[],
                                     struct bpf_prog *prog,
                                     struct cgroup *cgrp)
{
        enum bpf_cgroup_storage_type stype;
        struct bpf_cgroup_storage_key key;
        struct bpf_map *map;

        key.cgroup_inode_id = cgroup_id(cgrp);
        key.attach_type = 0;

        for_each_cgroup_storage_type(stype) {
                map = prog->aux->cgroup_storage[stype];
                if (!map)
                        continue;

                storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
                if (!storages[stype]) {
                        struct bpf_cgroup_storage *new_storage;

                        new_storage = bpf_cgroup_storage_alloc(prog, stype);
                        if (IS_ERR(new_storage)) {
                                bpf_cgroup_storages_free(new_storages);
                                return PTR_ERR(new_storage);
                        }

                        storages[stype] = new_storage;
                        new_storages[stype] = new_storage;
                }
        }

        return 0;
}

@@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
        struct list_head *progs = &cgrp->bpf.progs[type];
        struct bpf_prog *old_prog = NULL;
        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
-       struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+       struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
        struct bpf_prog_list *pl;
        int err;

@@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
        if (IS_ERR(pl))
                return PTR_ERR(pl);

-       if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
+       if (bpf_cgroup_storages_alloc(storage, new_storage,
+                                     prog ? : link->link.prog, cgrp))
                return -ENOMEM;

        if (pl) {
                old_prog = pl->prog;
-               bpf_cgroup_storages_unlink(pl->storage);
-               bpf_cgroup_storages_assign(old_storage, pl->storage);
        } else {
                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                if (!pl) {
-                       bpf_cgroup_storages_free(storage);
+                       bpf_cgroup_storages_free(new_storage);
                        return -ENOMEM;
                }
                list_add_tail(&pl->node, progs);
@@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
        if (err)
                goto cleanup;

-       bpf_cgroup_storages_free(old_storage);
        if (old_prog)
                bpf_prog_put(old_prog);
        else
                static_branch_inc(&cgroup_bpf_enabled_key);
-       bpf_cgroup_storages_link(pl->storage, cgrp, type);
+       bpf_cgroup_storages_link(new_storage, cgrp, type);
        return 0;

 cleanup:
@@ -493,9 +508,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
                pl->prog = old_prog;
                pl->link = NULL;
        }
-       bpf_cgroup_storages_free(pl->storage);
-       bpf_cgroup_storages_assign(pl->storage, old_storage);
-       bpf_cgroup_storages_link(pl->storage, cgrp, type);
+       bpf_cgroup_storages_free(new_storage);
        if (!old_prog) {
                list_del(&pl->node);
                kfree(pl);

[ ... ]

> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 51bd5a8cb01b..78ffe69ff1d8 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c

[ ... ]
> @@ -318,6 +313,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
>  static void cgroup_storage_map_free(struct bpf_map *_map)
>  {
>         struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> +       struct list_head *storages = &map->list;
> +       struct bpf_cgroup_storage *storage, *stmp;
> +
> +       mutex_lock(&cgroup_mutex);
> +
> +       list_for_each_entry_safe(storage, stmp, storages, list_map) {
> +               bpf_cgroup_storage_unlink(storage);
> +               bpf_cgroup_storage_free(storage);
> +       }
> +
> +       mutex_unlock(&cgroup_mutex);
>
>         WARN_ON(!RB_EMPTY_ROOT(&map->root));
>         WARN_ON(!list_empty(&map->list));
> @@ -431,13 +437,10 @@ int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
>
>         spin_lock_bh(&map->lock);
>
> -       if (map->aux && map->aux != aux)
> -               goto unlock;
>         if (aux->cgroup_storage[stype] &&
>             aux->cgroup_storage[stype] != _map)
>                 goto unlock;
>
> -       map->aux = aux;

Is spin_lock_bh(&map->lock) still required in this function?
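If it is not (assuming aux->cgroup_storage[] is only written while the
prog is being loaded, so nothing races with this check), the function
could presumably be reduced to something like this (uncompiled):

int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
        enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);

        /* with map->aux gone there is nothing left for map->lock to
         * protect here; the only state touched is the prog's own aux
         */
        if (aux->cgroup_storage[stype] &&
            aux->cgroup_storage[stype] != _map)
                return -EBUSY;

        aux->cgroup_storage[stype] = _map;
        return 0;
}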