On Mon, Aug 03, 2020 at 06:46:51PM +0200, KP Singh wrote: > From: KP Singh <kpsingh@xxxxxxxxxx> > > Refactor the functionality in bpf_sk_storage.c so that concept of > storage linked to kernel objects can be extended to other objects like > inode, task_struct etc. > > Each new local storage will still be a separate map and provide its own > set of helpers. This allows for future object specific extensions and > still share a lot of the underlying implementation. > > This includes the changes suggested by Martin in: > > https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@xxxxxx/ > > which adds map_local_storage_charge, map_local_storage_uncharge, > and map_owner_storage_ptr. A description will still be useful in the commit message to talk about the new map_ops, e.g. they allow kernel object to optionally have different mem-charge strategy. > > Co-developed-by: Martin KaFai Lau <kafai@xxxxxx> > Signed-off-by: KP Singh <kpsingh@xxxxxxxxxx> > --- > include/linux/bpf.h | 9 ++ > include/net/bpf_sk_storage.h | 51 +++++++ > include/uapi/linux/bpf.h | 8 +- > net/core/bpf_sk_storage.c | 246 +++++++++++++++++++++------------ > tools/include/uapi/linux/bpf.h | 8 +- > 5 files changed, 233 insertions(+), 89 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index cef4ef0d2b4e..8e1e23c60dc7 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -34,6 +34,9 @@ struct btf_type; > struct exception_table_entry; > struct seq_operations; > struct bpf_iter_aux_info; > +struct bpf_local_storage; > +struct bpf_local_storage_map; > +struct bpf_local_storage_elem; "struct bpf_local_storage_elem" is not needed. > > extern struct idr btf_idr; > extern spinlock_t btf_idr_lock; > @@ -104,6 +107,12 @@ struct bpf_map_ops { > __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, > struct poll_table_struct *pts); > > + /* Functions called by bpf_local_storage maps */ > + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, > + void *owner, u32 size); > + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, > + void *owner, u32 size); > + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); > /* BTF name and id of struct allocated by map_alloc */ > const char * const map_btf_name; > int *map_btf_id; > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h > index 950c5aaba15e..05b777950eb3 100644 > --- a/include/net/bpf_sk_storage.h > +++ b/include/net/bpf_sk_storage.h > @@ -3,8 +3,15 @@ > #ifndef _BPF_SK_STORAGE_H > #define _BPF_SK_STORAGE_H > > +#include <linux/rculist.h> > +#include <linux/list.h> > +#include <linux/hash.h> > #include <linux/types.h> > #include <linux/spinlock.h> > +#include <linux/bpf.h> > +#include <net/sock.h> > +#include <uapi/linux/sock_diag.h> > +#include <uapi/linux/btf.h> > > struct sock; > > @@ -34,6 +41,50 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); > void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, > u16 idx); > > +/* Helper functions for bpf_local_storage */ > +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); > + > +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); > + > +struct bpf_local_storage_data * > +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, > + struct bpf_local_storage_map *smap, > + bool cacheit_lockit); > + > +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); > + > +int bpf_local_storage_map_check_btf(const struct bpf_map *map, > + const struct btf *btf, > + const struct btf_type *key_type, > + const struct btf_type *value_type); > + > +void bpf_selem_link_storage(struct bpf_local_storage *local_storage, > + struct bpf_local_storage_elem *selem); > + > +bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage, > + struct bpf_local_storage_elem *selem, > + bool uncharge_omem); > + > +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); > + > +void bpf_selem_link_map(struct bpf_local_storage_map *smap, > + struct bpf_local_storage_elem *selem); > + > +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); > + > +struct bpf_local_storage_elem * > +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, > + bool charge_mem); > + > +int > +bpf_local_storage_alloc(void *owner, > + struct bpf_local_storage_map *smap, > + struct bpf_local_storage_elem *first_selem); > + > +struct bpf_local_storage_data * > +bpf_local_storage_update(void *owner, struct bpf_map *map, void *value, Nit. It may be more consistent to take "struct bpf_local_storage_map *smap" instead of "struct bpf_map *map" here. bpf_local_storage_map_check_btf() will be the only one taking "struct bpf_map *map". > + u64 map_flags); > + > #ifdef CONFIG_BPF_SYSCALL > int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); > struct bpf_sk_storage_diag * > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index b134e679e9db..35629752cec8 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3647,9 +3647,13 @@ enum { > BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), > }; > > -/* BPF_FUNC_sk_storage_get flags */ > +/* BPF_FUNC_<local>_storage_get flags */ BPF_FUNC_<kernel_obj>_storage_get flags? > enum { > - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), > + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), > + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility > + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. > + */ > + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, > }; > > /* BPF_FUNC_read_branch_records flags. */ > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c > index 99dde7b74767..bb2375769ca1 100644 > --- a/net/core/bpf_sk_storage.c > +++ b/net/core/bpf_sk_storage.c > @@ -84,7 +84,7 @@ struct bpf_local_storage_elem { > struct bpf_local_storage { > struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; > struct hlist_head list; /* List of bpf_local_storage_elem */ > - struct sock *owner; /* The object that owns the the above "list" of > + void *owner; /* The object that owns the the above "list" of > * bpf_local_storage_elem. > */ > struct rcu_head rcu; > @@ -110,6 +110,33 @@ static int omem_charge(struct sock *sk, unsigned int size) > return -ENOMEM; > } > > +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) > +{ > + struct bpf_map *map = &smap->map; > + > + if (!map->ops->map_local_storage_charge) > + return 0; > + > + return map->ops->map_local_storage_charge(smap, owner, size); > +} > + > +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, > + u32 size) > +{ > + struct bpf_map *map = &smap->map; > + > + if (map->ops->map_local_storage_uncharge) > + map->ops->map_local_storage_uncharge(smap, owner, size); > +} > + > +static struct bpf_local_storage __rcu ** > +owner_storage(struct bpf_local_storage_map *smap, void *owner) > +{ > + struct bpf_map *map = &smap->map; > + > + return map->ops->map_owner_storage_ptr(owner); > +} > + > static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) > { > return !hlist_unhashed(&selem->snode); > @@ -120,13 +147,13 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) > return !hlist_unhashed(&selem->map_node); > } > > -static struct bpf_local_storage_elem * > -bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, > - void *value, bool charge_omem) > +struct bpf_local_storage_elem * > +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, > + void *value, bool charge_mem) > { > struct bpf_local_storage_elem *selem; > > - if (charge_omem && omem_charge(sk, smap->elem_size)) > + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) > return NULL; > > selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); > @@ -136,40 +163,42 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, > return selem; > } > > - if (charge_omem) > - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); > + if (charge_mem) > + mem_uncharge(smap, owner, smap->elem_size); > > return NULL; > } > > -/* sk_storage->lock must be held and selem->sk_storage == sk_storage. > +/* local_storage->lock must be held and selem->sk_storage == sk_storage. This name change belongs to patch 1. Also, selem->"local_"storage == "local_"storage > * The caller must ensure selem->smap is still valid to be > * dereferenced for its smap->elem_size and smap->cache_idx. > */ [ ... ] > @@ -367,7 +401,7 @@ static int sk_storage_alloc(struct sock *sk, > /* Note that even first_selem was linked to smap's > * bucket->list, first_selem can be freed immediately > * (instead of kfree_rcu) because > - * bpf_sk_storage_map_free() does a > + * bpf_local_storage_map_free() does a This name change belongs to patch 1 also. > * synchronize_rcu() before walking the bucket->list. > * Hence, no one is accessing selem from the > * bucket->list under rcu_read_lock(). > @@ -377,8 +411,8 @@ static int sk_storage_alloc(struct sock *sk, > return 0; > > uncharge: > - kfree(sk_storage); > - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); > + kfree(storage); > + mem_uncharge(smap, owner, sizeof(*storage)); > return err; > } > > @@ -387,9 +421,9 @@ static int sk_storage_alloc(struct sock *sk, > * Otherwise, it will become a leak (and other memory issues > * during map destruction). > */ > -static struct bpf_local_storage_data * > -bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, > - u64 map_flags) > +struct bpf_local_storage_data * > +bpf_local_storage_update(void *owner, struct bpf_map *map, > + void *value, u64 map_flags) > { > struct bpf_local_storage_data *old_sdata = NULL; > struct bpf_local_storage_elem *selem; > @@ -404,21 +438,21 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, > return ERR_PTR(-EINVAL); > > smap = (struct bpf_local_storage_map *)map; > - local_storage = rcu_dereference(sk->sk_bpf_storage); > + local_storage = rcu_dereference(*owner_storage(smap, owner)); > if (!local_storage || hlist_empty(&local_storage->list)) { > - /* Very first elem for this object */ > + /* Very first elem for the owner */ > err = check_flags(NULL, map_flags); > if (err) > return ERR_PTR(err); > > - selem = bpf_selem_alloc(smap, sk, value, true); > + selem = bpf_selem_alloc(smap, owner, value, true); > if (!selem) > return ERR_PTR(-ENOMEM); > > - err = sk_storage_alloc(sk, smap, selem); > + err = bpf_local_storage_alloc(owner, smap, selem); > if (err) { > kfree(selem); > - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); > + mem_uncharge(smap, owner, smap->elem_size); > return ERR_PTR(err); > } > > @@ -430,8 +464,8 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, > * such that it can avoid taking the local_storage->lock > * and changing the lists. > */ > - old_sdata = > - bpf_local_storage_lookup(local_storage, smap, false); > + old_sdata = bpf_local_storage_lookup(local_storage, smap, > + false); Pure indentation change. The same line has been changed in patch 1. Please change the identation in patch 1 if the above way is preferred. > err = check_flags(old_sdata, map_flags); > if (err) > return ERR_PTR(err); > @@ -475,7 +509,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, > * old_sdata will not be uncharged later during > * bpf_selem_unlink_storage(). > */ > - selem = bpf_selem_alloc(smap, sk, value, !old_sdata); > + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); > if (!selem) { > err = -ENOMEM; > goto unlock_err; > @@ -567,7 +601,7 @@ void bpf_sk_storage_free(struct sock *sk) > * Thus, no elem can be added-to or deleted-from the > * sk_storage->list by the bpf_prog or by the bpf-map's syscall. > * > - * It is racing with bpf_sk_storage_map_free() alone > + * It is racing with bpf_local_storage_map_free() alone This name change belongs to patch 1 also. > * when unlinking elem from the sk_storage->list and > * the map's bucket->list. > */ > @@ -587,17 +621,12 @@ void bpf_sk_storage_free(struct sock *sk) > kfree_rcu(sk_storage, rcu); > } > > -static void bpf_local_storage_map_free(struct bpf_map *map) > +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) > { > struct bpf_local_storage_elem *selem; > - struct bpf_local_storage_map *smap; > struct bpf_local_storage_map_bucket *b; > unsigned int i; > > - smap = (struct bpf_local_storage_map *)map; > - > - bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); > - > /* Note that this map might be concurrently cloned from > * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone > * RCU read section to finish before proceeding. New RCU > @@ -607,11 +636,12 @@ static void bpf_local_storage_map_free(struct bpf_map *map) > > /* bpf prog and the userspace can no longer access this map > * now. No new selem (of this map) can be added > - * to the sk->sk_bpf_storage or to the map bucket's list. > + * to the bpf_local_storage or to the map bucket's list. s/bpf_local_storage/owner->storage/ > * > * The elem of this map can be cleaned up here > * or > - * by bpf_sk_storage_free() during __sk_destruct(). > + * by bpf_local_storage_free() during the destruction of the > + * owner object. eg. __sk_destruct. This belongs to patch 1 also. > */ > for (i = 0; i < (1U << smap->bucket_log); i++) { > b = &smap->buckets[i]; > @@ -627,22 +657,31 @@ static void bpf_local_storage_map_free(struct bpf_map *map) > rcu_read_unlock(); > } > > - /* bpf_sk_storage_free() may still need to access the map. > - * e.g. bpf_sk_storage_free() has unlinked selem from the map > + /* bpf_local_storage_free() may still need to access the map. It is confusing. There is no bpf_local_storage_free(). > + * e.g. bpf_local_storage_free() has unlinked selem from the map > * which then made the above while((selem = ...)) loop > * exited immediately. > * > - * However, the bpf_sk_storage_free() still needs to access > + * However, the bpf_local_storage_free() still needs to access Same here. > * the smap->elem_size to do the uncharging in > * bpf_selem_unlink_storage(). > * > * Hence, wait another rcu grace period for the > - * bpf_sk_storage_free() to finish. > + * bpf_local_storage_free() to finish. and here. > */ > synchronize_rcu(); > > kvfree(smap->buckets); > - kfree(map); > + kfree(smap); > +} > + > +static void sk_storage_map_free(struct bpf_map *map) > +{ > + struct bpf_local_storage_map *smap; > + > + smap = (struct bpf_local_storage_map *)map; > + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); > + bpf_local_storage_map_free(smap); > } > > /* U16_MAX is much more than enough for sk local storage > @@ -654,7 +693,7 @@ static void bpf_local_storage_map_free(struct bpf_map *map) > sizeof(struct bpf_local_storage_elem)), \ > (U16_MAX - sizeof(struct bpf_local_storage_elem))) > > -static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) > +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) > { > if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || > !(attr->map_flags & BPF_F_NO_PREALLOC) || > @@ -673,7 +712,7 @@ static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) > return 0; > } > > -static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) > +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) > { > struct bpf_local_storage_map *smap; > unsigned int i; > @@ -711,9 +750,21 @@ static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) > raw_spin_lock_init(&smap->buckets[i].lock); > } > > - smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size; > - smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); > + smap->elem_size = > + sizeof(struct bpf_local_storage_elem) + attr->value_size; Same line has changed in patch 1. Change the indentation in patch 1 also if the above way is desired. Others LGTM.