On Thu, Aug 26, 2021 at 11:51:26PM +0000, KP Singh wrote:
> Other maps like hashmaps are already available to sleepable programs.
> Sleepable BPF programs run under trace RCU. Allow task, local and inode
> storage to be used from sleepable programs.
>
> The local storage code mostly runs under the program's RCU read section
> (in __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable})
> (rcu_read_lock or rcu_read_lock_trace) with the exception of the logic
> where the map is freed.
>
> After some discussions and help from Jann Horn, the following changes
> were made:
>
> bpf_local_storage{_elem} are freed with a kfree_rcu wrapped with a
> call_rcu_tasks_trace callback instead of a direct kfree_rcu, which does
> not respect the trace RCU grace periods. The callback frees the
> storage/selem with kfree_rcu, which handles the normal RCU grace period,
> similar to BPF trampolines.
>
> Update the synchronize_rcu to synchronize_rcu_mult in the map freeing
> code to wait for trace RCU and normal RCU grace periods.
> While this is an expensive operation, bpf_local_storage_map_free
> is not called from within a BPF program, rather only called when the
> owning object is being freed.
>
> Update the dereferencing of the pointers to use rcu_dereference_protected
> (with either the trace or normal RCU locks held) and add warnings in the
> beginning of the get and delete helpers.
>
> Signed-off-by: KP Singh <kpsingh@xxxxxxxxxx>
> ---
>  include/linux/bpf_local_storage.h |  5 ++++
>  kernel/bpf/bpf_inode_storage.c    |  9 +++++--
>  kernel/bpf/bpf_local_storage.c    | 43 ++++++++++++++++++++++++-------
>  kernel/bpf/bpf_task_storage.c     |  6 ++++-
>  kernel/bpf/verifier.c             |  3 +++
>  net/core/bpf_sk_storage.c         |  8 +++++-
>  6 files changed, 61 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 24496bc28e7b..8453488b334d 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -16,6 +16,9 @@
>
>  #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
>
> +#define bpf_local_storage_rcu_lock_held() \
> +	(rcu_read_lock_held() || rcu_read_lock_trace_held() || \
> +	 rcu_read_lock_bh_held())
There is a similar test in hashtab. Maybe rename it to a more generic
name so that it can be reused there? (a rough sketch follows the next
hunk)

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index b305270b7a4b..7760bc4e9565 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -11,6 +11,8 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
> +#include <linux/rcupdate_wait.h>
>
>  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
>
> @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>  	return NULL;
>  }
>
> +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage *local_storage;
> +
> +	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> +	kfree_rcu(local_storage, rcu);
> +}
> +
> +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage_elem *selem;
> +
> +	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> +	kfree_rcu(selem, rcu);
> +}
> +
>  /* local_storage->lock must be held and selem->local_storage == local_storage.
>   * The caller must ensure selem->smap is still valid to be
>   * dereferenced for its smap->elem_size and smap->cache_idx.
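To make the renaming suggestion above concrete, a rough sketch
(untested; "bpf_rcu_lock_held" and its location are only placeholders):

/* hypothetical shared helper, e.g. in include/linux/bpf.h, so that
 * hashtab and the local storage maps can use the same check
 */
#define bpf_rcu_lock_held() \
	(rcu_read_lock_held() || rcu_read_lock_trace_held() || \
	 rcu_read_lock_bh_held())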
> @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  	 *
>  	 * Although the unlock will be done under
>  	 * rcu_read_lock(), it is more intuitive to
> -	 * read if kfree_rcu(local_storage, rcu) is done
> +	 * read if the freeing of the storage is done
>  	 * after the raw_spin_unlock_bh(&local_storage->lock).
>  	 *
>  	 * Hence, a "bool free_local_storage" is returned
> -	 * to the caller which then calls the kfree_rcu()
> -	 * after unlock.
> +	 * to the caller which then frees the storage after
> +	 * all the RCU grace periods have expired.
>  	 */
>  	}
>  	hlist_del_init_rcu(&selem->snode);
> @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  			SDATA(selem))
>  		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
>
> -	kfree_rcu(selem, rcu);
> +	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
Although the common use case is usually storage_get() much more often
than storage_delete(), are you aware of any performance impact for a
bpf prog that does a lot of storage_delete()?

>
>  	return free_local_storage;
>  }
> @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>
>  	if (free_local_storage)
> -		kfree_rcu(local_storage, rcu);
> +		call_rcu_tasks_trace(&local_storage->rcu,
> +				     bpf_local_storage_free_rcu);
>  }
>
>  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
>  	struct bpf_local_storage_elem *selem;
>
>  	/* Fast path (cache hit) */
> -	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> +	sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> +					  bpf_local_storage_rcu_lock_held());
There are other places using rcu_dereference() also, e.g. in
bpf_local_storage_update(). Should they be changed also? (a rough diff
sketch at the end of this mail)

[ ... ]

> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -13,6 +13,7 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
>
>  DEFINE_BPF_STORAGE_CACHE(sk_cache);
>
> @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
>  	struct bpf_local_storage *sk_storage;
>  	struct bpf_local_storage_map *smap;
>
> -	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +	sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> +					       bpf_local_storage_rcu_lock_held());
>  	if (!sk_storage)
>  		return NULL;
>
> @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  {
>  	struct bpf_local_storage_data *sdata;
>
> +	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
>  	if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
sk is protected by rcu_read_lock here. Is it always safe to access it
with rcu_read_lock_trace alone?

>  		return (unsigned long)NULL;
>
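To make the earlier bpf_local_storage_update() comment concrete, I am
thinking of something along these lines (untested sketch; assuming the
rcu_dereference() in question is the one on *owner_storage(), mirroring
the rcu_dereference_protected() pattern this patch already uses):

-	local_storage = rcu_dereference(*owner_storage(smap, owner));
+	local_storage = rcu_dereference_protected(*owner_storage(smap, owner),
+						  bpf_local_storage_rcu_lock_held());

The same treatment would presumably apply to any other lookup-side
rcu_dereference() in the file.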