On Thu, Aug 08, 2019 at 08:28:30AM -0700, Stanislav Fomichev wrote:
> On 08/08, Martin Lau wrote:
> > On Wed, Aug 07, 2019 at 08:47:18AM -0700, Stanislav Fomichev wrote:
> > > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > > and call it from sk_clone_lock. Reuse the gap in bpf_sk_storage_elem
> > > to store the clone/non-clone flag.
> > >
> > > Cc: Martin KaFai Lau <kafai@xxxxxx>
> > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > > ---
> > >  include/net/bpf_sk_storage.h |  10 ++++
> > >  include/uapi/linux/bpf.h     |   1 +
> > >  net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
> > >  net/core/sock.c              |   9 ++--
> > >  4 files changed, 115 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > > index b9dcb02e756b..8e4f831d2e52 100644
> > > --- a/include/net/bpf_sk_storage.h
> > > +++ b/include/net/bpf_sk_storage.h
> > > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> > >  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> > >  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> > >
> > > +#ifdef CONFIG_BPF_SYSCALL
> > > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > > +#else
> > > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > > +				       struct sock *newsk)
> > > +{
> > > +	return 0;
> > > +}
> > > +#endif
> > > +
> > >  #endif /* _BPF_SK_STORAGE_H */
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 4393bd4b2419..00459ca4c8cf 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -2931,6 +2931,7 @@ enum bpf_func_id {
> > >
> > >  /* BPF_FUNC_sk_storage_get flags */
> > >  #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> > > +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
> > It is only used in bpf_sk_storage_get().
> > What if the elem is created from bpf_fd_sk_storage_update_elem(),
> > i.e. from the syscall API?
> >
> > What may be the use case for a map to have both CLONE and non-CLONE
> > elements?  If it is not the case, would it be better to add
> > BPF_F_CLONE to bpf_attr->map_flags?
> I didn't think about putting it on the map itself since the API
> is per-element, but it does make sense. I can't come up
> with a use case for per-element selective clone/non-clone.
> Thanks, will move to the map itself.
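
+1.  To illustrate, the map-level version could look something like
this (untested sketch; the BPF_F_CLONE name and bit value are only
suggestions, nothing that exists today):

	/* uapi, next to the other BPF_F_* map_flags: */
	#define BPF_F_CLONE			(1U << 9)

	/* bpf_sk_storage.c: accept the new flag at map creation: */
	#define SK_STORAGE_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_CLONE)

	/* ...and in bpf_sk_storage_clone(), instead of testing a
	 * per-elem bit, test the map once:
	 */
	if (!(smap->map.map_flags & BPF_F_CLONE))
		continue;

A map-level flag would also naturally cover elems created from the
syscall side.
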
> > >
> > >  /* Mode for BPF_FUNC_skb_adjust_room helper. */
> > >  enum bpf_adj_room_mode {
> > > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > > index 94c7f77ecb6b..b6dea67965bc 100644
> > > --- a/net/core/bpf_sk_storage.c
> > > +++ b/net/core/bpf_sk_storage.c
> > > @@ -12,6 +12,9 @@
> > >
> > >  static atomic_t cache_idx;
> > >
> > > +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> > > +					 BPF_SK_STORAGE_GET_F_CLONE)
> > > +
> > >  struct bucket {
> > >  	struct hlist_head list;
> > >  	raw_spinlock_t lock;
> > > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
> > >  	struct hlist_node snode;	/* Linked to bpf_sk_storage */
> > >  	struct bpf_sk_storage __rcu *sk_storage;
> > >  	struct rcu_head rcu;
> > > -	/* 8 bytes hole */
> > > +	u8 clone:1;
> > > +	/* 7 bytes hole */
> > >  	/* The data is stored in aother cacheline to minimize
> > >  	 * the number of cachelines access during a cache hit.
> > >  	 */
> > > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> > >  	return 0;
> > >  }
> > >
> > > -/* Called by __sk_destruct() */
> > > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> > >  void bpf_sk_storage_free(struct sock *sk)
> > >  {
> > >  	struct bpf_sk_storage_elem *selem;
> > > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> > >  	return err;
> > >  }
> > >
> > > +static struct bpf_sk_storage_elem *
> > > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > > +			  struct bpf_sk_storage_map *smap,
> > > +			  struct bpf_sk_storage_elem *selem)
> > > +{
> > > +	struct bpf_sk_storage_elem *copy_selem;
> > > +
> > > +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> > > +	if (!copy_selem)
> > > +		return ERR_PTR(-ENOMEM);
> > nit.
> > maybe just return NULL as selem_alloc() does.
> Sounds good.
>
> > > +
> > > +	if (map_value_has_spin_lock(&smap->map))
> > > +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > > +				      SDATA(selem)->data, true);
> > > +	else
> > > +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > > +			       SDATA(selem)->data);
> > > +
> > > +	return copy_selem;
> > > +}
> > > +
> > > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > > +{
> > > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > > +	struct bpf_sk_storage *sk_storage;
> > > +	struct bpf_sk_storage_elem *selem;
> > > +	int ret;
> > > +
> > > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > +
> > > +	rcu_read_lock();
> > > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > +
> > > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > > +		goto out;
> > > +
> > > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > > +		struct bpf_sk_storage_map *smap;
> > > +		struct bpf_sk_storage_elem *copy_selem;
> > > +
> > > +		if (!selem->clone)
> > > +			continue;
> > > +
> > > +		smap = rcu_dereference(SDATA(selem)->smap);
> > > +		if (!smap)
> > smap should not be NULL.
> I see; you never set it back to NULL, and we are guaranteed that the
> map is still around due to RCU. Removed.
>
> > > +			continue;
> > > +
> > > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > > +		if (IS_ERR(copy_selem)) {
> > > +			ret = PTR_ERR(copy_selem);
> > > +			goto err;
> > > +		}
> > > +
> > > +		if (!new_sk_storage) {
> > > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > > +			if (ret) {
> > > +				kfree(copy_selem);
> > > +				atomic_sub(smap->elem_size,
> > > +					   &newsk->sk_omem_alloc);
> > > +				goto err;
> > > +			}
> > > +
> > > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > > +			continue;
> > > +		}
> > > +
> > > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > > +		selem_link_map(smap, copy_selem);
> > Unlike the existing selem-update use-cases in bpf_sk_storage.c,
> > the smap->map.refcnt has not been held here.  Reading the smap
> > is fine.  However, adding a new selem to a deleting smap is an issue.
> > Hence, I think bpf_map_inc_not_zero() should be done first.
> In this case, I should probably do it after smap = rcu_deref()?
Right, and bpf_map_put() should be called when done.

Because of bpf_map_put(), it may be a good idea to add a comment to the
first synchronize_rcu() in bpf_sk_storage_map_free() since this new
bpf_sk_storage_clone() also depends on it now, which makes it different
from other bpf maps.
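
Roughly something like this (untested sketch with a "struct bpf_map
*map;" local added; it also assumes bpf_map_inc_not_zero(), which is
currently static in kernel/bpf/syscall.c and called with map_idr_lock
held, can be exported in a usable form):

		smap = rcu_dereference(SDATA(selem)->smap);
		/* Hold the map so a racing bpf_sk_storage_map_free()
		 * cannot free smap while the new selem is being
		 * linked to it.
		 */
		map = bpf_map_inc_not_zero(&smap->map, false);
		if (IS_ERR(map))
			continue;

		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
		if (IS_ERR(copy_selem)) {
			ret = PTR_ERR(copy_selem);
			bpf_map_put(map);
			goto err;
		}

		/* ... link copy_selem as before ... */

		bpf_map_put(map);

For the synchronize_rcu() comment, something along these lines:

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone().  Cloners run under rcu_read_lock()
	 * and bump the map's refcnt with bpf_map_inc_not_zero(), so
	 * wait for them here before tearing the map down.
	 */
	synchronize_rcu();
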
>
> > > +		__selem_link_sk(new_sk_storage, copy_selem);
> > > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> > > +	}
> > > +
> > > +out:
> > > +	rcu_read_unlock();
> > > +	return 0;
> > > +
> > > +err:
> > > +	rcu_read_unlock();
> > > +
> > > +	bpf_sk_storage_free(newsk);
> > > +	return ret;
> > > +}
> > > +
> > >  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >  	   void *, value, u64, flags)
> > >  {
> > >  	struct bpf_sk_storage_data *sdata;
> > >
> > > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > > +		return (unsigned long)NULL;
> > > +
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> > >  		return (unsigned long)NULL;
> > >
> > >  	sdata = sk_storage_lookup(sk, map, true);
> > >  	if (sdata)
> > >  		return (unsigned long)sdata->data;
> > >
> > > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> > >  	    /* Cannot add new elem to a going away sk.
> > >  	     * Otherwise, the new elem may become a leak
> > >  	     * (and also other memory issues during map
> > > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >  		/* sk must be a fullsock (guaranteed by verifier),
> > >  		 * so sock_gen_put() is unnecessary.
> > >  		 */
> > > +		if (!IS_ERR(sdata))
> > > +			SELEM(sdata)->clone =
> > > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> > >  		sock_put(sk);
> > >  		return IS_ERR(sdata) ?
> > >  			(unsigned long)NULL : (unsigned long)sdata->data;
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index d57b0cc995a0..f5e801a9cea4 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> > >  			goto out;
> > >  		}
> > >  		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > > -#ifdef CONFIG_BPF_SYSCALL
> > > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > > -#endif
> > > +
> > > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > > +			sk_free_unlock_clone(newsk);
> > > +			newsk = NULL;
> > > +			goto out;
> > > +		}
> > >
> > >  		newsk->sk_err	   = 0;
> > >  		newsk->sk_err_soft = 0;
> > > --
> > > 2.22.0.770.g0f2c4a37fd-goog
> > >