On 8/7/19 8:47 AM, Stanislav Fomichev wrote: > Add new helper bpf_sk_storage_clone which optionally clones sk storage > and call it from bpf_sk_storage_clone. Reuse the gap in > bpf_sk_storage_elem to store clone/non-clone flag. > > Cc: Martin KaFai Lau <kafai@xxxxxx> > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx> I tried to see whether I can find any missing race conditions in the code but I failed. So except a minor comments below, Acked-by: Yonghong Song <yhs@xxxxxx> > --- > include/net/bpf_sk_storage.h | 10 ++++ > include/uapi/linux/bpf.h | 1 + > net/core/bpf_sk_storage.c | 102 +++++++++++++++++++++++++++++++++-- > net/core/sock.c | 9 ++-- > 4 files changed, 115 insertions(+), 7 deletions(-) > > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h > index b9dcb02e756b..8e4f831d2e52 100644 > --- a/include/net/bpf_sk_storage.h > +++ b/include/net/bpf_sk_storage.h > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk); > extern const struct bpf_func_proto bpf_sk_storage_get_proto; > extern const struct bpf_func_proto bpf_sk_storage_delete_proto; > > +#ifdef CONFIG_BPF_SYSCALL > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); > +#else > +static inline int bpf_sk_storage_clone(const struct sock *sk, > + struct sock *newsk) > +{ > + return 0; > +} > +#endif > + > #endif /* _BPF_SK_STORAGE_H */ > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 4393bd4b2419..00459ca4c8cf 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2931,6 +2931,7 @@ enum bpf_func_id { > > /* BPF_FUNC_sk_storage_get flags */ > #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) > +#define BPF_SK_STORAGE_GET_F_CLONE (1ULL << 1) > > /* Mode for BPF_FUNC_skb_adjust_room helper. */ > enum bpf_adj_room_mode { > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c > index 94c7f77ecb6b..b6dea67965bc 100644 > --- a/net/core/bpf_sk_storage.c > +++ b/net/core/bpf_sk_storage.c > @@ -12,6 +12,9 @@ > > static atomic_t cache_idx; > > +#define BPF_SK_STORAGE_GET_F_MASK (BPF_SK_STORAGE_GET_F_CREATE | \ > + BPF_SK_STORAGE_GET_F_CLONE) > + > struct bucket { > struct hlist_head list; > raw_spinlock_t lock; > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem { > struct hlist_node snode; /* Linked to bpf_sk_storage */ > struct bpf_sk_storage __rcu *sk_storage; > struct rcu_head rcu; > - /* 8 bytes hole */ > + u8 clone:1; > + /* 7 bytes hole */ > /* The data is stored in aother cacheline to minimize > * the number of cachelines access during a cache hit. > */ > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) > return 0; > } > > -/* Called by __sk_destruct() */ > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ > void bpf_sk_storage_free(struct sock *sk) > { > struct bpf_sk_storage_elem *selem; > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) > return err; > } > > +static struct bpf_sk_storage_elem * > +bpf_sk_storage_clone_elem(struct sock *newsk, > + struct bpf_sk_storage_map *smap, > + struct bpf_sk_storage_elem *selem) > +{ > + struct bpf_sk_storage_elem *copy_selem; > + > + copy_selem = selem_alloc(smap, newsk, NULL, true); > + if (!copy_selem) > + return ERR_PTR(-ENOMEM); > + > + if (map_value_has_spin_lock(&smap->map)) > + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, > + SDATA(selem)->data, true); > + else > + copy_map_value(&smap->map, SDATA(copy_selem)->data, > + SDATA(selem)->data); > + > + return copy_selem; > +} > + > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) > +{ > + struct bpf_sk_storage *new_sk_storage = NULL; > + struct bpf_sk_storage *sk_storage; > + struct bpf_sk_storage_elem *selem; > + int ret; > + > + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); > + > + rcu_read_lock(); > + sk_storage = rcu_dereference(sk->sk_bpf_storage); > + > + if (!sk_storage || hlist_empty(&sk_storage->list)) > + goto out; > + > + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { > + struct bpf_sk_storage_map *smap; > + struct bpf_sk_storage_elem *copy_selem; > + > + if (!selem->clone) > + continue; > + > + smap = rcu_dereference(SDATA(selem)->smap); > + if (!smap) > + continue; > + > + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); > + if (IS_ERR(copy_selem)) { > + ret = PTR_ERR(copy_selem); > + goto err; > + } > + > + if (!new_sk_storage) { > + ret = sk_storage_alloc(newsk, smap, copy_selem); > + if (ret) { > + kfree(copy_selem); > + atomic_sub(smap->elem_size, > + &newsk->sk_omem_alloc); > + goto err; > + } > + > + new_sk_storage = rcu_dereference(copy_selem->sk_storage); > + continue; > + } > + > + raw_spin_lock_bh(&new_sk_storage->lock); > + selem_link_map(smap, copy_selem); > + __selem_link_sk(new_sk_storage, copy_selem); > + raw_spin_unlock_bh(&new_sk_storage->lock); Considering in this particular case, new socket is not visible to outside world yet (both kernel and user space), map_delete/map_update operations are not applicable in this situation, so the above raw_spin_lock_bh() probably not needed. > + } > + > +out: > + rcu_read_unlock(); > + return 0; > + > +err: > + rcu_read_unlock(); > + > + bpf_sk_storage_free(newsk); > + return ret; > +} > + > BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, > void *, value, u64, flags) > { > struct bpf_sk_storage_data *sdata; > > - if (flags > BPF_SK_STORAGE_GET_F_CREATE) > + if (flags & ~BPF_SK_STORAGE_GET_F_MASK) > + return (unsigned long)NULL; > + > + if ((flags & BPF_SK_STORAGE_GET_F_CLONE) && > + !(flags & BPF_SK_STORAGE_GET_F_CREATE)) > return (unsigned long)NULL; > > sdata = sk_storage_lookup(sk, map, true); > if (sdata) > return (unsigned long)sdata->data; > > - if (flags == BPF_SK_STORAGE_GET_F_CREATE && > + if ((flags & BPF_SK_STORAGE_GET_F_CREATE) && > /* Cannot add new elem to a going away sk. > * Otherwise, the new elem may become a leak > * (and also other memory issues during map > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, > /* sk must be a fullsock (guaranteed by verifier), > * so sock_gen_put() is unnecessary. > */ > + if (!IS_ERR(sdata)) > + SELEM(sdata)->clone = > + !!(flags & BPF_SK_STORAGE_GET_F_CLONE); > sock_put(sk); > return IS_ERR(sdata) ? > (unsigned long)NULL : (unsigned long)sdata->data; > diff --git a/net/core/sock.c b/net/core/sock.c > index d57b0cc995a0..f5e801a9cea4 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) > goto out; > } > RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); > -#ifdef CONFIG_BPF_SYSCALL > - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); > -#endif > + > + if (bpf_sk_storage_clone(sk, newsk)) { > + sk_free_unlock_clone(newsk); > + newsk = NULL; > + goto out; > + } > > newsk->sk_err = 0; > newsk->sk_err_soft = 0; >