On 8/7/19 5:05 PM, Stanislav Fomichev wrote:
> On 08/07, Yonghong Song wrote:
>>
>>
>> On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
>>> Add new helper bpf_sk_storage_clone which optionally clones sk storage
>>> and call it from sk_clone_lock. Reuse the gap in
>>> bpf_sk_storage_elem to store a clone/non-clone flag.
>>>
>>> Cc: Martin KaFai Lau <kafai@xxxxxx>
>>> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
>>
>> I tried to see whether I could find any unhandled race conditions in
>> the code, but I failed. So except for a minor comment below,
> Thanks for the review!
>
>> Acked-by: Yonghong Song <yhs@xxxxxx>
>>
>>> ---
>>>   include/net/bpf_sk_storage.h |  10 ++++
>>>   include/uapi/linux/bpf.h     |   1 +
>>>   net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
>>>   net/core/sock.c              |   9 ++--
>>>   4 files changed, 115 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
>>> index b9dcb02e756b..8e4f831d2e52 100644
>>> --- a/include/net/bpf_sk_storage.h
>>> +++ b/include/net/bpf_sk_storage.h
>>> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
>>>   extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>>>   extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>>>
>>> +#ifdef CONFIG_BPF_SYSCALL
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
>>> +#else
>>> +static inline int bpf_sk_storage_clone(const struct sock *sk,
>>> +				       struct sock *newsk)
>>> +{
>>> +	return 0;
>>> +}
>>> +#endif
>>> +
>>>   #endif /* _BPF_SK_STORAGE_H */
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 4393bd4b2419..00459ca4c8cf 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>>>
>>>   /* BPF_FUNC_sk_storage_get flags */
>>>   #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
>>> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
>>>
>>>   /* Mode for BPF_FUNC_skb_adjust_room helper. */
>>>   enum bpf_adj_room_mode {
>>> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
>>> index 94c7f77ecb6b..b6dea67965bc 100644
>>> --- a/net/core/bpf_sk_storage.c
>>> +++ b/net/core/bpf_sk_storage.c
>>> @@ -12,6 +12,9 @@
>>>
>>>   static atomic_t cache_idx;
>>>
>>> +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
>>> +					 BPF_SK_STORAGE_GET_F_CLONE)
>>> +
>>>   struct bucket {
>>>   	struct hlist_head list;
>>>   	raw_spinlock_t lock;
>>> @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
>>>   	struct hlist_node snode;	/* Linked to bpf_sk_storage */
>>>   	struct bpf_sk_storage __rcu *sk_storage;
>>>   	struct rcu_head rcu;
>>> -	/* 8 bytes hole */
>>> +	u8 clone:1;
>>> +	/* 7 bytes hole */
>>>   	/* The data is stored in another cacheline to minimize
>>>   	 * the number of cacheline accesses during a cache hit.
>>>   	 */
>>> @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
>>>   	return 0;
>>>   }
>>>
>>> -/* Called by __sk_destruct() */
>>> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
>>>   void bpf_sk_storage_free(struct sock *sk)
>>>   {
>>>   	struct bpf_sk_storage_elem *selem;
>>> @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
>>>   	return err;
>>>   }
>>>
>>> +static struct bpf_sk_storage_elem *
>>> +bpf_sk_storage_clone_elem(struct sock *newsk,
>>> +			  struct bpf_sk_storage_map *smap,
>>> +			  struct bpf_sk_storage_elem *selem)
>>> +{
>>> +	struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +	copy_selem = selem_alloc(smap, newsk, NULL, true);
>>> +	if (!copy_selem)
>>> +		return ERR_PTR(-ENOMEM);
>>> +
>>> +	if (map_value_has_spin_lock(&smap->map))
>>> +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
>>> +				      SDATA(selem)->data, true);
>>> +	else
>>> +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
>>> +			       SDATA(selem)->data);
>>> +
>>> +	return copy_selem;
>>> +}
>>> +
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>>> +{
>>> +	struct bpf_sk_storage *new_sk_storage = NULL;
>>> +	struct bpf_sk_storage *sk_storage;
>>> +	struct bpf_sk_storage_elem *selem;
>>> +	int ret;
>>> +
>>> +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
>>> +
>>> +	rcu_read_lock();
>>> +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
>>> +
>>> +	if (!sk_storage || hlist_empty(&sk_storage->list))
>>> +		goto out;
>>> +
>>> +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
>>> +		struct bpf_sk_storage_map *smap;
>>> +		struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +		if (!selem->clone)
>>> +			continue;
>>> +
>>> +		smap = rcu_dereference(SDATA(selem)->smap);
>>> +		if (!smap)
>>> +			continue;
>>> +
>>> +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
>>> +		if (IS_ERR(copy_selem)) {
>>> +			ret = PTR_ERR(copy_selem);
>>> +			goto err;
>>> +		}
>>> +
>>> +		if (!new_sk_storage) {
>>> +			ret = sk_storage_alloc(newsk, smap, copy_selem);
>>> +			if (ret) {
>>> +				kfree(copy_selem);
>>> +				atomic_sub(smap->elem_size,
>>> +					   &newsk->sk_omem_alloc);
>>> +				goto err;
>>> +			}
>>> +
>>> +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
>>> +			continue;
>>> +		}
>>> +
>>> +		raw_spin_lock_bh(&new_sk_storage->lock);
>>> +		selem_link_map(smap, copy_selem);
>>> +		__selem_link_sk(new_sk_storage, copy_selem);
>>> +		raw_spin_unlock_bh(&new_sk_storage->lock);
>>
>> Considering that in this particular case the new socket is not visible
>> to the outside world yet (neither kernel nor user space),
>> map_delete/map_update operations are not applicable here, so
>> the above raw_spin_lock_bh() is probably not needed.
> I agree, it's doing nothing, but __selem_link_sk has the following comment:
>   /* sk_storage->lock must be held and sk_storage->list cannot be empty */
>
> Just wanted to keep that invariant for this call site as well (in case
> we add some lockdep enforcement or something else). WDYT?

Agree. Let us keep the locking consistent with other uses in the same
file. This is not the critical path.
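
For readers following the thread, here is a minimal sketch of how a BPF
program could use the helper flag proposed in this revision. It is
illustrative only: the map name, section name, and stored value are made
up, and the BTF-style map definition and header paths assume a libbpf
recent enough to support them. Passing BPF_SK_STORAGE_GET_F_CLONE along
with BPF_SK_STORAGE_GET_F_CREATE to bpf_sk_storage_get() marks the
element so that bpf_sk_storage_clone() copies it into the child socket
when the parent is cloned on accept():

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical sk storage map; SK_STORAGE maps require BPF_F_NO_PREALLOC. */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_stg_map SEC(".maps");

SEC("cgroup_skb/egress")
int mark_for_clone(struct __sk_buff *skb)
{
	struct bpf_sock *sk;
	__u64 *val;

	sk = skb->sk;
	if (!sk)
		return 1;

	/* bpf_sk_storage_get() needs a full socket. */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	/* F_CREATE allocates the element if it does not exist yet;
	 * F_CLONE (added by this patch) sets selem->clone so that
	 * bpf_sk_storage_clone() copies it to child sockets.
	 */
	val = bpf_sk_storage_get(&sk_stg_map, sk, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE |
				 BPF_SK_STORAGE_GET_F_CLONE);
	if (val)
		*val = 42;	/* arbitrary per-socket value */

	return 1;
}

char _license[] SEC("license") = "GPL";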