Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()

On 8/7/19 5:05 PM, Stanislav Fomichev wrote:
> On 08/07, Yonghong Song wrote:
>>
>>
>> On 8/7/19 8:47 AM, Stanislav Fomichev wrote:
>>> Add a new helper, bpf_sk_storage_clone, which optionally clones sk
>>> storage, and call it from sk_clone_lock(). Reuse the gap in
>>> bpf_sk_storage_elem to store the clone/non-clone flag.
>>>
>>> Cc: Martin KaFai Lau <kafai@xxxxxx>
>>> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
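
For context, a BPF-side consumer of the new flag could look roughly like
the sketch below. This is a minimal sketch, not taken from the patch or
its selftests: the map and program names are illustrative, and it assumes
a program type (such as sockops) that is allowed to call
bpf_sk_storage_get().

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Illustrative socket-storage map; any value type works. */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_stg_map SEC(".maps");

SEC("sockops")
int mark_clone(struct bpf_sock_ops *ctx)
{
	struct bpf_sock *sk = ctx->sk;
	__u64 *val;

	if (!sk)
		return 1;

	/* F_CLONE asks the kernel to copy this element to the child
	 * socket when the parent is cloned on accept().
	 */
	val = bpf_sk_storage_get(&sk_stg_map, sk, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE |
				 BPF_SK_STORAGE_GET_F_CLONE);
	if (val)
		*val = 1;

	return 1;
}
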
>>
>> I tried to see whether I could find any race conditions the code
>> might have missed, but I failed. So, except for a minor comment below,
> Thanks for the review!
> 
>> Acked-by: Yonghong Song <yhs@xxxxxx>
>>
>>> ---
>>>    include/net/bpf_sk_storage.h |  10 ++++
>>>    include/uapi/linux/bpf.h     |   1 +
>>>    net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
>>>    net/core/sock.c              |   9 ++--
>>>    4 files changed, 115 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
>>> index b9dcb02e756b..8e4f831d2e52 100644
>>> --- a/include/net/bpf_sk_storage.h
>>> +++ b/include/net/bpf_sk_storage.h
>>> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
>>>    extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>>>    extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>>>    
>>> +#ifdef CONFIG_BPF_SYSCALL
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
>>> +#else
>>> +static inline int bpf_sk_storage_clone(const struct sock *sk,
>>> +				       struct sock *newsk)
>>> +{
>>> +	return 0;
>>> +}
>>> +#endif
>>> +
>>>    #endif /* _BPF_SK_STORAGE_H */
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 4393bd4b2419..00459ca4c8cf 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2931,6 +2931,7 @@ enum bpf_func_id {
>>>    
>>>    /* BPF_FUNC_sk_storage_get flags */
>>>    #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
>>> +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
>>>    
>>>    /* Mode for BPF_FUNC_skb_adjust_room helper. */
>>>    enum bpf_adj_room_mode {
>>> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
>>> index 94c7f77ecb6b..b6dea67965bc 100644
>>> --- a/net/core/bpf_sk_storage.c
>>> +++ b/net/core/bpf_sk_storage.c
>>> @@ -12,6 +12,9 @@
>>>    
>>>    static atomic_t cache_idx;
>>>    
>>> +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
>>> +					 BPF_SK_STORAGE_GET_F_CLONE)
>>> +
>>>    struct bucket {
>>>    	struct hlist_head list;
>>>    	raw_spinlock_t lock;
>>> @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
>>>    	struct hlist_node snode;	/* Linked to bpf_sk_storage */
>>>    	struct bpf_sk_storage __rcu *sk_storage;
>>>    	struct rcu_head rcu;
>>> -	/* 8 bytes hole */
>>> +	u8 clone:1;
>>> +	/* 7 bytes hole */
>>>    	/* The data is stored in another cacheline to minimize
>>>    	 * the number of cacheline accesses during a cache hit.
>>>    	 */
>>> @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
>>>    	return 0;
>>>    }
>>>    
>>> -/* Called by __sk_destruct() */
>>> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
>>>    void bpf_sk_storage_free(struct sock *sk)
>>>    {
>>>    	struct bpf_sk_storage_elem *selem;
>>> @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
>>>    	return err;
>>>    }
>>>    
>>> +static struct bpf_sk_storage_elem *
>>> +bpf_sk_storage_clone_elem(struct sock *newsk,
>>> +			  struct bpf_sk_storage_map *smap,
>>> +			  struct bpf_sk_storage_elem *selem)
>>> +{
>>> +	struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +	copy_selem = selem_alloc(smap, newsk, NULL, true);
>>> +	if (!copy_selem)
>>> +		return ERR_PTR(-ENOMEM);
>>> +
>>> +	if (map_value_has_spin_lock(&smap->map))
>>> +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
>>> +				      SDATA(selem)->data, true);
>>> +	else
>>> +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
>>> +			       SDATA(selem)->data);
>>> +
>>> +	return copy_selem;
>>> +}
>>> +
>>> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>>> +{
>>> +	struct bpf_sk_storage *new_sk_storage = NULL;
>>> +	struct bpf_sk_storage *sk_storage;
>>> +	struct bpf_sk_storage_elem *selem;
>>> +	int ret;
>>> +
>>> +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
>>> +
>>> +	rcu_read_lock();
>>> +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
>>> +
>>> +	if (!sk_storage || hlist_empty(&sk_storage->list))
>>> +		goto out;
>>> +
>>> +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
>>> +		struct bpf_sk_storage_map *smap;
>>> +		struct bpf_sk_storage_elem *copy_selem;
>>> +
>>> +		if (!selem->clone)
>>> +			continue;
>>> +
>>> +		smap = rcu_dereference(SDATA(selem)->smap);
>>> +		if (!smap)
>>> +			continue;
>>> +
>>> +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
>>> +		if (IS_ERR(copy_selem)) {
>>> +			ret = PTR_ERR(copy_selem);
>>> +			goto err;
>>> +		}
>>> +
>>> +		if (!new_sk_storage) {
>>> +			ret = sk_storage_alloc(newsk, smap, copy_selem);
>>> +			if (ret) {
>>> +				kfree(copy_selem);
>>> +				atomic_sub(smap->elem_size,
>>> +					   &newsk->sk_omem_alloc);
>>> +				goto err;
>>> +			}
>>> +
>>> +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
>>> +			continue;
>>> +		}
>>> +
>>> +		raw_spin_lock_bh(&new_sk_storage->lock);
>>> +		selem_link_map(smap, copy_selem);
>>> +		__selem_link_sk(new_sk_storage, copy_selem);
>>> +		raw_spin_unlock_bh(&new_sk_storage->lock);
>>
>> Considering that in this particular case the new socket is not yet
>> visible to the outside world (neither kernel nor user space),
>> map_delete/map_update operations are not applicable here, so
>> the above raw_spin_lock_bh() is probably not needed.
> I agree, it's doing nothing, but __selem_link_sk has the following comment:
> /* sk_storage->lock must be held and sk_storage->list cannot be empty */
> 
> I just wanted to keep that invariant at this call site as well (in case
> we later add some lockdep enforcement or something similar). WDYT?

Agree. Let us keep the locking to stay consistent with the other uses in
the same file. This is not on the critical path.
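
For the record, the lockdep enforcement mentioned above could be as
simple as the sketch below. This is hypothetical and not part of this
patch; the existing body of __selem_link_sk() is elided and unchanged.

static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
			    struct bpf_sk_storage_elem *selem)
{
	/* Turn the "sk_storage->lock must be held" comment into a
	 * runtime check when CONFIG_LOCKDEP is enabled.
	 */
	lockdep_assert_held(&sk_storage->lock);

	/* ... existing link logic unchanged ... */
}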




