On Tue, 2023-04-18 at 15:31 +0000, Aditi Ghag wrote:
> Batch UDP sockets from the BPF iterator, which allows overlapping
> locking semantics in BPF/kernel helpers executed from BPF programs.
> This enables the BPF socket destroy kfunc (introduced by follow-up
> patches) to execute from BPF iterator programs.
> 
> Previously, BPF iterators acquired the sock lock and the sockets hash
> table bucket lock while executing BPF programs. This prevented BPF
> helpers that acquire these same locks from being executed from BPF
> iterators. With the batching approach, we acquire a bucket lock, batch
> all the bucket sockets, and then release the bucket lock. This enables
> BPF or kernel helpers to skip sock locking when invoked in the
> supported BPF contexts.
> 
> The batching logic is similar to the logic implemented in the TCP
> iterator:
> https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@xxxxxx/.
> 
> Suggested-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
> ---
>  net/ipv4/udp.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 203 insertions(+), 6 deletions(-)
> 
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 8689ed171776..f1c001641e53 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -3148,6 +3148,145 @@ struct bpf_iter__udp {
>  	int bucket __aligned(8);
>  };
>  
> +struct bpf_udp_iter_state {
> +	struct udp_iter_state state;
> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	int offset;
> +	struct sock **batch;
> +	bool st_bucket_done;
> +};
> +
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz);
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct udp_seq_afinfo afinfo;
> +	struct udp_table *udptable;
> +	unsigned int batch_sks = 0;
> +	bool resized = false;
> +	struct sock *sk;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done) {
> +		state->bucket++;
> +		iter->offset = 0;
> +	}
> +
> +	afinfo.family = AF_UNSPEC;
> +	afinfo.udp_table = NULL;
> +	udptable = udp_get_table_afinfo(&afinfo, net);
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	batch_sks = 0;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
> +
> +		if (hlist_empty(&hslot2->head)) {
> +			iter->offset = 0;
> +			continue;
> +		}
> +
> +		spin_lock_bh(&hslot2->lock);
> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				/* Resume from the last iterated socket at the
> +				 * offset in the bucket before iterator was stopped.
> +				 */
> +				if (iter->offset) {
> +					--iter->offset;
> +					continue;
> +				}
> +				if (iter->end_sk < iter->max_sk) {
> +					sock_hold(sk);
> +					iter->batch[iter->end_sk++] = sk;
> +				}
> +				batch_sks++;
> +			}
> +		}
> +		spin_unlock_bh(&hslot2->lock);
> +
> +		if (iter->end_sk)
> +			break;
> +
> +		/* Reset the current bucket's offset before moving to the next bucket. */
> +		iter->offset = 0;
> +	}
> +
> +	/* All done: no batch made. */
> +	if (!iter->end_sk)
> +		return NULL;
> +
> +	if (iter->end_sk == batch_sks) {
> +		/* Batching is done for the current bucket; return the first
> +		 * socket to be iterated from the batch.
> +		 */
> +		iter->st_bucket_done = true;
> +		goto ret;
> +	}
> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
> +		resized = true;
> +		/* Go back to the previous bucket to resize its batch. */
> +		state->bucket--;
> +		goto again;
> +	}
> +ret:
> +	return iter->batch[0];
> +}
> +
> +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct sock *sk;
> +
> +	/* Whenever seq_next() is called, the iter->cur_sk is
> +	 * done with seq_show(), so unref the iter->cur_sk.
> +	 */
> +	if (iter->cur_sk < iter->end_sk) {
> +		sock_put(iter->batch[iter->cur_sk++]);
> +		++iter->offset;
> +	}
> +
> +	/* After updating iter->cur_sk, check if there are more sockets
> +	 * available in the current bucket batch.
> +	 */
> +	if (iter->cur_sk < iter->end_sk) {
> +		sk = iter->batch[iter->cur_sk];
> +	} else {
> +		// Prepare a new batch.

Minor nit: please use /* */ even for single line comments.

Thanks

Paolo
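For reference, the requested style change in bpf_iter_udp_seq_next() would
amount to something like the sketch below; the bpf_iter_udp_batch() call
after the comment is assumed here, since the quoted hunk is trimmed at the
comment line:

	if (iter->cur_sk < iter->end_sk) {
		sk = iter->batch[iter->cur_sk];
	} else {
		/* Prepare a new batch. */
		sk = bpf_iter_udp_batch(seq);
	}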
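More generally, the locking scheme the changelog describes (hold the bucket
lock only long enough to take references on the matching sockets, then run
the per-socket work with the lock dropped) is the usual batch-then-process
idiom. A minimal userspace sketch of that idiom, with illustrative names
(struct bucket, collect_batch, process_batch) that are not kernel APIs, and
a plain refcount standing in for sock_hold()/sock_put():

#include <pthread.h>
#include <stddef.h>

struct entry {
	struct entry *next;
	int refcnt;			/* protected by bucket->lock */
	int key;
};

struct bucket {
	pthread_mutex_t lock;
	struct entry *head;
};

/* Pin up to max matching entries while holding the bucket lock, then
 * drop the lock before any of them are processed.
 */
static size_t collect_batch(struct bucket *b, struct entry **batch,
			    size_t max, int key)
{
	struct entry *e;
	size_t n = 0;

	pthread_mutex_lock(&b->lock);
	for (e = b->head; e && n < max; e = e->next) {
		if (e->key != key)
			continue;
		e->refcnt++;		/* keep the entry alive past unlock */
		batch[n++] = e;
	}
	pthread_mutex_unlock(&b->lock);
	return n;
}

/* Run cb() on each batched entry without the bucket lock held, so cb()
 * is free to take that lock (or a per-entry lock) itself, then drop the
 * references taken in collect_batch().
 */
static void process_batch(struct bucket *b, struct entry **batch,
			  size_t n, void (*cb)(struct entry *))
{
	size_t i;

	for (i = 0; i < n; i++)
		cb(batch[i]);

	pthread_mutex_lock(&b->lock);
	for (i = 0; i < n; i++)
		batch[i]->refcnt--;
	pthread_mutex_unlock(&b->lock);
}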