> On Mar 24, 2023, at 2:56 PM, Stanislav Fomichev <sdf@xxxxxxxxxx> wrote:
>
> On 03/23, Aditi Ghag wrote:
>> Batch UDP sockets from the BPF iterator so that BPF/kernel helpers
>> executed in BPF programs can use overlapping locking semantics. This
>> facilitates the BPF socket destroy kfunc (introduced by follow-up
>> patches) executing from BPF iterator programs.
>
>> Previously, BPF iterators acquired the sock lock and the sockets hash
>> table bucket lock while executing BPF programs. This prevented BPF
>> helpers that reacquire these locks from being executed from BPF
>> iterators. With the batching approach, we acquire a bucket lock, batch
>> all the bucket sockets, and then release the bucket lock. This enables
>> BPF or kernel helpers to skip sock locking when invoked in the
>> supported BPF contexts.
>
>> The batching logic is similar to the logic implemented in the TCP
>> iterator:
>> https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@xxxxxx/.
>
>> Suggested-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
>> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
>> ---
>>  include/net/udp.h |   1 +
>>  net/ipv4/udp.c    | 255 ++++++++++++++++++++++++++++++++++++++++++++--
>>  2 files changed, 247 insertions(+), 9 deletions(-)
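
[Editorial note: for concreteness, this is the kind of BPF-side program
the series is meant to enable. It is an illustrative sketch only:
bpf_sock_destroy() is the kfunc introduced by the follow-up patches, and
the program name and port filter are made up for the example.]

    /* udp_iter.bpf.c - hypothetical example, not part of this patch */
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    /* kfunc from the follow-up patches in this series */
    extern int bpf_sock_destroy(struct sock_common *sk) __ksym;

    SEC("iter/udp")
    int destroy_udp(struct bpf_iter__udp *ctx)
    {
        struct udp_sock *udp_sk = ctx->udp_sk;

        /* NULL for the SEQ_START_TOKEN pass */
        if (!udp_sk)
            return 0;

        /* Made-up filter: only sockets bound to source port 1234. */
        if (udp_sk->inet.inet_sport != bpf_htons(1234))
            return 0;

        /* Callable from the iterator because, with batching, seq_show()
         * no longer runs with the bucket lock held; only the sock lock
         * is taken around the program.
         */
        bpf_sock_destroy((struct sock_common *)udp_sk);
        return 0;
    }

    char _license[] SEC("license") = "GPL";
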
>
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index de4b528522bb..d2999447d3f2 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>>  struct udp_iter_state {
>>  	struct seq_net_private	p;
>>  	int			bucket;
>> +	int			offset;
>>  	struct udp_seq_afinfo	*bpf_seq_afinfo;
>>  };
>
>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>> index c605d171eb2d..58c620243e47 100644
>> --- a/net/ipv4/udp.c
>> +++ b/net/ipv4/udp.c
>> @@ -3152,6 +3152,171 @@ struct bpf_iter__udp {
>>  	int bucket __aligned(8);
>>  };
>
>> +struct bpf_udp_iter_state {
>> +	struct udp_iter_state state;
>> +	unsigned int cur_sk;
>> +	unsigned int end_sk;
>> +	unsigned int max_sk;
>> +	struct sock **batch;
>> +	bool st_bucket_done;
>> +};
>> +
>> +static unsigned short seq_file_family(const struct seq_file *seq);
>> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
>> +				      unsigned int new_batch_sz);
>> +
>> +static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>> +{
>> +	unsigned short family = seq_file_family(seq);
>> +
>> +	/* AF_UNSPEC is used as a match all */
>> +	return ((family == AF_UNSPEC || family == sk->sk_family) &&
>> +		net_eq(sock_net(sk), seq_file_net(seq)));
>> +}
>> +
>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>> +{
>> +	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>> +	struct net *net = seq_file_net(seq);
>> +	struct udp_seq_afinfo *afinfo = state->bpf_seq_afinfo;
>> +	struct udp_table *udptable;
>> +	struct sock *first_sk = NULL;
>> +	struct sock *sk;
>> +	unsigned int bucket_sks = 0;
>> +	bool resized = false;
>> +	int offset = 0;
>> +	int new_offset;
>> +
>> +	/* The current batch is done, so advance the bucket. */
>> +	if (iter->st_bucket_done) {
>> +		state->bucket++;
>> +		state->offset = 0;
>> +	}
>> +
>> +	udptable = udp_get_table_afinfo(afinfo, net);
>> +
>> +	if (state->bucket > udptable->mask) {
>> +		state->bucket = 0;
>> +		state->offset = 0;
>> +		return NULL;
>> +	}
>> +
>> +again:
>> +	/* New batch for the next bucket.
>> +	 * Iterate over the hash table to find a bucket with sockets matching
>> +	 * the iterator attributes, and return the first matching socket from
>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>> +	 * before releasing the bucket lock. This allows BPF programs that are
>> +	 * called in seq_show to acquire the bucket lock if needed.
>> +	 */
>> +	iter->cur_sk = 0;
>> +	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	first_sk = NULL;
>> +	bucket_sks = 0;
>> +	offset = state->offset;
>> +	new_offset = offset;
>> +
>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>> +		struct udp_hslot *hslot = &udptable->hash[state->bucket];
>> +
>> +		if (hlist_empty(&hslot->head)) {
>> +			offset = 0;
>> +			continue;
>> +		}
>> +
>> +		spin_lock_bh(&hslot->lock);
>> +		/* Resume from the last saved position in a bucket before
>> +		 * iterator was stopped.
>> +		 */
>> +		while (offset-- > 0) {
>> +			sk_for_each(sk, &hslot->head)
>> +				continue;
>> +		}
>> +		sk_for_each(sk, &hslot->head) {
>> +			if (seq_sk_match(seq, sk)) {
>> +				if (!first_sk)
>> +					first_sk = sk;
>> +				if (iter->end_sk < iter->max_sk) {
>> +					sock_hold(sk);
>> +					iter->batch[iter->end_sk++] = sk;
>> +				}
>> +				bucket_sks++;
>> +			}
>> +			new_offset++;
>> +		}
>> +		spin_unlock_bh(&hslot->lock);
>> +
>> +		if (first_sk)
>> +			break;
>> +
>> +		/* Reset the current bucket's offset before moving to the next bucket. */
>> +		offset = 0;
>> +		new_offset = 0;
>> +	}
>> +
>> +	/* All done: no batch made. */
>> +	if (!first_sk)
>> +		goto ret;
>> +
>> +	if (iter->end_sk == bucket_sks) {
>> +		/* Batching is done for the current bucket; return the first
>> +		 * socket to be iterated from the batch.
>> +		 */
>> +		iter->st_bucket_done = true;
>> +		goto ret;
>> +	}
>> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
>> +		resized = true;
>> +		/* Go back to the previous bucket to resize its batch. */
>> +		state->bucket--;
>> +		goto again;
>> +	}
>> +ret:
>> +	state->offset = new_offset;
>> +	return first_sk;
>> +}
>> +
>> +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>> +{
>> +	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>> +	struct sock *sk;
>> +
>> +	/* Whenever seq_next() is called, the iter->cur_sk is
>> +	 * done with seq_show(), so unref the iter->cur_sk.
>> +	 */
>> +	if (iter->cur_sk < iter->end_sk) {
>> +		sock_put(iter->batch[iter->cur_sk++]);
>> +		++state->offset;
>> +	}
>> +
>> +	/* After updating iter->cur_sk, check if there are more sockets
>> +	 * available in the current bucket batch.
>> +	 */
>> +	if (iter->cur_sk < iter->end_sk) {
>> +		sk = iter->batch[iter->cur_sk];
>> +	} else {
>> +		/* Prepare a new batch. */
>> +		sk = bpf_iter_udp_batch(seq);
>> +	}
>> +
>> +	++*pos;
>> +	return sk;
>> +}
>> +
>> +static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
>> +{
>> +	/* bpf iter does not support lseek, so it always
>> +	 * continues from where it was stop()-ped.
>> +	 */
>> +	if (*pos)
>> +		return bpf_iter_udp_batch(seq);
>> +
>> +	return SEQ_START_TOKEN;
>> +}
>> +
>>  static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
>>  			     struct udp_sock *udp_sk, uid_t uid, int bucket)
>>  {
>> @@ -3172,18 +3337,38 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
>>  	struct bpf_prog *prog;
>>  	struct sock *sk = v;
>>  	uid_t uid;
>> +	bool slow;
>> +	int rc;
>
>>  	if (v == SEQ_START_TOKEN)
>>  		return 0;
>
>
> [..]
>
>> +	slow = lock_sock_fast(sk);
>> +
>> +	if (unlikely(sk_unhashed(sk))) {
>> +		rc = SEQ_SKIP;
>> +		goto unlock;
>> +	}
>> +
>
> Should we use the non-fast version here for consistency with TCP?

We could, but I don't see a problem with acquiring the fast version for
UDP, so we could just stick with it. The TCP change warrants a code
comment, though; I'll add it in the next revision.
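
[Editorial note: for readers following along, the difference between the
two locking patterns is roughly the following; an illustrative fragment,
kernel context assumed.]

    #include <net/sock.h>

    static void show_fast(struct sock *sk)
    {
        /* In the uncontended case, lock_sock_fast() keeps the sock
         * spinlock held (BHs disabled) rather than marking the lock
         * owned; 'slow' records whether a full lock_sock() had to be
         * taken instead, so unlock_sock_fast() knows how to unlock.
         */
        bool slow = lock_sock_fast(sk);

        /* ... work on the socket ... */
        unlock_sock_fast(sk, slow);
    }

    static void show_plain(struct sock *sk)
    {
        lock_sock(sk);    /* always marks the lock owned; may sleep */
        /* ... */
        release_sock(sk);
    }
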
>
>>  	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
>>  	meta.seq = seq;
>>  	prog = bpf_iter_get_info(&meta, false);
>> -	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
>> +	rc = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
>> +
>> +unlock:
>> +	unlock_sock_fast(sk, slow);
>> +	return rc;
>> +}
>> +
>> +static void bpf_iter_udp_unref_batch(struct bpf_udp_iter_state *iter)
>> +{
>> +	while (iter->cur_sk < iter->end_sk)
>> +		sock_put(iter->batch[iter->cur_sk++]);
>> +}
>
>>  static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
>>  {
>> +	struct bpf_udp_iter_state *iter = seq->private;
>>  	struct bpf_iter_meta meta;
>>  	struct bpf_prog *prog;
>
>> @@ -3194,15 +3379,31 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
>>  		(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
>>  	}
>
>> -	udp_seq_stop(seq, v);
>> +	if (iter->cur_sk < iter->end_sk) {
>> +		bpf_iter_udp_unref_batch(iter);
>> +		iter->st_bucket_done = false;
>> +	}
>>  }
>
>>  static const struct seq_operations bpf_iter_udp_seq_ops = {
>> -	.start		= udp_seq_start,
>> -	.next		= udp_seq_next,
>> +	.start		= bpf_iter_udp_seq_start,
>> +	.next		= bpf_iter_udp_seq_next,
>>  	.stop		= bpf_iter_udp_seq_stop,
>>  	.show		= bpf_iter_udp_seq_show,
>>  };
>> +
>> +static unsigned short seq_file_family(const struct seq_file *seq)
>> +{
>> +	const struct udp_seq_afinfo *afinfo;
>> +
>> +	/* BPF iterator: bpf programs to filter sockets. */
>> +	if (seq->op == &bpf_iter_udp_seq_ops)
>> +		return AF_UNSPEC;
>> +
>> +	/* Proc fs iterator */
>> +	afinfo = pde_data(file_inode(seq->file));
>> +	return afinfo->family;
>> +}
>>  #endif
>
>>  const struct seq_operations udp_seq_ops = {
>> @@ -3413,9 +3614,30 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
>>  DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
>>  		     struct udp_sock *udp_sk, uid_t uid, int bucket)
>
>> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
>> +				      unsigned int new_batch_sz)
>> +{
>> +	struct sock **new_batch;
>> +
>> +	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
>> +				   GFP_USER | __GFP_NOWARN);
>> +	if (!new_batch)
>> +		return -ENOMEM;
>> +
>> +	bpf_iter_udp_unref_batch(iter);
>> +	kvfree(iter->batch);
>> +	iter->batch = new_batch;
>> +	iter->max_sk = new_batch_sz;
>> +
>> +	return 0;
>> +}
>> +
>> +#define INIT_BATCH_SZ 16
>> +
>>  static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>>  {
>> -	struct udp_iter_state *st = priv_data;
>> +	struct bpf_udp_iter_state *iter = priv_data;
>> +	struct udp_iter_state *st = &iter->state;
>>  	struct udp_seq_afinfo *afinfo;
>>  	int ret;
>
>> @@ -3427,24 +3649,39 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>>  	afinfo->udp_table = NULL;
>>  	st->bpf_seq_afinfo = afinfo;
>>  	ret = bpf_iter_init_seq_net(priv_data, aux);
>> -	if (ret)
>> +	if (ret) {
>>  		kfree(afinfo);
>> +		return ret;
>> +	}
>> +	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
>> +	if (ret) {
>> +		bpf_iter_fini_seq_net(priv_data);
>> +		return ret;
>> +	}
>> +	iter->cur_sk = 0;
>> +	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	st->bucket = 0;
>> +	st->offset = 0;
>> +
>>  	return ret;
>>  }
>
>>  static void bpf_iter_fini_udp(void *priv_data)
>>  {
>> -	struct udp_iter_state *st = priv_data;
>> +	struct bpf_udp_iter_state *iter = priv_data;
>> +	struct udp_iter_state *st = &iter->state;
>
>> -	kfree(st->bpf_seq_afinfo);
>>  	bpf_iter_fini_seq_net(priv_data);
>> +	kfree(st->bpf_seq_afinfo);
>> +	kvfree(iter->batch);
>>  }
>
>>  static const struct bpf_iter_seq_info udp_seq_info = {
>>  	.seq_ops		= &bpf_iter_udp_seq_ops,
>>  	.init_seq_private	= bpf_iter_init_udp,
>>  	.fini_seq_private	= bpf_iter_fini_udp,
>> -	.seq_priv_size		= sizeof(struct udp_iter_state),
>> +	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
>>  };
>
>>  static struct bpf_iter_reg udp_reg_info = {
>> -- 
>> 2.34.1
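
[Editorial note: the seq_file callbacks wired up above are driven by
read()s on an iterator fd. A hypothetical userspace driver for the
sketch earlier in this mail; the object file path and program name are
made up, and each read() batch maps to one start/show/next/stop cycle.]

    #include <stdio.h>
    #include <unistd.h>
    #include <bpf/libbpf.h>
    #include <bpf/bpf.h>

    int main(void)
    {
        struct bpf_object *obj;
        struct bpf_program *prog;
        struct bpf_link *link;
        char buf[4096];
        int iter_fd;

        obj = bpf_object__open_file("udp_iter.bpf.o", NULL);
        if (!obj || bpf_object__load(obj))
            return 1;

        prog = bpf_object__find_program_by_name(obj, "destroy_udp");
        if (!prog)
            return 1;

        link = bpf_program__attach_iter(prog, NULL);
        if (!link)
            return 1;

        iter_fd = bpf_iter_create(bpf_link__fd(link));
        if (iter_fd < 0)
            return 1;

        /* Drain the iterator; the program runs once per batched socket. */
        while (read(iter_fd, buf, sizeof(buf)) > 0)
            ;

        close(iter_fd);
        bpf_link__destroy(link);
        bpf_object__close(obj);
        return 0;
    }
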