On Tue, 2023-04-18 at 15:31 +0000, Aditi Ghag wrote:
> Batch UDP sockets from the BPF iterator, which allows overlapping
> locking semantics in BPF/kernel helpers executed from BPF programs.
> This enables the BPF socket destroy kfunc (introduced by follow-up
> patches) to execute from BPF iterator programs.
> 
> Previously, BPF iterators acquired the sock lock and the sockets hash
> table bucket lock while executing BPF programs. This prevented BPF
> helpers that acquire these same locks from being executed from BPF
> iterators. With the batching approach, we acquire a bucket lock, batch
> all the bucket sockets, and then release the bucket lock. This enables
> BPF or kernel helpers to skip sock locking when invoked in the
> supported BPF contexts.
> 
> The batching logic is similar to the logic implemented in the TCP
> iterator:
> https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@xxxxxx/.
> 
> Suggested-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
> ---
>  net/ipv4/udp.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 203 insertions(+), 6 deletions(-)
> 
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 8689ed171776..f1c001641e53 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -3148,6 +3148,145 @@ struct bpf_iter__udp {
>  	int bucket __aligned(8);
>  };
>  
> +struct bpf_udp_iter_state {
> +	struct udp_iter_state state;
> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	int offset;
> +	struct sock **batch;
> +	bool st_bucket_done;
> +};
> +
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz);
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct udp_seq_afinfo afinfo;
> +	struct udp_table *udptable;
> +	unsigned int batch_sks = 0;
> +	bool resized = false;
> +	struct sock *sk;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done) {
> +		state->bucket++;
> +		iter->offset = 0;
> +	}
> +
> +	afinfo.family = AF_UNSPEC;
> +	afinfo.udp_table = NULL;
> +	udptable = udp_get_table_afinfo(&afinfo, net);
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	batch_sks = 0;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
> +
> +		if (hlist_empty(&hslot2->head)) {
> +			iter->offset = 0;
> +			continue;
> +		}
> +
> +		spin_lock_bh(&hslot2->lock);
> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				/* Resume from the last iterated socket at the
> +				 * offset in the bucket before iterator was stopped.
> +				 */
> +				if (iter->offset) {
> +					--iter->offset;
> +					continue;
> +				}
> +				if (iter->end_sk < iter->max_sk) {
> +					sock_hold(sk);
> +					iter->batch[iter->end_sk++] = sk;
> +				}
> +				batch_sks++;
> +			}
> +		}
> +		spin_unlock_bh(&hslot2->lock);
> +
> +		if (iter->end_sk)
> +			break;
> +
> +		/* Reset the current bucket's offset before moving to the next bucket. */
> +		iter->offset = 0;
> +	}
> +
> +	/* All done: no batch made. */
> +	if (!iter->end_sk)
> +		return NULL;
> +
> +	if (iter->end_sk == batch_sks) {
> +		/* Batching is done for the current bucket; return the first
> +		 * socket to be iterated from the batch.
> +		 */
> +		iter->st_bucket_done = true;
> +		goto ret;
> +	}
> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
> +		resized = true;
> +		/* Go back to the previous bucket to resize its batch. */
> +		state->bucket--;
> +		goto again;
> +	}
> +ret:
> +	return iter->batch[0];
> +}
> +
> +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct sock *sk;
> +
> +	/* Whenever seq_next() is called, the iter->cur_sk is
> +	 * done with seq_show(), so unref the iter->cur_sk.
> +	 */
> +	if (iter->cur_sk < iter->end_sk) {
> +		sock_put(iter->batch[iter->cur_sk++]);
> +		++iter->offset;
> +	}
> +
> +	/* After updating iter->cur_sk, check if there are more sockets
> +	 * available in the current bucket batch.
> +	 */
> +	if (iter->cur_sk < iter->end_sk) {
> +		sk = iter->batch[iter->cur_sk];
> +	} else {
> +		// Prepare a new batch.

Minor nit: please use /* */ even for single line comments.

Thanks

Paolo
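For reference, the requested style change in bpf_iter_udp_seq_next() would
amount to something like the sketch below; the bpf_iter_udp_batch() call
after the comment is assumed here, since the quoted hunk is trimmed at the
comment line:

	if (iter->cur_sk < iter->end_sk) {
		sk = iter->batch[iter->cur_sk];
	} else {
		/* Prepare a new batch. */
		sk = bpf_iter_udp_batch(seq);
	}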
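More generally, the locking scheme the changelog describes (hold the bucket
lock only long enough to take references on the matching sockets, then run
the per-socket work with the lock dropped) is the usual batch-then-process
idiom. A minimal userspace sketch of that idiom, with illustrative names
(struct bucket, collect_batch, process_batch) that are not kernel APIs, and
a plain refcount standing in for sock_hold()/sock_put():

#include <pthread.h>
#include <stddef.h>

struct entry {
	struct entry *next;
	int refcnt;			/* protected by bucket->lock */
	int key;
};

struct bucket {
	pthread_mutex_t lock;
	struct entry *head;
};

/* Pin up to max matching entries while holding the bucket lock, then
 * drop the lock before any of them are processed.
 */
static size_t collect_batch(struct bucket *b, struct entry **batch,
			    size_t max, int key)
{
	struct entry *e;
	size_t n = 0;

	pthread_mutex_lock(&b->lock);
	for (e = b->head; e && n < max; e = e->next) {
		if (e->key != key)
			continue;
		e->refcnt++;		/* keep the entry alive past unlock */
		batch[n++] = e;
	}
	pthread_mutex_unlock(&b->lock);
	return n;
}

/* Run cb() on each batched entry without the bucket lock held, so cb()
 * is free to take that lock (or a per-entry lock) itself, then drop the
 * references taken in collect_batch().
 */
static void process_batch(struct bucket *b, struct entry **batch,
			  size_t n, void (*cb)(struct entry *))
{
	size_t i;

	for (i = 0; i < n; i++)
		cb(batch[i]);

	pthread_mutex_lock(&b->lock);
	for (i = 0; i < n; i++)
		batch[i]->refcnt--;
	pthread_mutex_unlock(&b->lock);
}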