On 2/23/23 1:53 PM, Aditi Ghag wrote:
+struct bpf_udp_iter_state {
+ struct udp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ struct sock **batch;
+ bool st_bucket_done;
+};
+
+static unsigned short seq_file_family(const struct seq_file *seq);
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz);
+
+static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct udp_iter_state *state = &iter->state;
+ struct net *net = seq_file_net(seq);
+ struct udp_seq_afinfo *afinfo = state->bpf_seq_afinfo;
+ struct udp_table *udptable;
+ struct sock *first_sk = NULL;
+ struct sock *sk;
+ unsigned int bucket_sks = 0;
+ bool first;
+ bool resized = false;
+
+ /* The current batch is done, so advance the bucket. */
+ if (iter->st_bucket_done)
+ state->bucket++;
+
+ udptable = udp_get_table_afinfo(afinfo, net);
+
+again:
+ /* New batch for the next bucket.
+ * Iterate over the hash table to find a bucket with sockets matching
+ * the iterator attributes, and return the first matching socket from
+ * the bucket. The remaining matched sockets from the bucket are batched
+ * before releasing the bucket lock. This allows BPF programs that are
+ * called in seq_show to acquire the bucket lock if needed.
+ */
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+ iter->st_bucket_done = false;
+ first = true;
+
+ for (; state->bucket <= udptable->mask; state->bucket++) {
+ struct udp_hslot *hslot = &udptable->hash[state->bucket];
Since it is mostly separated from the proc's udp-seq-file now, it may as well
iterate udptable->hash2, which is hashed by both address and port, so each
batch should be smaller.
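
Untested, but roughly like this (hash2 chains sockets through
skc_portaddr_node, so the walk would use udp_portaddr_for_each_entry()
instead of sk_for_each()):

	for (; state->bucket <= udptable->mask; state->bucket++) {
		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];

		if (hlist_empty(&hslot2->head))
			continue;

		spin_lock_bh(&hslot2->lock);
		udp_portaddr_for_each_entry(sk, &hslot2->head) {
			if (!seq_sk_match(seq, sk))
				continue;
			if (first) {
				first_sk = sk;
				first = false;
			}
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			bucket_sks++;
		}
		spin_unlock_bh(&hslot2->lock);
		if (first_sk)
			break;
	}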
+
+ if (hlist_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_for_each(sk, &hslot->head) {
+ if (seq_sk_match(seq, sk)) {
+ if (first) {
+ first_sk = sk;
+ first = false;
+ }
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+ bucket_sks++;
+ }
+ }
+ spin_unlock_bh(&hslot->lock);
+ if (first_sk)
+ break;
+ }
+
+ /* All done: no batch made. */
+ if (!first_sk)
+ return NULL;
I think first_sk and bucket_sks also need to be reset in the "again" case?
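i.e. reset them together with the other per-batch state under the "again:"
label, something like:

again:
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;
	first = true;
	first_sk = NULL;
	bucket_sks = 0;
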
If bpf_iter_udp_seq_stop() is called before a batch has been fully processed by
the bpf prog in ".show", how does the next bpf_iter_udp_seq_start() continue
from where it left off? The bpf_tcp_iter remembers the bucket and the
offset-in-this-bucket. I think bpf_udp_iter can do something similar.
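
Roughly (untested, and "offset" here is a new field in bpf_udp_iter_state,
named only for illustration): in bpf_iter_udp_seq_stop(), when the batch was
not fully consumed, remember how far into the bucket it got,

	if (iter->cur_sk != iter->end_sk)
		iter->offset += iter->cur_sk;

and in bpf_iter_udp_batch(), while holding the bucket lock, skip that many
matching sockets before starting the new batch (resetting iter->offset to 0
whenever state->bucket advances):

	sk_for_each(sk, &hslot->head) {
		if (!seq_sk_match(seq, sk))
			continue;
		if (iter->offset) {
			/* already shown before the iterator stopped */
			iter->offset--;
			continue;
		}
		/* batch sk as before */
	}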
+
+ if (iter->end_sk == bucket_sks) {
+ /* Batching is done for the current bucket; return the first
+ * socket to be iterated from the batch.
+ */
+ iter->st_bucket_done = true;
+ return first_sk;
+ }
+ if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
+ resized = true;
+ /* Go back to the previous bucket to resize its batch. */
+ state->bucket--;
+ goto again;
+ }
+ return first_sk;
+}
+
[ ... ]
static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
{
- struct udp_iter_state *st = priv_data;
+ struct bpf_udp_iter_state *iter = priv_data;
+ struct udp_iter_state *st = &iter->state;
struct udp_seq_afinfo *afinfo;
int ret;
@@ -3427,24 +3623,34 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
afinfo->udp_table = NULL;
st->bpf_seq_afinfo = afinfo;
Is bpf_seq_afinfo still needed in 'struct udp_iter_state'? Can it be removed?
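With seq_sk_match() deciding the family from seq_file_family() and the bpf
iterator doing its own batching, it looks like the dummy afinfo is only there
to feed udp_get_table_afinfo(), so bpf_iter_udp_batch() could probably resolve
the table directly:

	udptable = net->ipv4.udp_table;

and the afinfo allocation above could go away.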
ret = bpf_iter_init_seq_net(priv_data, aux);
- if (ret)
+ if (ret) {
kfree(afinfo);
+ return ret;
+ }
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+ if (ret) {
+ bpf_iter_fini_seq_net(priv_data);
+ return ret;
+ }
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
return ret;
}
static void bpf_iter_fini_udp(void *priv_data)
{
- struct udp_iter_state *st = priv_data;
+ struct bpf_udp_iter_state *iter = priv_data;
- kfree(st->bpf_seq_afinfo);
bpf_iter_fini_seq_net(priv_data);
+ kfree(iter->batch);
kvfree(), to match how the batch is allocated in bpf_iter_udp_realloc_batch().
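i.e. (assuming bpf_iter_udp_realloc_batch() allocates the batch with
kvmalloc, as the tcp counterpart does):

static void bpf_iter_fini_udp(void *priv_data)
{
	struct bpf_udp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}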