On 12/20/23 6:24 AM, Daniel Borkmann wrote:
On 12/19/23 8:32 PM, Martin KaFai Lau wrote:
From: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
The bpf_iter_udp iterates all udp_sk by walking the udp_table.
While iterating the udp_table, it stores all udp_sk of a bucket;
the term used in the kernel code is to "batch" the whole bucket.
The reason for batching is to allow lock_sock() on each socket
before calling the bpf prog, so that the bpf prog can safely call
a helper/kfunc that changes the sk's state, e.g. bpf_setsockopt.
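
(Aside, for intuition: a minimal userspace model of this batching
pattern, not the kernel code. The bucket spinlock stand-in must not be
held across a sleeping lock, so the bucket is snapshotted into a batch
under the spinlock and each socket is only locked afterwards. All
names below are invented for the model.)

/* batch_model.c: hypothetical stand-alone sketch, not kernel code.
 * bucket_lock models hslot2->lock (a spinlock that must not be held
 * across a sleeping operation); the per-socket mutex models
 * lock_sock(), which may sleep.
 */
#include <pthread.h>
#include <stdio.h>

#define BUCKET_SZ 4

struct sk {
	int id;
	pthread_mutex_t lock;	/* lock_sock() stand-in, may sleep */
};

static struct sk bucket[BUCKET_SZ];
static pthread_spinlock_t bucket_lock;	/* hslot2->lock stand-in */

int main(void)
{
	struct sk *batch[BUCKET_SZ];
	int i, n = 0;

	pthread_spin_init(&bucket_lock, PTHREAD_PROCESS_PRIVATE);
	for (i = 0; i < BUCKET_SZ; i++) {
		bucket[i].id = i;
		pthread_mutex_init(&bucket[i].lock, NULL);
	}

	/* "batch" the whole bucket while holding the spinlock */
	pthread_spin_lock(&bucket_lock);
	for (i = 0; i < BUCKET_SZ; i++)
		batch[n++] = &bucket[i];
	pthread_spin_unlock(&bucket_lock);

	/* spinlock released: now each socket can take its sleeping
	 * lock before running state-changing work (the bpf prog
	 * calling e.g. bpf_setsockopt in the real iterator)
	 */
	for (i = 0; i < n; i++) {
		pthread_mutex_lock(&batch[i]->lock);
		printf("visiting sk %d with its lock held\n", batch[i]->id);
		pthread_mutex_unlock(&batch[i]->lock);
	}
	return 0;
}

(Build with gcc -pthread. The real iterator additionally handles batch
resizing and the resume offset discussed below.)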
There is a bug in the bpf_iter_udp_batch() function that stops
userspace from making forward progress.

The bug is triggered when userspace passes in a very small read
buffer. When the bpf prog does bpf_seq_printf, the userspace read
buffer is not large enough to capture the whole "batch". In that
case, the kernel remembers how far into the batch the reads have
gone in iter->offset, so that the next userspace read() can
continue from where it left off. The kernel will skip that number
(== "iter->offset") of sockets in the next read(). However, the
code skips by directly decrementing "--iter->offset". This is
incorrect because the next read() may not consume the whole
"batch" either, and the read() after that will then start again
from offset 0.

Doing "--iter->offset" is essentially making backward progress.
The net effect is that userspace keeps re-reading from the
beginning of a bucket and the process never finishes.
"iter->offset" must always move forward until the whole "batch"
(or bucket) is consumed by userspace.
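
(To make the livelock concrete, a hypothetical userspace simulation,
not kernel code: a bucket of 8 sockets read through a buffer that only
holds 2 per read(), comparing the buggy "--iter->offset" skip against
skipping on a local copy. All names are invented for the model.)

/* offset_model.c: hypothetical simulation of the resume logic.
 * Each read() rebuilds the batch by skipping the already-consumed
 * sockets, then consumes up to BUF sockets, bumping offset once per
 * consumed socket (as bpf_iter_udp_seq_next does for iter->offset).
 */
#include <stdbool.h>
#include <stdio.h>

#define NSOCKS 8	/* sockets in the bucket */
#define BUF    2	/* sockets a tiny read buffer can hold */

static int offset;	/* models iter->offset */

static int do_read(bool buggy)
{
	int start, consumed;

	if (buggy) {
		/* "--iter->offset" while skipping: the count is lost */
		for (start = 0; start < NSOCKS && offset; start++)
			--offset;
	} else {
		/* skip on a local copy; offset itself keeps its value */
		start = offset < NSOCKS ? offset : NSOCKS;
	}

	for (consumed = 0; consumed < BUF && start + consumed < NSOCKS;
	     consumed++)
		offset++;	/* what seq_next does per shown socket */

	return consumed;
}

int main(void)
{
	int reads;

	offset = 0;
	for (reads = 0; reads < 10 && do_read(true); reads++)
		;
	printf("buggy: gave up after %d reads, offset stuck at %d\n",
	       reads, offset);

	offset = 0;
	for (reads = 0; do_read(false); reads++)
		;
	printf("fixed: bucket of %d sockets done in %d reads\n",
	       NSOCKS, reads);
	return 0;
}

(The buggy variant never gets past the first two sockets because each
rebuild zeroes the consumed count; the fixed variant finishes the
bucket in 4 reads.)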
This patch fixes it by doing the decrement in a local stack
variable.
nit: Probably makes sense to also state here that bpf_iter_tcp does
not have this issue, so it's clear from the commit message that you
also audited the TCP one.
Ack.
Cc: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator")
Signed-off-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
---
net/ipv4/udp.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89e5a806b82e..6cf4151c2eb4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3141,6 +3141,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
unsigned int batch_sks = 0;
bool resized = false;
struct sock *sk;
+ int offset;
/* The current batch is done, so advance the bucket. */
if (iter->st_bucket_done) {
@@ -3162,6 +3163,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
iter->end_sk = 0;
iter->st_bucket_done = false;
batch_sks = 0;
+ offset = iter->offset;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
@@ -3177,8 +3179,8 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
- if (iter->offset) {
- --iter->offset;
+ if (offset) {
+ --offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
Do we also need something like:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6cf4151c2eb4..ef4fc436253d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3169,7 +3169,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
if (hlist_empty(&hslot2->head)) {
- iter->offset = 0;
+ iter->offset = offset = 0;
continue;
}
@@ -3196,7 +3196,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
break;
/* Reset the current bucket's offset before moving to the next bucket. */
- iter->offset = 0;
+ iter->offset = offset = 0;
}
/* All done: no batch made. */
For the case when, upon retry, the current batch is done earlier than expected?
Good catch. It will unnecessarily skip sockets in the following
batch/bucket if there are changes in the current batch/bucket
(e.g. the stale offset from a now-drained bucket would carry over
to the next one).
From looking at the loop again, I think it is better not to change
iter->offset during the for loop, and to only update iter->offset
after the for loop has concluded.

The non-zero iter->offset is only useful for the first bucket, so do
a test on the first bucket (state->bucket == bucket) before skipping
sockets. Something like this:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89e5a806b82e..a993f364d6ae 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3139,6 +3139,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct udp_table *udptable;
unsigned int batch_sks = 0;
+ int bucket, bucket_offset;
bool resized = false;
struct sock *sk;
@@ -3162,14 +3163,14 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
iter->end_sk = 0;
iter->st_bucket_done = false;
batch_sks = 0;
+ bucket = state->bucket;
+ bucket_offset = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
- if (hlist_empty(&hslot2->head)) {
- iter->offset = 0;
+ if (hlist_empty(&hslot2->head))
continue;
- }
spin_lock_bh(&hslot2->lock);
udp_portaddr_for_each_entry(sk, &hslot2->head) {
@@ -3177,8 +3178,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
- if (iter->offset) {
- --iter->offset;
+ if (state->bucket == bucket &&
+ bucket_offset < iter->offset) {
+ ++bucket_offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
@@ -3192,10 +3194,10 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
if (iter->end_sk)
break;
+ }
- /* Reset the current bucket's offset before moving to the next bucket. */
+ if (state->bucket != bucket)
iter->offset = 0;
- }
/* All done: no batch made. */
if (!iter->end_sk)
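
(A quick sanity check of that resume logic as a hypothetical userspace
model, not the kernel code: the iterator stopped two sockets into
bucket 0, which has since been drained, so the stale offset must not
be applied to bucket 1 and gets cleared once the loop moves past the
resume bucket. All names are invented for the model.)

/* resume_model.c: hypothetical model of the v2 resume logic. */
#include <stdio.h>

static int bucket_len[2] = { 0, 3 };	/* bucket 0 emptied since last read */

int main(void)
{
	int state_bucket = 0;	/* models state->bucket */
	int iter_offset = 2;	/* stopped 2 sockets into bucket 0 */
	int resume_bucket = state_bucket;

	for (; state_bucket < 2; state_bucket++) {
		int i, skipped = 0, batched = 0;

		for (i = 0; i < bucket_len[state_bucket]; i++) {
			/* the saved offset only applies to the bucket
			 * the iterator stopped in
			 */
			if (state_bucket == resume_bucket &&
			    skipped < iter_offset) {
				skipped++;
				continue;
			}
			printf("bucket %d: batching socket %d\n",
			       state_bucket, i);
			batched++;
		}
		if (batched)
			break;
	}

	/* moved past the resume bucket: the stale offset is dropped */
	if (state_bucket != resume_bucket)
		iter_offset = 0;
	printf("iter_offset is now %d\n", iter_offset);
	return 0;
}

(It batches all three sockets of bucket 1 and ends with iter_offset
back at 0, which covers the early-done-batch case raised above.)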
From 2d70cbeb92daff876dcf88733b8d745393f033b0 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
Date: Mon, 18 Dec 2023 17:33:10 -0800
Subject: [PATCH v2 bpf 1/2] bpf: Avoid iter->offset making backward progress
in bpf_iter_udp
The bpf_iter_udp iterates all udp_sk by walking the udp_table.
While iterating the udp_table, it stores all udp_sk of a bucket;
the term used in the kernel code is to "batch" the whole bucket.
The reason for batching is to allow lock_sock() on each socket
before calling the bpf prog, so that the bpf prog can safely call
a helper/kfunc that changes the sk's state, e.g. bpf_setsockopt.
There is a bug in the bpf_iter_udp_batch() function that stops
userspace from making forward progress.

The bug is triggered when userspace passes in a very small read
buffer. When the bpf prog does bpf_seq_printf, the userspace read
buffer is not large enough to capture the whole "batch". In that
case, the kernel remembers how far into the batch the reads have
gone in iter->offset, so that the next userspace read() can
continue from where it left off. The kernel will skip that number
(== "iter->offset") of sockets in the next read(). However, the
code skips by directly decrementing "--iter->offset". This is
incorrect because the next read() may not consume the whole
"batch" either, and the read() after that will then start again
from offset 0.

Doing "--iter->offset" is essentially making backward progress.
The net effect is that userspace keeps re-reading from the
beginning of a bucket and the process never finishes.
"iter->offset" must always move forward until the whole "batch"
(or bucket) is consumed by userspace.
This patch fixes it by doing the decrement in a local stack
variable.
Cc: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator")
Signed-off-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
---
net/ipv4/udp.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89e5a806b82e..a993f364d6ae 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3139,6 +3139,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct udp_table *udptable;
unsigned int batch_sks = 0;
+ int bucket, bucket_offset;
bool resized = false;
struct sock *sk;
@@ -3162,14 +3163,14 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
iter->end_sk = 0;
iter->st_bucket_done = false;
batch_sks = 0;
+ bucket = state->bucket;
+ bucket_offset = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
- if (hlist_empty(&hslot2->head)) {
- iter->offset = 0;
+ if (hlist_empty(&hslot2->head))
continue;
- }
spin_lock_bh(&hslot2->lock);
udp_portaddr_for_each_entry(sk, &hslot2->head) {
@@ -3177,8 +3178,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
- if (iter->offset) {
- --iter->offset;
+ if (state->bucket == bucket &&
+ bucket_offset < iter->offset) {
+ ++bucket_offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
@@ -3192,10 +3194,10 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
if (iter->end_sk)
break;
+ }
- /* Reset the current bucket's offset before moving to the next bucket. */
+ if (state->bucket != bucket)
iter->offset = 0;
- }
/* All done: no batch made. */
if (!iter->end_sk)
--
2.34.1