Patch "bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self" has been added to the 5.9-stable tree

This is a note to let you know that I've just added the patch titled

    bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self

to the 5.9-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     bpf-sockmap-avoid-returning-unneeded-eagain-when-red.patch
and it can be found in the queue-5.9 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit de66736db60beac6bdf3d3872bd9d294d94b6934
Author: John Fastabend <john.fastabend@xxxxxxxxx>
Date:   Mon Nov 16 14:28:46 2020 -0800

    bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self
    
    [ Upstream commit 6fa9201a898983da731fca068bb4b5c941537588 ]
    
    If a socket redirects to itself while under memory pressure, it is
    possible for the socket to get stuck so that recv() returns EAGAIN and
    the socket cannot advance for some time. This happens because when
    redirecting an skb to the same socket we received it on, we first check
    whether it is OK to enqueue the skb on the receiving socket by checking
    memory limits. But if the skb is itself the object holding the memory
    needed to enqueue it, we will keep retrying from the kernel side and
    always fail with EAGAIN. Userspace then gets a recv() EAGAIN error
    whenever there are no skbs in the psock ingress queue. This continues
    until either some skbs get kfree'd, reducing the memory pressure far
    enough that we can enqueue the pending packet, or the socket is
    destroyed. In some cases it is possible to get a socket stuck for a
    noticeable amount of time if the socket is only receiving skbs from
    sk_skb verdict programs. To reproduce, I made the socket memory limits
    ridiculously low so sockets are always under memory pressure. More
    often, though, this shows up under memory pressure as a spurious EAGAIN
    error on the userspace side; userspace retries, and typically enough
    memory has been freed by then that the retry succeeds.
    
    To fix this, skip the memory checks and skb_orphan when receiving on
    the same sock the skb is already assigned to.
    
    For SK_PASS cases this is easy; it's always the same socket, so we can
    just omit the orphan/set_owner pair.
    
    For backlog cases we need to check skb->sk and decide if the orphan
    and set_owner pair are needed.
    
    Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
    Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx>
    Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
    Reviewed-by: Jakub Sitnicki <jakub@xxxxxxxxxxxxxx>
    Link: https://lore.kernel.org/bpf/160556572660.73229.12566203819812939627.stgit@john-XPS-13-9370
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
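
A minimal sketch of the rule the patch applies (the dispatch helper below is
hypothetical and not part of the patch; sk_psock_skb_ingress() and
sk_psock_skb_ingress_self() are the functions in the diff that follows): when
the skb already belongs to the receiving socket, its memory is already charged
against that socket's sk_rmem_alloc, so re-running the memory-limit check and
skb_set_owner_r() would only re-charge, and possibly spuriously reject, memory
the socket is already holding.

/* Hypothetical illustration only -- not part of the patch. It shows the
 * decision the fix encodes: an skb received on the same socket it is being
 * redirected to is already accounted for, so skip the rmem check and the
 * ownership transfer; otherwise take the original accounting path.
 */
static int sk_psock_skb_ingress_sketch(struct sk_psock *psock,
				       struct sk_buff *skb)
{
	if (skb->sk == psock->sk)
		/* self-redirect: skb already charged to this socket */
		return sk_psock_skb_ingress_self(psock, skb);

	/* redirect from another socket: charge the skb to the new owner */
	return sk_psock_skb_ingress(psock, skb);
}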

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4ac112cc490c5..97f4c11a2ea7a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,38 +399,38 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+						  struct sk_buff *skb)
 {
-	struct sock *sk = psock->sk;
-	int copied = 0, num_sge;
 	struct sk_msg *msg;
 
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
-		return -EAGAIN;
+		return NULL;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return NULL;
 
 	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
 	if (unlikely(!msg))
-		return -EAGAIN;
-	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		kfree(msg);
-		return -EAGAIN;
-	}
+		return NULL;
 
 	sk_msg_init(msg);
-	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	return msg;
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+					struct sk_psock *psock,
+					struct sock *sk,
+					struct sk_msg *msg)
+{
+	int num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	int copied;
+
 	if (unlikely(num_sge < 0)) {
 		kfree(msg);
 		return num_sge;
 	}
 
-	/* This will transition ownership of the data from the socket where
-	 * the BPF program was run initiating the redirect to the socket
-	 * we will eventually receive this data on. The data will be released
-	 * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
-	 * into user buffers.
-	 */
-	skb_set_owner_r(skb, sk);
-
 	copied = skb->len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
@@ -442,6 +442,40 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	return copied;
 }
 
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sk;
+	struct sk_msg *msg;
+
+	msg = sk_psock_create_ingress_msg(sk, skb);
+	if (!msg)
+		return -EAGAIN;
+
+	/* This will transition ownership of the data from the socket where
+	 * the BPF program was run initiating the redirect to the socket
+	 * we will eventually receive this data on. The data will be released
+	 * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
+	 * into user buffers.
+	 */
+	skb_set_owner_r(skb, sk);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	struct sock *sk = psock->sk;
+
+	if (unlikely(!msg))
+		return -EAGAIN;
+	sk_msg_init(msg);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
@@ -784,7 +818,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		 * retrying later from workqueue.
 		 */
 		if (skb_queue_empty(&psock->ingress_skb)) {
-			err = sk_psock_skb_ingress(psock, skb);
+			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
 			skb_queue_tail(&psock->ingress_skb, skb);


