Patch "bpf, sockmap: Improved check for empty queue" has been added to the 6.1-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Sun, 28 May 2023 22:47:31 -0400

This is a note to let you know that I've just added the patch titled

    bpf, sockmap: Improved check for empty queue

to the 6.1-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     bpf-sockmap-improved-check-for-empty-queue.patch
and it can be found in the queue-6.1 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit b30cce8f5c1fd0d245cc4075f7cfe47d1ea91c85
Author: John Fastabend <john.fastabend@xxxxxxxxx>
Date:   Mon May 22 19:56:08 2023 -0700

    bpf, sockmap: Improved check for empty queue
    
    [ Upstream commit 405df89dd52cbcd69a3cd7d9a10d64de38f854b2 ]
    
    We noticed some rare sk_buffs were stepping past the queue when system was
    under memory pressure. The general theory is to skip enqueueing
    sk_buffs when its not necessary which is the normal case with a system
    that is properly provisioned for the task, no memory pressure and enough
    cpu assigned.
    
    But, if we can't allocate memory due to an ENOMEM error when enqueueing
    the sk_buff into the sockmap receive queue we push it onto a delayed
    workqueue to retry later. When a new sk_buff is received we then check
    if that queue is empty. However, there is a problem with simply checking
    the queue length. When a sk_buff is being processed from the ingress queue
    but not yet on the sockmap msg receive queue its possible to also recv
    a sk_buff through normal path. It will check the ingress queue which is
    zero and then skip ahead of the pkt being processed.
    
    Previously we used sock lock from both contexts which made the problem
    harder to hit, but not impossible.
    
    To fix instead of popping the skb from the queue entirely we peek the
    skb from the queue and do the copy there. This ensures checks to the
    queue length are non-zero while skb is being processed. Then finally
    when the entire skb has been copied to user space queue or another
    socket we pop it off the queue. This way the queue length check allows
    bypassing the queue only after the list has been completely processed.
    
    To reproduce issue we run NGINX compliance test with sockmap running and
    observe some flakes in our testing that we attributed to this issue.
    
    Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
    Suggested-by: Jakub Sitnicki <jakub@xxxxxxxxxxxxxx>
    Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx>
    Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
    Tested-by: William Findlay <will@xxxxxxxxxxxxx>
    Reviewed-by: Jakub Sitnicki <jakub@xxxxxxxxxxxxxx>
    Link: https://lore.kernel.org/bpf/20230523025618.113937-5-john.fastabend@xxxxxxxxx
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 904ff9a32ad61..054d7911bfc9f 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -71,7 +71,6 @@ struct sk_psock_link {
 };
 
 struct sk_psock_work_state {
-	struct sk_buff			*skb;
 	u32				len;
 	u32				off;
 };
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2dfb6e31e8d04..d3ffca1b96462 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -621,16 +621,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 
 static void sk_psock_skb_state(struct sk_psock *psock,
 			       struct sk_psock_work_state *state,
-			       struct sk_buff *skb,
 			       int len, int off)
 {
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-		state->skb = skb;
 		state->len = len;
 		state->off = off;
-	} else {
-		sock_drop(psock->sk, skb);
 	}
 	spin_unlock_bh(&psock->ingress_lock);
 }
@@ -641,23 +637,17 @@ static void sk_psock_backlog(struct work_struct *work)
 	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
 	struct sk_buff *skb = NULL;
+	u32 len = 0, off = 0;
 	bool ingress;
-	u32 len, off;
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (unlikely(state->skb)) {
-		spin_lock_bh(&psock->ingress_lock);
-		skb = state->skb;
+	if (unlikely(state->len)) {
 		len = state->len;
 		off = state->off;
-		state->skb = NULL;
-		spin_unlock_bh(&psock->ingress_lock);
 	}
-	if (skb)
-		goto start;
 
-	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+	while ((skb = skb_peek(&psock->ingress_skb))) {
 		len = skb->len;
 		off = 0;
 		if (skb_bpf_strparser(skb)) {
@@ -666,7 +656,6 @@ static void sk_psock_backlog(struct work_struct *work)
 			off = stm->offset;
 			len = stm->full_len;
 		}
-start:
 		ingress = skb_bpf_ingress(skb);
 		skb_bpf_redirect_clear(skb);
 		do {
@@ -676,8 +665,7 @@ static void sk_psock_backlog(struct work_struct *work)
 							  len, ingress);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
-					sk_psock_skb_state(psock, state, skb,
-							   len, off);
+					sk_psock_skb_state(psock, state, len, off);
 
 					/* Delay slightly to prioritize any
 					 * other work that might be here.
@@ -689,15 +677,16 @@ static void sk_psock_backlog(struct work_struct *work)
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
-				sock_drop(psock->sk, skb);
 				goto end;
 			}
 			off += ret;
 			len -= ret;
 		} while (len);
 
-		if (!ingress)
+		skb = skb_dequeue(&psock->ingress_skb);
+		if (!ingress) {
 			kfree_skb(skb);
+		}
 	}
 end:
 	mutex_unlock(&psock->work_mutex);
@@ -790,11 +779,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		sock_drop(psock->sk, skb);
 	}
-	kfree_skb(psock->work_state.skb);
-	/* We null the skb here to ensure that calls to sk_psock_backlog
-	 * do not pick up the free'd skb.
-	 */
-	psock->work_state.skb = NULL;
 	__sk_psock_purge_ingress_msg(psock);
 }
 
@@ -813,7 +797,6 @@ void sk_psock_stop(struct sk_psock *psock)
 	spin_lock_bh(&psock->ingress_lock);
 	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 	sk_psock_cork_free(psock);
-	__sk_psock_zap_ingress(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -828,6 +811,7 @@ static void sk_psock_destroy(struct work_struct *work)
 	sk_psock_done_strp(psock);
 
 	cancel_delayed_work_sync(&psock->work);
+	__sk_psock_zap_ingress(psock);
 	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);