On Tue, May 02, 2023 at 08:51 AM -07, John Fastabend wrote:
> We noticed some rare sk_buffs were stepping past the queue when the system
> was under memory pressure. The general theory is to skip enqueueing
> sk_buffs when it's not necessary, which is the normal case with a system
> that is properly provisioned for the task: no memory pressure and enough
> cpu assigned.
>
> But, if we can't allocate memory due to an ENOMEM error when enqueueing
> the sk_buff into the sockmap receive queue, we push it onto a delayed
> workqueue to retry later. When a new sk_buff is received we then check
> if that queue is empty. However, there is a problem with simply checking
> the queue length. When an sk_buff is being processed from the ingress queue
> but not yet on the sockmap msg receive queue, it's possible to also receive
> an sk_buff through the normal path. It will check the ingress queue, see
> that its length is zero, and then skip ahead of the pkt being processed.
>
> Previously we used the sock lock from both contexts, which made the problem
> harder to hit, but not impossible.
>
> To fix this, instead of popping the skb from the queue entirely, we peek
> the skb from the queue and do the copy there. This ensures the queue
> length stays non-zero while the skb is being processed. Then, finally,
> when the entire skb has been copied to the user space queue or another
> socket, we pop it off the queue. This way the queue length check allows
> bypassing the queue only after the list has been completely processed.
>
> To reproduce the issue we run the NGINX compliance test with sockmap
> running and observe some flakes in our testing that we attributed to this
> issue.
>
> Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
> Tested-by: William Findlay <will@xxxxxxxxxxxxx>
> Suggested-by: Jakub Sitnicki <jakub@xxxxxxxxxxxxxx>
> Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx>
> ---
>  include/linux/skmsg.h |  1 -
>  net/core/skmsg.c      | 32 ++++++++------------------------
>  2 files changed, 8 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 904ff9a32ad6..054d7911bfc9 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -71,7 +71,6 @@ struct sk_psock_link {
>  };
>
>  struct sk_psock_work_state {
> -	struct sk_buff			*skb;
>  	u32				len;
>  	u32				off;
>  };
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 3f95c460c261..bc5ca973400c 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -622,16 +622,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
>
>  static void sk_psock_skb_state(struct sk_psock *psock,
>  			       struct sk_psock_work_state *state,
> -			       struct sk_buff *skb,
>  			       int len, int off)
>  {
>  	spin_lock_bh(&psock->ingress_lock);
>  	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
> -		state->skb = skb;
>  		state->len = len;
>  		state->off = off;
> -	} else {
> -		sock_drop(psock->sk, skb);
>  	}
>  	spin_unlock_bh(&psock->ingress_lock);
>  }
> @@ -642,23 +638,17 @@ static void sk_psock_backlog(struct work_struct *work)
>  	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
>  	struct sk_psock_work_state *state = &psock->work_state;
>  	struct sk_buff *skb = NULL;
> +	u32 len = 0, off = 0;
>  	bool ingress;
> -	u32 len, off;
>  	int ret;
>
>  	mutex_lock(&psock->work_mutex);
> -	if (unlikely(state->skb)) {
> -		spin_lock_bh(&psock->ingress_lock);
> -		skb = state->skb;
> +	if (unlikely(state->len)) {
>  		len = state->len;
>  		off = state->off;
> -		state->skb = NULL;
> -		spin_unlock_bh(&psock->ingress_lock);
>  	}
> -	if (skb)
> -		goto start;
>
> -	while ((skb = skb_dequeue(&psock->ingress_skb))) {
> +	while ((skb = skb_peek(&psock->ingress_skb))) {
>  		len = skb->len;
>  		off = 0;
>  		if (skb_bpf_strparser(skb)) {
> @@ -667,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
>  			off = stm->offset;
>  			len = stm->full_len;
>  		}
> -start:
>  		ingress = skb_bpf_ingress(skb);
>  		skb_bpf_redirect_clear(skb);
>  		do {
> @@ -677,8 +666,7 @@ static void sk_psock_backlog(struct work_struct *work)
>  					  len, ingress);
>  			if (ret <= 0) {
>  				if (ret == -EAGAIN) {
> -					sk_psock_skb_state(psock, state, skb,
> -							   len, off);
> +					sk_psock_skb_state(psock, state, len, off);
>
>  					/* Delay slightly to prioritize any
>  					 * other work that might be here.

I've been staring at this bit, and I think it doesn't matter if we update
psock->work_state after SK_PSOCK_TX_ENABLED has been cleared.

What I think we shouldn't be doing here, though, is scheduling
sk_psock_backlog again if SK_PSOCK_TX_ENABLED got cleared by
sk_psock_stop() (rough sketch at the end of this mail).

> @@ -689,15 +677,16 @@ static void sk_psock_backlog(struct work_struct *work)
>  				/* Hard errors break pipe and stop xmit. */
>  				sk_psock_report_error(psock, ret ? -ret : EPIPE);
>  				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
> -				sock_drop(psock->sk, skb);
>  				goto end;
>  			}
>  			off += ret;
>  			len -= ret;
>  		} while (len);
>
> -		if (!ingress)
> +		skb = skb_dequeue(&psock->ingress_skb);
> +		if (!ingress) {
>  			kfree_skb(skb);
> +		}
>  	}
> end:
>  	mutex_unlock(&psock->work_mutex);
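
Concretely, something along these lines in the -EAGAIN branch is what I
have in mind. Untested sketch only; I'm assuming the branch still ends
with the schedule_delayed_work(&psock->work, 1) reschedule and the goto
end from the delayed_work conversion earlier in the series:

	if (ret == -EAGAIN) {
		sk_psock_skb_state(psock, state, len, off);

		/* Delay slightly to prioritize any
		 * other work that might be here.
		 *
		 * But don't requeue the backlog once
		 * sk_psock_stop() has cleared TX_ENABLED.
		 */
		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
			schedule_delayed_work(&psock->work, 1);
		goto end;
	}

That way a stopped psock can't race a fresh work item onto the queue
while we are tearing things down.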