Re: [PATCH bpf 07/11] bpf: sockmap incorrectly handling copied_seq

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Mar 21, 2023 at 2:52 PM John Fastabend <john.fastabend@xxxxxxxxx> wrote:
>
> The read_skb() logic is incrementing tcp->copied_seq, which is used,
> among other things, for calculating how many outstanding bytes can be read
> by the application. This results in application errors: if the application
> does an ioctl(FIONREAD) we return zero because this is calculated from
> the copied_seq value.
>
> To fix this we move tcp->copied_seq accounting into the recv handler so
> that we update these when the recvmsg() hook is called and data is in
> fact copied into user buffers. This gives an accurate FIONREAD value
> as expected and improves ACK handling. Before we were calling the
> tcp_rcv_space_adjust() which would update 'number of bytes copied to
> user in last RTT' which is wrong for programs returning SK_PASS. The
> bytes are only copied to the user when recvmsg is handled.
>
> Doing the fix for recvmsg is straightforward, but fixing redirect and
> SK_DROP pkts is a bit trickier. Build a tcp_psock_eat() helper and then
> call this from skmsg handlers. This fixes another issue where a broken
> socket with a BPF program doing a resubmit could hang the receiver. This
> happened because although read_skb() consumed the skb through sock_drop()
> it did not update the copied_seq. Now if a single recv socket is
> redirecting to many sockets (for example for lb) the receiver sk will be
> hung even though we might expect it to continue. The hang comes from
> not updating the copied_seq numbers and memory pressure resulting from
> that.
>
> We have a slight layer problem of calling tcp_eat_skb even if it's not
> a TCP socket. To fix we could refactor and create per type receiver
> handlers. I decided this is more work than we want in the fix and we
> already have some small tweaks depending on caller that use the
> helper skb_bpf_strparser(). So we extend that a bit and always set
> the strparser bit when it is in use and then we can gate the
> copied_seq updates on this.
>
> Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
> Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx>
> ---
>  include/net/tcp.h  |  3 +++
>  net/core/skmsg.c   |  7 +++++--
>  net/ipv4/tcp.c     | 10 +---------
>  net/ipv4/tcp_bpf.c | 28 +++++++++++++++++++++++++++-
>  4 files changed, 36 insertions(+), 12 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index db9f828e9d1e..674044b8bdaf 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1467,6 +1467,8 @@ static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
>  }
>
>  void tcp_cleanup_rbuf(struct sock *sk, int copied);
> +void __tcp_cleanup_rbuf(struct sock *sk, int copied);
> +
>
>  /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
>   * If 87.5 % (7/8) of the space has been consumed, we want to override
> @@ -2321,6 +2323,7 @@ struct sk_psock;
>  struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
>  int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
>  void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
> +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
>  #endif /* CONFIG_BPF_SYSCALL */
>
>  int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 10e5481da662..b141b422697c 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -1051,11 +1051,14 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
>                 mutex_unlock(&psock->work_mutex);
>                 break;
>         case __SK_REDIRECT:
> +               tcp_eat_skb(psock->sk, skb);
>                 err = sk_psock_skb_redirect(psock, skb);
>                 break;
>         case __SK_DROP:
>         default:
>  out_free:
> +               tcp_eat_skb(psock->sk, skb);
> +               skb_bpf_redirect_clear(skb);
>                 sock_drop(psock->sk, skb);
>         }
>
> @@ -1100,8 +1103,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
>                 skb_dst_drop(skb);
>                 skb_bpf_redirect_clear(skb);
>                 ret = bpf_prog_run_pin_on_cpu(prog, skb);
> -               if (ret == SK_PASS)
> -                       skb_bpf_set_strparser(skb);
> +               skb_bpf_set_strparser(skb);
>                 ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
>                 skb->sk = NULL;
>         }
> @@ -1207,6 +1209,7 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
>         psock = sk_psock(sk);
>         if (unlikely(!psock)) {
>                 len = 0;
> +               tcp_eat_skb(sk, skb);
>                 sock_drop(sk, skb);
>                 goto out;
>         }
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 6572962b0237..e2594d8e3429 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1568,7 +1568,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
>   * calculation of whether or not we must ACK for the sake of
>   * a window update.
>   */
> -static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
> +void __tcp_cleanup_rbuf(struct sock *sk, int copied)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         bool time_to_ack = false;
> @@ -1783,14 +1783,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>                         break;
>                 }
>         }
> -       WRITE_ONCE(tp->copied_seq, seq);
> -
> -       tcp_rcv_space_adjust(sk);
> -
> -       /* Clean up data we have read: This will do ACK frames. */
> -       if (copied > 0)
> -               __tcp_cleanup_rbuf(sk, copied);
> -
>         return copied;
>  }
>  EXPORT_SYMBOL(tcp_read_skb);
> diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
> index b1ba58be0c5a..c0e5680dccc0 100644
> --- a/net/ipv4/tcp_bpf.c
> +++ b/net/ipv4/tcp_bpf.c
> @@ -11,6 +11,24 @@
>  #include <net/inet_common.h>
>  #include <net/tls.h>
>
> +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
> +{
> +       struct tcp_sock *tcp;
> +       int copied;
> +
> +       if (!skb || !skb->len || !sk_is_tcp(sk))
> +               return;
> +
> +       if (skb_bpf_strparser(skb))
> +               return;
> +
> +       tcp = tcp_sk(sk);
> +       copied = tcp->copied_seq + skb->len;
> +       WRITE_ONCE(tcp->copied_seq, skb->len);

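It seems your tests are unable to catch this bug :/ (copied_seq is overwritten with skb->len here, instead of the 'copied' value computed on the previous line, which is otherwise unused.)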

> +       tcp_rcv_space_adjust(sk);
> +       __tcp_cleanup_rbuf(sk, skb->len);
> +}
> +
>  static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
>                            struct sk_msg *msg, u32 apply_bytes, int flags)
>  {
> @@ -198,8 +216,10 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
>                                   int flags,
>                                   int *addr_len)
>  {
> +       struct tcp_sock *tcp = tcp_sk(sk);
> +       u32 seq = tcp->copied_seq;
>         struct sk_psock *psock;
> -       int copied;
> +       int copied = 0;
>
>         if (unlikely(flags & MSG_ERRQUEUE))
>                 return inet_recv_error(sk, msg, len, addr_len);
> @@ -241,9 +261,11 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
>
>                 if (is_fin) {
>                         copied = 0;
> +                       seq++;
>                         goto out;
>                 }
>         }
> +       seq += copied;
>         if (!copied) {
>                 long timeo;
>                 int data;
> @@ -281,6 +303,10 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
>                 copied = -EAGAIN;
>         }
>  out:
> +       WRITE_ONCE(tcp->copied_seq, seq);
> +       tcp_rcv_space_adjust(sk);
> +       if (copied > 0)
> +               __tcp_cleanup_rbuf(sk, copied);
>         release_sock(sk);
>         sk_psock_put(sk, psock);
>         return copied;
> --
> 2.33.0
>




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux