On Fri, Apr 10, 2020 at 12:04 PM Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> On Fri, 21 Feb 2020 13:21:41 -0800 Arjun Roy <arjunroy@xxxxxxxxxx> wrote:
>
> > I remain a bit concerned regarding the merge process for this specific
> > patch (0003, the net/ipv4/tcp.c change) since I have other in-flight
> > changes for TCP receive zerocopy that I'd like to upstream for
> > net-next - and would like to avoid weird merge issues.
> >
> > So perhaps the following could work:
> >
> > 1. Andrew, perhaps we could remove this particular patch (0003, the
> > net/ipv4/tcp.c change) from mm-next; that way we merge
> > vm_insert_pages() but not the call-site within TCP, for now.
> > 2. net-next will eventually pick vm_insert_pages() up.
> > 3. I can modify the zerocopy code to use it at that point?
> >
> > Else I'm concerned a complicated merge situation may result.
>
> The merge situation is quite clean.
>
> I guess I'll hold off on
> net-zerocopy-use-vm_insert_pages-for-tcp-rcv-zerocopy.patch (below) and
> shall send it to davem after Linus has merged the prerequisites.
>

Acknowledged, thank you!

(resend because gmail conveniently forgot my plain text mode preference...)

-Arjun

>
> From: Arjun Roy <arjunroy@xxxxxxxxxx>
> Subject: net-zerocopy: use vm_insert_pages() for tcp rcv zerocopy
>
> Use vm_insert_pages() for tcp receive zerocopy.  Spin lock cycles (as
> reported by perf) drop from a couple of percentage points to a fraction of
> a percent.  This results in a roughly 6% increase in efficiency, measured
> roughly as zerocopy receive count divided by CPU utilization.
>
> The intention of this patchset is to reduce atomic ops for tcp zerocopy
> receives, which normally hits the same spinlock multiple times
> consecutively.
>
> [akpm@xxxxxxxxxxxxxxxxxxxx: suppress gcc-7.2.0 warning]
> Link: http://lkml.kernel.org/r/20200128025958.43490-3-arjunroy.kdev@xxxxxxxxx
> Signed-off-by: Arjun Roy <arjunroy@xxxxxxxxxx>
> Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx>
> Signed-off-by: Soheil Hassas Yeganeh <soheil@xxxxxxxxxx>
> Cc: David Miller <davem@xxxxxxxxxxxxx>
> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Cc: Jason Gunthorpe <jgg@xxxxxxxx>
> Cc: Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> ---
>
>  net/ipv4/tcp.c |   70 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 63 insertions(+), 7 deletions(-)
>
> --- a/net/ipv4/tcp.c~net-zerocopy-use-vm_insert_pages-for-tcp-rcv-zerocopy
> +++ a/net/ipv4/tcp.c
> @@ -1734,14 +1734,48 @@ int tcp_mmap(struct file *file, struct s
>  }
>  EXPORT_SYMBOL(tcp_mmap);
>
> +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
> +                                        struct page **pages,
> +                                        unsigned long pages_to_map,
> +                                        unsigned long *insert_addr,
> +                                        u32 *length_with_pending,
> +                                        u32 *seq,
> +                                        struct tcp_zerocopy_receive *zc)
> +{
> +        unsigned long pages_remaining = pages_to_map;
> +        int bytes_mapped;
> +        int ret;
> +
> +        ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
> +        bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
> +        /* Even if vm_insert_pages fails, it may have partially succeeded in
> +         * mapping (some but not all of the pages).
> +         */
> +        *seq += bytes_mapped;
> +        *insert_addr += bytes_mapped;
> +        if (ret) {
> +                /* But if vm_insert_pages did fail, we have to unroll some state
> +                 * we speculatively touched before.
> +                 */
> +                const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
> +                *length_with_pending -= bytes_not_mapped;
> +                zc->recv_skip_hint += bytes_not_mapped;
> +        }
> +        return ret;
> +}
> +
>  static int tcp_zerocopy_receive(struct sock *sk,
>                                  struct tcp_zerocopy_receive *zc)
>  {
>          unsigned long address = (unsigned long)zc->address;
>          u32 length = 0, seq, offset, zap_len;
> +        #define PAGE_BATCH_SIZE 8
> +        struct page *pages[PAGE_BATCH_SIZE];
>          const skb_frag_t *frags = NULL;
>          struct vm_area_struct *vma;
>          struct sk_buff *skb = NULL;
> +        unsigned long pg_idx = 0;
> +        unsigned long curr_addr;
>          struct tcp_sock *tp;
>          int inq;
>          int ret;
> @@ -1754,6 +1788,8 @@ static int tcp_zerocopy_receive(struct s
>
>          sock_rps_record_flow(sk);
>
> +        tp = tcp_sk(sk);
> +
>          down_read(&current->mm->mmap_sem);
>
>          ret = -EINVAL;
> @@ -1762,7 +1798,6 @@ static int tcp_zerocopy_receive(struct s
>                  goto out;
>          zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
>
> -        tp = tcp_sk(sk);
>          seq = tp->copied_seq;
>          inq = tcp_inq(sk);
>          zc->length = min_t(u32, zc->length, inq);
> @@ -1774,8 +1809,20 @@ static int tcp_zerocopy_receive(struct s
>                  zc->recv_skip_hint = zc->length;
>          }
>          ret = 0;
> +        curr_addr = address;
>          while (length + PAGE_SIZE <= zc->length) {
>                  if (zc->recv_skip_hint < PAGE_SIZE) {
> +                        /* If we're here, finish the current batch. */
> +                        if (pg_idx) {
> +                                ret = tcp_zerocopy_vm_insert_batch(vma, pages,
> +                                                                   pg_idx,
> +                                                                   &curr_addr,
> +                                                                   &length,
> +                                                                   &seq, zc);
> +                                if (ret)
> +                                        goto out;
> +                                pg_idx = 0;
> +                        }
>                          if (skb) {
>                                  if (zc->recv_skip_hint > 0)
>                                          break;
> @@ -1784,7 +1831,6 @@ static int tcp_zerocopy_receive(struct s
>                          } else {
>                                  skb = tcp_recv_skb(sk, seq, &offset);
>                          }
> -
>                          zc->recv_skip_hint = skb->len - offset;
>                          offset -= skb_headlen(skb);
>                          if ((int)offset < 0 || skb_has_frag_list(skb))
> @@ -1808,14 +1854,24 @@ static int tcp_zerocopy_receive(struct s
>                          zc->recv_skip_hint -= remaining;
>                          break;
>                  }
> -                ret = vm_insert_page(vma, address + length,
> -                                     skb_frag_page(frags));
> -                if (ret)
> -                        break;
> +                pages[pg_idx] = skb_frag_page(frags);
> +                pg_idx++;
>                  length += PAGE_SIZE;
> -                seq += PAGE_SIZE;
>                  zc->recv_skip_hint -= PAGE_SIZE;
>                  frags++;
> +                if (pg_idx == PAGE_BATCH_SIZE) {
> +                        ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
> +                                                           &curr_addr, &length,
> +                                                           &seq, zc);
> +                        if (ret)
> +                                goto out;
> +                        pg_idx = 0;
> +                }
> +        }
> +        if (pg_idx) {
> +                ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
> +                                                   &curr_addr, &length, &seq,
> +                                                   zc);
>          }
>  out:
>          up_read(&current->mm->mmap_sem);
> _
>
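To make the batching contract in the quoted patch easier to follow: vm_insert_pages() may partially succeed, so tcp_zerocopy_vm_insert_batch() advances the sequence and address cursors by the bytes that actually got mapped and unrolls the speculative length/skip-hint accounting for the unmapped tail. Below is a minimal userspace C sketch of that caller pattern only; fake_vm_insert_pages(), flush_batch() and struct batch_state are illustrative stand-ins I made up for this note, not kernel APIs and not part of the patch.

/* Sketch of the "advance by what mapped, unroll the rest" pattern.
 * All names here are hypothetical stand-ins for illustration.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_BATCH_SIZE 8

/* Pretend to map a batch; report how many pages were NOT mapped.
 * Returns 0 on full success, -1 on (possibly partial) failure.
 */
static int fake_vm_insert_pages(unsigned long addr, unsigned long npages,
                                unsigned long *pages_remaining)
{
        unsigned long mapped = npages > 3 ? 3 : npages; /* simulate partial map */

        (void)addr;
        *pages_remaining = npages - mapped;
        return *pages_remaining ? -1 : 0;
}

struct batch_state {
        unsigned long insert_addr; /* next user address to map at */
        unsigned long seq;         /* sequence cursor, like tp->copied_seq */
        unsigned long length;      /* bytes counted while filling the batch */
        unsigned long skip_hint;   /* bytes handed back to the copy path */
};

static int flush_batch(struct batch_state *st, unsigned long pages_to_map)
{
        unsigned long pages_remaining = pages_to_map;
        unsigned long bytes_mapped, bytes_not_mapped;
        int ret;

        ret = fake_vm_insert_pages(st->insert_addr, pages_to_map,
                                   &pages_remaining);
        bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);

        /* Advance by whatever did get mapped, even when the call failed. */
        st->seq += bytes_mapped;
        st->insert_addr += bytes_mapped;

        if (ret) {
                /* Unroll the accounting done speculatively while batching. */
                bytes_not_mapped = PAGE_SIZE * pages_remaining;
                st->length -= bytes_not_mapped;
                st->skip_hint += bytes_not_mapped;
        }
        return ret;
}

int main(void)
{
        struct batch_state st = {
                .insert_addr = 0x7f0000000000UL,
                .length = PAGE_BATCH_SIZE * PAGE_SIZE, /* counted while batching */
        };
        int ret = flush_batch(&st, PAGE_BATCH_SIZE);

        printf("ret=%d mapped_bytes=%lu length=%lu skip_hint=%lu\n",
               ret, st.seq, st.length, st.skip_hint);
        return 0;
}

With the simulated partial failure above, only 3 of 8 pages map: seq/insert_addr move forward by 3 pages' worth of bytes, while the remaining 5 pages are subtracted from length and added to skip_hint, which mirrors what the real code does before falling back.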