On Thu, Apr 6, 2023 at 5:43 AM David Howells <dhowells@xxxxxxxxxx> wrote: > > Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be > spliced from the source iterator. > > This allows ->sendpage() to be replaced by something that can handle > multiple multipage folios in a single transaction. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Eric Dumazet <edumazet@xxxxxxxxxx> > cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > cc: David Ahern <dsahern@xxxxxxxxxx> > cc: Jakub Kicinski <kuba@xxxxxxxxxx> > cc: Paolo Abeni <pabeni@xxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: netdev@xxxxxxxxxxxxxxx > --- > net/ipv4/tcp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 60 insertions(+), 7 deletions(-) > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index fd68d49490f2..510bacc7ce7b 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > int flags, err, copied = 0; > int mss_now = 0, size_goal, copied_syn = 0; > int process_backlog = 0; > - bool zc = false; > + int zc = 0; > long timeo; > > flags = msg->msg_flags; > @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (msg->msg_ubuf) { > uarg = msg->msg_ubuf; > net_zcopy_get(uarg); > - zc = sk->sk_route_caps & NETIF_F_SG; > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 1; > } else if (sock_flag(sk, SOCK_ZEROCOPY)) { > uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); > if (!uarg) { > err = -ENOBUFS; > goto out_err; > } > - zc = sk->sk_route_caps & NETIF_F_SG; > - if (!zc) > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 1; > + else > uarg_to_msgzc(uarg)->zerocopy = 0; > } > + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { > + if (sk->sk_route_caps & NETIF_F_SG) > + zc = 2; > } > > if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && > @@ 
-1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > goto do_error; > > while (msg_data_left(msg)) { > - int copy = 0; > + ssize_t copy = 0; > > skb = tcp_write_queue_tail(sk); > if (skb) > @@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (copy > msg_data_left(msg)) > copy = msg_data_left(msg); > > - if (!zc) { > + if (zc == 0) { > bool merge = true; > int i = skb_shinfo(skb)->nr_frags; > struct page_frag *pfrag = sk_page_frag(sk); > @@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > page_ref_inc(pfrag->page); > } > pfrag->offset += copy; > - } else { > + } else if (zc == 1) { Instead of the magic numbers 1 and 2, using MSG_ZEROCOPY and MSG_SPLICE_PAGES as the values of zc would make the code more self-documenting. > /* First append to a fragless skb builds initial > * pure zerocopy skb > */ > @@ -1412,6 +1417,54 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) > if (err < 0) > goto do_error; > copy = err; > + } else if (zc == 2) { > + /* Splice in data. 
*/ > + struct page *page = NULL, **pages = &page; > + size_t off = 0, part; > + bool can_coalesce; > + int i = skb_shinfo(skb)->nr_frags; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, > + copy, 1, 0, &off); > + if (copy <= 0) { > + err = copy ?: -EIO; > + goto do_error; > + } > + > + can_coalesce = skb_can_coalesce(skb, i, page, off); > + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { > + tcp_mark_push(tp, skb); > + iov_iter_revert(&msg->msg_iter, copy); > + goto new_segment; > + } > + if (tcp_downgrade_zcopy_pure(sk, skb)) { > + iov_iter_revert(&msg->msg_iter, copy); > + goto wait_for_space; > + } > + > + part = tcp_wmem_schedule(sk, copy); > + iov_iter_revert(&msg->msg_iter, copy - part); > + if (!part) > + goto wait_for_space; > + copy = part; > + > + if (can_coalesce) { > + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); > + } else { > + get_page(page); > + skb_fill_page_desc_noacc(skb, i, page, off, copy); > + } > + page = NULL; > + > + if (!(flags & MSG_NO_SHARED_FRAGS)) > + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; > + > + skb->len += copy; > + skb->data_len += copy; > + skb->truesize += copy; > + sk_wmem_queued_add(sk, copy); > + sk_mem_charge(sk, copy); > + This is similar to the UDP code; perhaps it could be moved into a helper?