On 4/6/23 7:56 PM, Willem de Bruijn wrote: > On Thu, Apr 6, 2023 at 5:43 AM David Howells <dhowells@xxxxxxxxxx> wrote: >> >> Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be >> spliced from the source iterator. >> >> This allows ->sendpage() to be replaced by something that can handle >> multiple multipage folios in a single transaction. >> >> Signed-off-by: David Howells <dhowells@xxxxxxxxxx> >> cc: Eric Dumazet <edumazet@xxxxxxxxxx> >> cc: "David S. Miller" <davem@xxxxxxxxxxxxx> >> cc: David Ahern <dsahern@xxxxxxxxxx> >> cc: Jakub Kicinski <kuba@xxxxxxxxxx> >> cc: Paolo Abeni <pabeni@xxxxxxxxxx> >> cc: Jens Axboe <axboe@xxxxxxxxx> >> cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> >> cc: netdev@xxxxxxxxxxxxxxx >> --- >> net/ipv4/tcp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 60 insertions(+), 7 deletions(-) >> >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c >> index fd68d49490f2..510bacc7ce7b 100644 >> --- a/net/ipv4/tcp.c >> +++ b/net/ipv4/tcp.c >> @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> int flags, err, copied = 0; >> int mss_now = 0, size_goal, copied_syn = 0; >> int process_backlog = 0; >> - bool zc = false; >> + int zc = 0; >> long timeo; >> >> flags = msg->msg_flags; >> @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (msg->msg_ubuf) { >> uarg = msg->msg_ubuf; >> net_zcopy_get(uarg); >> - zc = sk->sk_route_caps & NETIF_F_SG; >> + if (sk->sk_route_caps & NETIF_F_SG) >> + zc = 1; >> } else if (sock_flag(sk, SOCK_ZEROCOPY)) { >> uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); >> if (!uarg) { >> err = -ENOBUFS; >> goto out_err; >> } >> - zc = sk->sk_route_caps & NETIF_F_SG; >> - if (!zc) >> + if (sk->sk_route_caps & NETIF_F_SG) >> + zc = 1; >> + else >> uarg_to_msgzc(uarg)->zerocopy = 0; >> } >> + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { >> + if (sk->sk_route_caps & 
NETIF_F_SG) >> + zc = 2; >> } >> >> if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && >> @@ -1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> goto do_error; >> >> while (msg_data_left(msg)) { >> - int copy = 0; >> + ssize_t copy = 0; >> >> skb = tcp_write_queue_tail(sk); >> if (skb) >> @@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (copy > msg_data_left(msg)) >> copy = msg_data_left(msg); >> >> - if (!zc) { >> + if (zc == 0) { >> bool merge = true; >> int i = skb_shinfo(skb)->nr_frags; >> struct page_frag *pfrag = sk_page_frag(sk); >> @@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> page_ref_inc(pfrag->page); >> } >> pfrag->offset += copy; >> - } else { >> + } else if (zc == 1) { > > Instead of 1 and 2, MSG_ZEROCOPY and MSG_SPLICE_PAGES make the code > more self-documenting. > >> /* First append to a fragless skb builds initial >> * pure zerocopy skb >> */ >> @@ -1412,6 +1417,54 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) >> if (err < 0) >> goto do_error; >> copy = err; >> + } else if (zc == 2) { >> + /* Splice in data. 
*/ >> + struct page *page = NULL, **pages = &page; >> + size_t off = 0, part; >> + bool can_coalesce; >> + int i = skb_shinfo(skb)->nr_frags; >> + >> + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, >> + copy, 1, 0, &off); >> + if (copy <= 0) { >> + err = copy ?: -EIO; >> + goto do_error; >> + } >> + >> + can_coalesce = skb_can_coalesce(skb, i, page, off); >> + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { >> + tcp_mark_push(tp, skb); >> + iov_iter_revert(&msg->msg_iter, copy); >> + goto new_segment; >> + } >> + if (tcp_downgrade_zcopy_pure(sk, skb)) { >> + iov_iter_revert(&msg->msg_iter, copy); >> + goto wait_for_space; >> + } >> + >> + part = tcp_wmem_schedule(sk, copy); >> + iov_iter_revert(&msg->msg_iter, copy - part); >> + if (!part) >> + goto wait_for_space; >> + copy = part; >> + >> + if (can_coalesce) { >> + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); >> + } else { >> + get_page(page); >> + skb_fill_page_desc_noacc(skb, i, page, off, copy); >> + } >> + page = NULL; >> + >> + if (!(flags & MSG_NO_SHARED_FRAGS)) >> + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; >> + >> + skb->len += copy; >> + skb->data_len += copy; >> + skb->truesize += copy; >> + sk_wmem_queued_add(sk, copy); >> + sk_mem_charge(sk, copy); >> + > > Similar to udp, perhaps in a helper? tcp_sendmsg_locked is already more than 250 lines long, and these 47 lines compound the problem. I was staring at this code 2 weeks ago wondering whether it can be split or refactored to reduce the complexity.