Make __zerocopy_sg_from_iter() call iov_iter_extract_pages() to get pages
that have been ref'd, pinned or left alone as appropriate.  As this is only
used for source buffers, pinning isn't an option, but being left unref'd is.

The way __zerocopy_sg_from_iter() merges fragments is also altered, such
that fragments must also match their cleanup modes to be merged.

An extra helper and wrapper, folio_put_unpin_sub() and
page_put_unpin_sub(), are added to allow multiple refs to be put/unpinned.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
cc: Eric Dumazet <edumazet@xxxxxxxxxx>
cc: Jakub Kicinski <kuba@xxxxxxxxxx>
cc: Paolo Abeni <pabeni@xxxxxxxxxx>
cc: netdev@xxxxxxxxxxxxxxx
---
 include/linux/mm.h  |  2 ++
 mm/gup.c            | 25 +++++++++++++++++++++++++
 net/core/datagram.c | 23 +++++++++++++----------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f14edb192394..e3923b89c75e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1368,7 +1368,9 @@ static inline bool is_cow_mapping(vm_flags_t flags)
 #endif
 
 void folio_put_unpin(struct folio *folio, unsigned int flags);
+void folio_put_unpin_sub(struct folio *folio, unsigned int flags, unsigned int refs);
 void page_put_unpin(struct page *page, unsigned int flags);
+void page_put_unpin_sub(struct page *page, unsigned int flags, unsigned int refs);
 
 /*
  * The identification function is mainly used by the buddy allocator for
diff --git a/mm/gup.c b/mm/gup.c
index 3ee4b4c7e0cb..49dd27ba6c13 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -213,6 +213,31 @@ void page_put_unpin(struct page *page, unsigned int flags)
 }
 EXPORT_SYMBOL_GPL(page_put_unpin);
 
+/**
+ * folio_put_unpin_sub - Unpin/put a folio as appropriate
+ * @folio: The folio to release
+ * @flags: gup flags indicating the mode of release (FOLL_*)
+ * @refs: Number of refs/pins to drop
+ *
+ * Release a folio according to the flags.  If FOLL_GET is set, the folio has
+ * a ref dropped; if FOLL_PIN is set, it is unpinned; otherwise it is left
+ * unaltered.
+ */
+void folio_put_unpin_sub(struct folio *folio, unsigned int flags,
+			 unsigned int refs)
+{
+	if (flags & (FOLL_GET | FOLL_PIN))
+		gup_put_folio(folio, refs, flags);
+}
+EXPORT_SYMBOL_GPL(folio_put_unpin_sub);
+
+void page_put_unpin_sub(struct page *page, unsigned int flags,
+			unsigned int refs)
+{
+	folio_put_unpin_sub(page_folio(page), flags, refs);
+}
+EXPORT_SYMBOL_GPL(page_put_unpin_sub);
+
 /**
  * try_grab_page() - elevate a page's refcount by a flag-dependent amount
  * @page: pointer to page to be grabbed
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 122bfb144d32..63ea1f8817e0 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -614,6 +614,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
 			    size_t length)
 {
+	unsigned int cleanup_mode = iov_iter_extract_mode(from, FOLL_SOURCE_BUF);
 	int frag;
 
 	if (msg && msg->msg_ubuf && msg->sg_from_iter)
@@ -622,7 +623,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 	frag = skb_shinfo(skb)->nr_frags;
 
 	while (length && iov_iter_count(from)) {
-		struct page *pages[MAX_SKB_FRAGS];
+		struct page *pages[MAX_SKB_FRAGS], **ppages = pages;
 		struct page *last_head = NULL;
 		size_t start;
 		ssize_t copied;
@@ -632,9 +633,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 		if (frag == MAX_SKB_FRAGS)
 			return -EMSGSIZE;
 
-		copied = iov_iter_get_pages(from, pages, length,
-					    MAX_SKB_FRAGS - frag, &start,
-					    FOLL_SOURCE_BUF);
+		copied = iov_iter_extract_pages(from, &ppages, length,
+						MAX_SKB_FRAGS - frag,
+						FOLL_SOURCE_BUF, &start);
 		if (copied < 0)
 			return -EFAULT;
 
@@ -662,12 +663,14 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
 
 			if (head == skb_frag_page(last) &&
+			    cleanup_mode == skb_frag_cleanup(last) &&
 			    start == skb_frag_off(last) + skb_frag_size(last)) {
 				skb_frag_size_add(last, size);
 				/* We combined this page, we need to release
-				 * a reference. Since compound pages refcount
-				 * is shared among many pages, batch the refcount
-				 * adjustments to limit false sharing.
+				 * a reference or a pin.  Since compound pages
+				 * refcount is shared among many pages, batch
+				 * the refcount adjustments to limit false
+				 * sharing.
 				 */
 				last_head = head;
 				refs++;
@@ -675,14 +678,14 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			}
 		}
 		if (refs) {
-			page_ref_sub(last_head, refs);
+			page_put_unpin_sub(last_head, cleanup_mode, refs);
 			refs = 0;
 		}
 		skb_fill_page_desc_noacc(skb, frag++, head, start, size,
-					 FOLL_GET);
+					 cleanup_mode);
 		}
 		if (refs)
-			page_ref_sub(last_head, refs);
+			page_put_unpin_sub(last_head, cleanup_mode, refs);
 	}
 	return 0;
 }