Re: [PATCH net-next v13 12/14] net: replace page_frag with page_frag_cache

On 8/15/2024 6:01 AM, Alexander H Duyck wrote:
On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
Use the newly introduced prepare/probe/commit API to
replace page_frag with page_frag_cache for sk_page_frag().

CC: Alexander Duyck <alexander.duyck@xxxxxxxxx>
Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
Acked-by: Mat Martineau <martineau@xxxxxxxxxx>
---
  .../chelsio/inline_crypto/chtls/chtls.h       |   3 -
  .../chelsio/inline_crypto/chtls/chtls_io.c    | 100 ++++---------
  .../chelsio/inline_crypto/chtls/chtls_main.c  |   3 -
  drivers/net/tun.c                             |  48 +++---
  include/linux/sched.h                         |   2 +-
  include/net/sock.h                            |  14 +-
  kernel/exit.c                                 |   3 +-
  kernel/fork.c                                 |   3 +-
  net/core/skbuff.c                             |  59 +++++---
  net/core/skmsg.c                              |  22 +--
  net/core/sock.c                               |  46 ++++--
  net/ipv4/ip_output.c                          |  33 +++--
  net/ipv4/tcp.c                                |  32 ++--
  net/ipv4/tcp_output.c                         |  28 ++--
  net/ipv6/ip6_output.c                         |  33 +++--
  net/kcm/kcmsock.c                             |  27 ++--
  net/mptcp/protocol.c                          |  67 +++++----
  net/sched/em_meta.c                           |   2 +-
  net/tls/tls_device.c                          | 137 ++++++++++--------
  19 files changed, 347 insertions(+), 315 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
index 7ff82b6778ba..fe2b6a8ef718 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
@@ -234,7 +234,6 @@ struct chtls_dev {
  	struct list_head list_node;
  	struct list_head rcu_node;
  	struct list_head na_node;
-	unsigned int send_page_order;
  	int max_host_sndbuf;
  	u32 round_robin_cnt;
  	struct key_map kmap;
@@ -453,8 +452,6 @@ enum {
/* The ULP mode/submode of an skbuff */
  #define skb_ulp_mode(skb)  (ULP_SKB_CB(skb)->ulp_mode)
-#define TCP_PAGE(sk)   (sk->sk_frag.page)
-#define TCP_OFF(sk)    (sk->sk_frag.offset)
static inline struct chtls_dev *to_chtls_dev(struct tls_toe_device *tlsdev)
  {
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
index d567e42e1760..334381c1587f 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
@@ -825,12 +825,6 @@ void skb_entail(struct sock *sk, struct sk_buff *skb, int flags)
  	ULP_SKB_CB(skb)->flags = flags;
  	__skb_queue_tail(&csk->txq, skb);
  	sk->sk_wmem_queued += skb->truesize;
-
-	if (TCP_PAGE(sk) && TCP_OFF(sk)) {
-		put_page(TCP_PAGE(sk));
-		TCP_PAGE(sk) = NULL;
-		TCP_OFF(sk) = 0;
-	}
  }
static struct sk_buff *get_tx_skb(struct sock *sk, int size)
@@ -882,16 +876,12 @@ static void push_frames_if_head(struct sock *sk)
  		chtls_push_frames(csk, 1);
  }
-static int chtls_skb_copy_to_page_nocache(struct sock *sk,
-					  struct iov_iter *from,
-					  struct sk_buff *skb,
-					  struct page *page,
-					  int off, int copy)
+static int chtls_skb_copy_to_va_nocache(struct sock *sk, struct iov_iter *from,
+					struct sk_buff *skb, char *va, int copy)
  {
  	int err;
- err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) +
-				       off, copy, skb->len);
+	err = skb_do_copy_data_nocache(sk, skb, from, va, copy, skb->len);
  	if (err)
  		return err;
@@ -1114,82 +1104,44 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
  			if (err)
  				goto do_fault;
  		} else {
+			struct page_frag_cache *pfrag = &sk->sk_frag;

Is this even valid? Shouldn't it be using sk_page_frag to get the
reference here? Seems like it might be trying to instantiate an unused
cache.

chtls_sendmsg() only uses sk->sk_frag, see below.

I am not sure I understand what you meant by "trying to instantiate
an unused cache". sk->sk_frag is supposed to be instantiated in
sock_init_data_uid() by calling page_frag_cache_init() in this patch.
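For reference, the only setup the cache needs is that init call; a minimal
sketch of what this patch does in sock_init_data_uid() (not the exact hunk):

	/* replaces the old sk->sk_frag.page/offset initialization */
	page_frag_cache_init(&sk->sk_frag);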


As per my earlier suggestion this could be made very simple if we are
just pulling a bio_vec out from the page cache at the start. With that
we could essentially plug it into the TCP_PAGE/TCP_OFF block here and
most of it would just function the same.

I am not sure we have the same understanding of why chtls needs more
changes than other places when replacing page_frag with
page_frag_cache.

chtls_sendmsg() duplicated its own implementation of page_frag
refilling instead of using skb_page_frag_refill(); we can remove that
implementation by using the new API, which is why there are more
changes for chtls than for other places. Are you suggesting keeping
chtls's own implementation of page_frag refilling by 'plugging it into
the TCP_PAGE/TCP_OFF block' here?
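To illustrate, the prepare/commit usage pattern the new API enforces here
looks roughly like the below (a sketch based on the helpers used in this
patch; copy_data_into() is just a placeholder for the nocache copy step):

	struct page_frag_cache *pfrag = &sk->sk_frag;
	unsigned int offset, fragsz = 32U;
	struct page *page;
	void *va;

	/* probe the cache: get the current page/offset/va and available room */
	page = page_frag_alloc_prepare(pfrag, &offset, &fragsz, &va,
				       sk->sk_allocation);
	if (unlikely(!page))
		goto wait_for_memory;

	copy = min_t(int, copy, fragsz);
	err = copy_data_into(va, copy);	/* placeholder for the copy step */
	if (unlikely(err))
		goto do_fault;		/* nothing committed, cache unchanged */

	if (merge)
		/* coalesced into the previous frag, no extra page ref needed */
		page_frag_alloc_commit_noref(pfrag, copy);
	else
		/* new frag added to the skb, commit takes a page ref for it */
		page_frag_alloc_commit(pfrag, copy);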


  			int i = skb_shinfo(skb)->nr_frags;
-			struct page *page = TCP_PAGE(sk);

The TCP_PAGE macro is defined as below; that is why sk->sk_frag is used
instead of sk_page_frag() for the chtls case, if I understand your question
correctly:

#define TCP_PAGE(sk)   (sk->sk_frag.page)
#define TCP_OFF(sk)    (sk->sk_frag.offset)

-			int pg_size = PAGE_SIZE;
-			int off = TCP_OFF(sk);
-			bool merge;
-
-			if (page)
-				pg_size = page_size(page);
-			if (off < pg_size &&
-			    skb_can_coalesce(skb, i, page, off)) {
+			unsigned int offset, fragsz;
+			bool merge = false;
+			struct page *page;
+			void *va;
+
+			fragsz = 32U;
+			page = page_frag_alloc_prepare(pfrag, &offset, &fragsz,
+						       &va, sk->sk_allocation);
+			if (unlikely(!page))
+				goto wait_for_memory;
+
+			if (skb_can_coalesce(skb, i, page, offset))
  				merge = true;
-				goto copy;
-			}
-			merge = false;
-			if (i == (is_tls_tx(csk) ? (MAX_SKB_FRAGS - 1) :
-			    MAX_SKB_FRAGS))
+			else if (i == (is_tls_tx(csk) ? (MAX_SKB_FRAGS - 1) :
+				       MAX_SKB_FRAGS))
  				goto new_buf;
- if (page && off == pg_size) {
-				put_page(page);
-				TCP_PAGE(sk) = page = NULL;
-				pg_size = PAGE_SIZE;
-			}
-
-			if (!page) {
-				gfp_t gfp = sk->sk_allocation;
-				int order = cdev->send_page_order;
-
-				if (order) {
-					page = alloc_pages(gfp | __GFP_COMP |
-							   __GFP_NOWARN |
-							   __GFP_NORETRY,
-							   order);
-					if (page)
-						pg_size <<= order;
-				}
-				if (!page) {
-					page = alloc_page(gfp);
-					pg_size = PAGE_SIZE;
-				}
-				if (!page)
-					goto wait_for_memory;
-				off = 0;
-			}
-copy:
-			if (copy > pg_size - off)
-				copy = pg_size - off;
+			copy = min_t(int, copy, fragsz);
  			if (is_tls_tx(csk))
  				copy = min_t(int, copy, csk->tlshws.txleft);
- err = chtls_skb_copy_to_page_nocache(sk, &msg->msg_iter,
-							     skb, page,
-							     off, copy);
-			if (unlikely(err)) {
-				if (!TCP_PAGE(sk)) {
-					TCP_PAGE(sk) = page;
-					TCP_OFF(sk) = 0;
-				}
+			err = chtls_skb_copy_to_va_nocache(sk, &msg->msg_iter,
+							   skb, va, copy);
+			if (unlikely(err))
  				goto do_fault;
-			}
+
  			/* Update the skb. */
  			if (merge) {
  				skb_frag_size_add(
  						&skb_shinfo(skb)->frags[i - 1],
  						copy);
+				page_frag_alloc_commit_noref(pfrag, copy);
  			} else {
-				skb_fill_page_desc(skb, i, page, off, copy);
-				if (off + copy < pg_size) {
-					/* space left keep page */
-					get_page(page);
-					TCP_PAGE(sk) = page;
-				} else {
-					TCP_PAGE(sk) = NULL;
-				}
+				skb_fill_page_desc(skb, i, page, offset, copy);
+				page_frag_alloc_commit(pfrag, copy);
  			}
-			TCP_OFF(sk) = off + copy;
  		}
  		if (unlikely(skb->len == mss))
  			tx_skb_finalize(skb);

Really there is so much refactor here it is hard to tell what is what.
I would suggest just trying to plug in an intermediary value and you
can save the refactor for later.

I am not sure your above suggestion works; even if it does, I am not
sure it is that simple to just plug in an intermediary value, as the
fields in 'struct page_frag_cache' are quite different from the fields
in 'struct page_frag' (shown below) when replacing page_frag with
page_frag_cache for sk->sk_frag:

struct page_frag_cache {
	unsigned long encoded_va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
	__u16 remaining;
	__u16 pagecnt_bias;
#else
	__u32 remaining;
	__u32 pagecnt_bias;
#endif
};

struct page_frag {
	struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
	__u32 offset;
	__u32 size;
#else
	__u16 offset;
	__u16 size;
#endif
};



diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
index 455a54708be4..ba88b2fc7cd8 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
@@ -34,7 +34,6 @@ static DEFINE_MUTEX(notify_mutex);
  static RAW_NOTIFIER_HEAD(listen_notify_list);
  static struct proto chtls_cpl_prot, chtls_cpl_protv6;
  struct request_sock_ops chtls_rsk_ops, chtls_rsk_opsv6;
-static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 0 : 14 - PAGE_SHIFT;
static void register_listen_notifier(struct notifier_block *nb)
  {
@@ -273,8 +272,6 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
  	INIT_WORK(&cdev->deferq_task, process_deferq);
  	spin_lock_init(&cdev->listen_lock);
  	spin_lock_init(&cdev->idr_lock);
-	cdev->send_page_order = min_t(uint, get_order(32768),
-				      send_page_order);
  	cdev->max_host_sndbuf = 48 * 1024;
if (lldi->vr->key.size)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1d06c560c5e6..51df92fd60db 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1598,21 +1598,19 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
  }
static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
-				       struct page_frag *alloc_frag, char *buf,
-				       int buflen, int len, int pad)
+				       char *buf, int buflen, int len, int pad)
  {
  	struct sk_buff *skb = build_skb(buf, buflen);
- if (!skb)
+	if (!skb) {
+		page_frag_free_va(buf);
  		return ERR_PTR(-ENOMEM);
+	}
skb_reserve(skb, pad);
  	skb_put(skb, len);
  	skb_set_owner_w(skb, tfile->socket.sk);
- get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
-

Rather than freeing the buf it would be better if you were to just
stick to the existing pattern and commit the alloc_frag at the end here
instead of calling get_page.
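If I read that suggestion right, it would mean passing the cache down and
committing it once the skb is built, roughly like the below (a sketch of
the suggested shape, assuming tun_build_skb leaves the buffer uncommitted
until this point):

static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
				       struct page_frag_cache *alloc_frag,
				       char *buf, int buflen, int len, int pad)
{
	struct sk_buff *skb = build_skb(buf, buflen);

	if (!skb)
		return ERR_PTR(-ENOMEM);	/* nothing committed, nothing to free */

	skb_reserve(skb, pad);
	skb_put(skb, len);
	skb_set_owner_w(skb, tfile->socket.sk);

	/* takes the place of the old get_page() + alloc_frag->offset += buflen */
	page_frag_alloc_commit(alloc_frag, buflen);

	return skb;
}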

  	return skb;
  }
@@ -1660,7 +1658,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  				     struct virtio_net_hdr *hdr,
  				     int len, int *skb_xdp)
  {
-	struct page_frag *alloc_frag = &current->task_frag;
+	struct page_frag_cache *alloc_frag = &current->task_frag;
  	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
  	struct bpf_prog *xdp_prog;
  	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -1676,16 +1674,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  	buflen += SKB_DATA_ALIGN(len + pad);
  	rcu_read_unlock();
- alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+	buf = page_frag_alloc_va_align(alloc_frag, buflen, GFP_KERNEL,
+				       SMP_CACHE_BYTES);
+	if (unlikely(!buf))
  		return ERR_PTR(-ENOMEM);
- buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + pad,
-				     len, from);
-	if (copied != len)
+	copied = copy_from_iter(buf + pad, len, from);
+	if (copied != len) {
+		page_frag_alloc_abort(alloc_frag, buflen);
  		return ERR_PTR(-EFAULT);
+	}
/* There's a small window that XDP may be set after the check
  	 * of xdp_prog above, this should be rare and for simplicity
@@ -1693,8 +1691,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  	 */
  	if (hdr->gso_type || !xdp_prog) {
  		*skb_xdp = 1;
-		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
-				       pad);
+		return __tun_build_skb(tfile, buf, buflen, len, pad);
  	}
*skb_xdp = 0;
@@ -1711,21 +1708,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  		xdp_prepare_buff(&xdp, buf, pad, len, false);
act = bpf_prog_run_xdp(xdp_prog, &xdp);
-		if (act == XDP_REDIRECT || act == XDP_TX) {
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;

The above is only executed for the XDP_REDIRECT and XDP_TX cases.

-		}
  		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
-		if (err < 0) {
-			if (act == XDP_REDIRECT || act == XDP_TX)
-				put_page(alloc_frag->page);

And there is a put_page() immediately when xdp_do_redirect() or
tun_xdp_tx() fails in tun_xdp_act(), so could something else have taken
a reference to the page and modified it in some way even when
tun_xdp_act() returns an error? Could you be more specific about why
the above happens?

-			goto out;
-		}
-
  		if (err == XDP_REDIRECT)
  			xdp_do_flush();
-		if (err != XDP_PASS)
+
+		if (err == XDP_REDIRECT || err == XDP_TX) {
+			goto out;
+		} else if (err < 0 || err != XDP_PASS) {
+			page_frag_alloc_abort(alloc_frag, buflen);
  			goto out;
+		}

Your abort function here is not necessarily safe. It is assuming that
nothing else might have taken a reference to the page or modified it in
some way. Generally we shouldn't allow rewinding the pointer until we
check the page count to guarantee nobody else is now working with a
copy of the page.

  		pad = xdp.data - xdp.data_hard_start;
  		len = xdp.data_end - xdp.data;
@@ -1734,7 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  	rcu_read_unlock();
  	local_bh_enable();
- return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
+	return __tun_build_skb(tfile, buf, buflen, len, pad);
out:
  	bpf_net_ctx_clear(bpf_net_ctx);


...




