On 15/04/2024 13:16, Niklas Söderlund wrote: > Hi Paul, > > I think using page pool is a good idea! > > On 2024-04-15 10:48:04 +0100, Paul Barker wrote: >> This patch makes multiple changes that can't be separated: >> >> 1) Allocate plain RX buffers via a page pool instead of allocating >> SKBs, then use build_skb() when a packet is received. >> 2) For GbEth IP, reduce the RX buffer size to 2kB. >> 3) For GbEth IP, merge packets which span more than one RX descriptor >> as SKB fragments instead of copying data. >> >> Implementing (1) without (2) would require the use of an order-1 page >> pool (instead of an order-0 page pool split into page fragments) for >> GbEth. >> >> Implementing (2) without (3) would leave us no space to re-assemble >> packets which span more than one RX descriptor. >> >> Implementing (3) without (1) would not be possible as the network stack >> expects to use put_page() or page_pool_put_page() to free SKB fragments >> after an SKB is consumed. >> >> This patch gives the following improvements during testing with iperf3. >> >> * RZ/G2L: >> * TCP RX: same bandwidth at -43% CPU load (70% -> 40%) >> * UDP RX: same bandwidth at -17% CPU load (88% -> 74%) >> >> * RZ/G2UL: >> * TCP RX: +30% bandwidth (726Mbps -> 941Mbps) >> * UDP RX: +417% bandwidth (108Mbps -> 558Mbps) >> >> * RZ/G3S: >> * TCP RX: +64% bandwidth (562Mbps -> 920Mbps) >> * UDP RX: +420% bandwidth (90Mbps -> 468Mbps) >> >> * RZ/Five: >> * TCP RX: +217% bandwidth (145Mbps -> 459Mbps) >> * UDP RX: +470% bandwidth (20Mbps -> 114Mbps) >> >> There is no significant impact on bandwidth or CPU load in testing on >> RZ/G2H or R-Car M3N. 
>> >> Signed-off-by: Paul Barker <paul.barker.ct@xxxxxxxxxxxxxx> >> --- >> drivers/net/ethernet/renesas/ravb.h | 10 +- >> drivers/net/ethernet/renesas/ravb_main.c | 209 +++++++++++++---------- >> 2 files changed, 128 insertions(+), 91 deletions(-) >> >> diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h >> index 9c6392ade2f1..4348366c3dc7 100644 >> --- a/drivers/net/ethernet/renesas/ravb.h >> +++ b/drivers/net/ethernet/renesas/ravb.h >> @@ -1050,8 +1050,8 @@ struct ravb_hw_info { >> netdev_features_t net_features; >> int stats_len; >> u32 tccr_mask; >> + u32 rx_buffer_size; >> u32 rx_max_frame_size; >> - u32 rx_max_desc_use; >> u32 rx_desc_size; >> unsigned aligned_tx: 1; >> unsigned needs_irq_coalesce:1; /* Needs software IRQ coalescing */ >> @@ -1071,6 +1071,11 @@ struct ravb_hw_info { >> unsigned half_duplex:1; /* E-MAC supports half duplex mode */ >> }; >> >> +struct ravb_rx_buffer { >> + struct page *page; >> + unsigned int offset; >> +}; >> + >> struct ravb_private { >> struct net_device *ndev; >> struct platform_device *pdev; >> @@ -1094,7 +1099,8 @@ struct ravb_private { >> struct ravb_tx_desc *tx_ring[NUM_TX_QUEUE]; >> void *tx_align[NUM_TX_QUEUE]; >> struct sk_buff *rx_1st_skb; >> - struct sk_buff **rx_skb[NUM_RX_QUEUE]; >> + struct page_pool *rx_pool; > > Don't we need a page pool per queue? Otherwise, multiple calls to > ravb_ring_init() and ravb_ring_free() for different queues risk > allocating over a previous queue and freeing the same one multiple > times. Ack. 
> >> + struct ravb_rx_buffer *rx_buffers[NUM_RX_QUEUE]; >> struct sk_buff **tx_skb[NUM_TX_QUEUE]; >> u32 rx_over_errors; >> u32 rx_fifo_errors; >> diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c >> index 7434faf0820c..892a3eadef1e 100644 >> --- a/drivers/net/ethernet/renesas/ravb_main.c >> +++ b/drivers/net/ethernet/renesas/ravb_main.c >> @@ -30,6 +30,7 @@ >> #include <linux/reset.h> >> #include <linux/math64.h> >> #include <net/ip.h> >> +#include <net/page_pool/helpers.h> >> >> #include "ravb.h" >> >> @@ -113,25 +114,6 @@ static void ravb_set_rate_rcar(struct net_device *ndev) >> } >> } >> >> -static struct sk_buff * >> -ravb_alloc_skb(struct net_device *ndev, const struct ravb_hw_info *info, >> - gfp_t gfp_mask) >> -{ >> - struct sk_buff *skb; >> - u32 reserve; >> - >> - skb = __netdev_alloc_skb(ndev, info->rx_max_frame_size + RAVB_ALIGN - 1, >> - gfp_mask); >> - if (!skb) >> - return NULL; >> - >> - reserve = (unsigned long)skb->data & (RAVB_ALIGN - 1); >> - if (reserve) >> - skb_reserve(skb, RAVB_ALIGN - reserve); >> - >> - return skb; >> -} >> - >> /* Get MAC address from the MAC address registers >> * >> * Ethernet AVB device doesn't have ROM for MAC address. 
>> @@ -257,21 +239,10 @@ static void ravb_rx_ring_free(struct net_device *ndev, int q) >> { >> struct ravb_private *priv = netdev_priv(ndev); >> unsigned int ring_size; >> - unsigned int i; >> >> if (!priv->rx_ring[q].raw) >> return; >> >> - for (i = 0; i < priv->num_rx_ring[q]; i++) { >> - struct ravb_rx_desc *desc = ravb_rx_get_desc(priv, q, i); >> - >> - if (!dma_mapping_error(ndev->dev.parent, >> - le32_to_cpu(desc->dptr))) >> - dma_unmap_single(ndev->dev.parent, >> - le32_to_cpu(desc->dptr), >> - priv->info->rx_max_frame_size, >> - DMA_FROM_DEVICE); >> - } >> ring_size = priv->info->rx_desc_size * (priv->num_rx_ring[q] + 1); >> dma_free_coherent(ndev->dev.parent, ring_size, priv->rx_ring[q].raw, >> priv->rx_desc_dma[q]); >> @@ -298,13 +269,14 @@ static void ravb_ring_free(struct net_device *ndev, int q) >> priv->tx_ring[q] = NULL; >> } >> >> - /* Free RX skb ringbuffer */ >> - if (priv->rx_skb[q]) { >> - for (i = 0; i < priv->num_rx_ring[q]; i++) >> - dev_kfree_skb(priv->rx_skb[q][i]); >> + /* Free RX buffers */ >> + for (i = 0; i < priv->num_rx_ring[q]; i++) { >> + if (priv->rx_buffers[q][i].page) >> + page_pool_put_page(priv->rx_pool, priv->rx_buffers[q][i].page, 0, true); >> } >> - kfree(priv->rx_skb[q]); >> - priv->rx_skb[q] = NULL; >> + kfree(priv->rx_buffers[q]); >> + priv->rx_buffers[q] = NULL; >> + page_pool_destroy(priv->rx_pool); >> >> /* Free aligned TX buffers */ >> kfree(priv->tx_align[q]); >> @@ -317,35 +289,54 @@ static void ravb_ring_free(struct net_device *ndev, int q) >> priv->tx_skb[q] = NULL; >> } >> >> +static int >> +ravb_alloc_rx_buffer(struct net_device *ndev, int q, u32 entry, gfp_t gfp_mask, >> + __le32 *dptr) > > Why not pass the struct ravb_rx_desc instead of a dptr? Then the > function can deal with the error case and fill in rx_desc->dptr and > rx_desc->ds_cc directly making the caller simpler. Ack. 
> >> +{ >> + struct ravb_private *priv = netdev_priv(ndev); >> + const struct ravb_hw_info *info = priv->info; >> + struct ravb_rx_buffer *rx_buff = &priv->rx_buffers[q][entry]; >> + dma_addr_t dma_addr; >> + unsigned int size; >> + >> + size = info->rx_buffer_size; >> + rx_buff->page = page_pool_alloc(priv->rx_pool, &rx_buff->offset, &size, >> + gfp_mask); >> + if (unlikely(!rx_buff->page)) >> + return -ENOMEM; >> + >> + dma_addr = page_pool_get_dma_addr(rx_buff->page) + rx_buff->offset; >> + dma_sync_single_for_device(ndev->dev.parent, dma_addr, >> + info->rx_buffer_size, DMA_FROM_DEVICE); >> + *dptr = cpu_to_le32(dma_addr); >> + return 0; >> +} >> + >> static u32 >> ravb_rx_ring_refill(struct net_device *ndev, int q, u32 count, gfp_t gfp_mask) >> { >> struct ravb_private *priv = netdev_priv(ndev); >> const struct ravb_hw_info *info = priv->info; >> struct ravb_rx_desc *rx_desc; >> - dma_addr_t dma_addr; >> u32 i, entry; >> >> for (i = 0; i < count; i++) { >> entry = (priv->dirty_rx[q] + i) % priv->num_rx_ring[q]; >> rx_desc = ravb_rx_get_desc(priv, q, entry); >> - rx_desc->ds_cc = cpu_to_le16(info->rx_max_desc_use); >> >> - if (!priv->rx_skb[q][entry]) { >> - priv->rx_skb[q][entry] = ravb_alloc_skb(ndev, info, gfp_mask); >> - if (!priv->rx_skb[q][entry]) >> - break; >> - dma_addr = dma_map_single(ndev->dev.parent, >> - priv->rx_skb[q][entry]->data, >> - priv->info->rx_max_frame_size, >> - DMA_FROM_DEVICE); >> - skb_checksum_none_assert(priv->rx_skb[q][entry]); >> - /* We just set the data size to 0 for a failed mapping >> - * which should prevent DMA from happening... >> - */ >> - if (dma_mapping_error(ndev->dev.parent, dma_addr)) >> + if (!priv->rx_buffers[q][entry].page) { >> + if (unlikely(ravb_alloc_rx_buffer(ndev, q, entry, gfp_mask, >> + &rx_desc->dptr))) { >> + /* We just set the data size to 0 for a failed mapping >> + * which should prevent DMA from happening... 
>> + */ >> rx_desc->ds_cc = cpu_to_le16(0); >> - rx_desc->dptr = cpu_to_le32(dma_addr); >> + break; >> + } >> + >> + rx_desc->ds_cc = cpu_to_le16(info->rx_buffer_size >> + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) >> + - ETH_FCS_LEN + sizeof(__sum16)); > > Can a comment be added to explain why we subtract and add things to the size? Ack. I'll address these in v4. -- Paul Barker
Attachment:
OpenPGP_0x27F4B3459F002257.asc
Description: OpenPGP public key
Attachment:
OpenPGP_signature.asc
Description: OpenPGP digital signature