> -----Original Message-----
> From: Simon Horman <horms@xxxxxxxxxx>
> Sent: Friday, September 29, 2023 4:57 AM
> To: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
> Cc: linux-hyperv@xxxxxxxxxxxxxxx; netdev@xxxxxxxxxxxxxxx;
> Dexuan Cui <decui@xxxxxxxxxxxxx>; KY Srinivasan <kys@xxxxxxxxxxxxx>;
> Paul Rosswurm <paulros@xxxxxxxxxxxxx>; olaf@xxxxxxxxx;
> vkuznets <vkuznets@xxxxxxxxxx>; davem@xxxxxxxxxxxxx; wei.liu@xxxxxxxxxx;
> edumazet@xxxxxxxxxx; kuba@xxxxxxxxxx; pabeni@xxxxxxxxxx; leon@xxxxxxxxxx;
> Long Li <longli@xxxxxxxxxxxxx>; ssengar@xxxxxxxxxxxxxxxxxxx;
> linux-rdma@xxxxxxxxxxxxxxx; daniel@xxxxxxxxxxxxx; john.fastabend@xxxxxxxxx;
> bpf@xxxxxxxxxxxxxxx; ast@xxxxxxxxxx; Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>;
> hawk@xxxxxxxxxx; tglx@xxxxxxxxxxxxx; shradhagupta@xxxxxxxxxxxxxxxxxxx;
> linux-kernel@xxxxxxxxxxxxxxx
> Subject: Re: [PATCH net, 3/3] net: mana: Fix oversized sge0 for GSO packets
>
> On Sat, Sep 23, 2023 at 06:31:47PM -0700, Haiyang Zhang wrote:
> > Handle the case when GSO SKB linear length is too large.
> >
> > MANA NIC requires GSO packets to put only the header part to SGE0,
> > otherwise the TX queue may stop at the HW level.
> >
> > So, use 2 SGEs for the skb linear part which contains more than the
> > packet header.
> >
> > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> > Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
>
> Hi Haiyang Zhang,
>
> thanks for your patch.
> Please find some feedback inline.
>
> > ---
> >  drivers/net/ethernet/microsoft/mana/mana_en.c | 186 ++++++++++++------
> >  include/net/mana/mana.h                       |   5 +-
> >  2 files changed, 134 insertions(+), 57 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 86e724c3eb89..0a3879163b56 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -91,63 +91,136 @@ static unsigned int mana_checksum_info(struct sk_buff *skb)
> >  	return 0;
> >  }
> >
> > +static inline void mana_add_sge(struct mana_tx_package *tp,
> > +				struct mana_skb_head *ash, int sg_i,
> > +				dma_addr_t da, int sge_len, u32 gpa_mkey)
>
> Please don't use inline for code in .c files unless there
> is a demonstrable reason to do so: in general, the compiler should be
> left to inline code as it sees fit.

Sure, will remove the "inline".
> > > +{ > > + ash->dma_handle[sg_i] = da; > > + ash->size[sg_i] = sge_len; > > + > > + tp->wqe_req.sgl[sg_i].address = da; > > + tp->wqe_req.sgl[sg_i].mem_key = gpa_mkey; > > + tp->wqe_req.sgl[sg_i].size = sge_len; > > +} > > + > > static int mana_map_skb(struct sk_buff *skb, struct mana_port_context > *apc, > > - struct mana_tx_package *tp) > > + struct mana_tx_package *tp, int gso_hs) > > { > > struct mana_skb_head *ash = (struct mana_skb_head *)skb->head; > > + int hsg = 1; /* num of SGEs of linear part */ > > struct gdma_dev *gd = apc->ac->gdma_dev; > > + int skb_hlen = skb_headlen(skb); > > + int sge0_len, sge1_len = 0; > > struct gdma_context *gc; > > struct device *dev; > > skb_frag_t *frag; > > dma_addr_t da; > > + int sg_i; > > int i; > > > > gc = gd->gdma_context; > > dev = gc->dev; > > - da = dma_map_single(dev, skb->data, skb_headlen(skb), > DMA_TO_DEVICE); > > > > + if (gso_hs && gso_hs < skb_hlen) { > > + sge0_len = gso_hs; > > + sge1_len = skb_hlen - gso_hs; > > + } else { > > + sge0_len = skb_hlen; > > + } > > + > > + da = dma_map_single(dev, skb->data, sge0_len, DMA_TO_DEVICE); > > if (dma_mapping_error(dev, da)) > > return -ENOMEM; > > > > - ash->dma_handle[0] = da; > > - ash->size[0] = skb_headlen(skb); > > + mana_add_sge(tp, ash, 0, da, sge0_len, gd->gpa_mkey); > > > > - tp->wqe_req.sgl[0].address = ash->dma_handle[0]; > > - tp->wqe_req.sgl[0].mem_key = gd->gpa_mkey; > > - tp->wqe_req.sgl[0].size = ash->size[0]; > > + if (sge1_len) { > > + sg_i = 1; > > + da = dma_map_single(dev, skb->data + sge0_len, sge1_len, > > + DMA_TO_DEVICE); > > + if (dma_mapping_error(dev, da)) > > + goto frag_err; > > + > > + mana_add_sge(tp, ash, sg_i, da, sge1_len, gd->gpa_mkey); > > + hsg = 2; > > + } > > > > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { > > + sg_i = hsg + i; > > + > > frag = &skb_shinfo(skb)->frags[i]; > > da = skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), > > DMA_TO_DEVICE); > > - > > if (dma_mapping_error(dev, da)) > > goto frag_err; > > > > - ash->dma_handle[i + 1] = da; > > - ash->size[i + 1] = skb_frag_size(frag); > > - > > - tp->wqe_req.sgl[i + 1].address = ash->dma_handle[i + 1]; > > - tp->wqe_req.sgl[i + 1].mem_key = gd->gpa_mkey; > > - tp->wqe_req.sgl[i + 1].size = ash->size[i + 1]; > > + mana_add_sge(tp, ash, sg_i, da, skb_frag_size(frag), > > + gd->gpa_mkey); > > } > > > > return 0; > > > > frag_err: > > - for (i = i - 1; i >= 0; i--) > > - dma_unmap_page(dev, ash->dma_handle[i + 1], ash->size[i + > 1], > > + for (i = sg_i - 1; i >= hsg; i--) > > + dma_unmap_page(dev, ash->dma_handle[i], ash->size[i], > > DMA_TO_DEVICE); > > > > - dma_unmap_single(dev, ash->dma_handle[0], ash->size[0], > DMA_TO_DEVICE); > > + for (i = hsg - 1; i >= 0; i--) > > + dma_unmap_single(dev, ash->dma_handle[i], ash->size[i], > > + DMA_TO_DEVICE); > > > > return -ENOMEM; > > } > > > > +/* Handle the case when GSO SKB linear length is too large. > > + * MANA NIC requires GSO packets to put only the packet header to SGE0. > > + * So, we need 2 SGEs for the skb linear part which contains more than the > > + * header. 
> > + */
> > +static inline int mana_fix_skb_head(struct net_device *ndev,
> > +				    struct sk_buff *skb, int gso_hs,
> > +				    u32 *num_sge)
> > +{
> > +	int skb_hlen = skb_headlen(skb);
> > +
> > +	if (gso_hs < skb_hlen) {
> > +		*num_sge = 2 + skb_shinfo(skb)->nr_frags;
> > +	} else if (gso_hs > skb_hlen) {
> > +		if (net_ratelimit())
> > +			netdev_err(ndev,
> > +				   "TX nonlinear head: hs:%d, skb_hlen:%d\n",
> > +				   gso_hs, skb_hlen);
> > +
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
>
> nit: I think it would be slightly nicer if the num_sge parameter of this
> function was removed and it returned negative values on error (already
> the case) and positive values, representing the number of segments, on
> success.

Will do.

> > +}
> > +
> > +/* Get the GSO packet's header size */
> > +static inline int mana_get_gso_hs(struct sk_buff *skb)
> > +{
> > +	int gso_hs;
> > +
> > +	if (skb->encapsulation) {
> > +		gso_hs = skb_inner_tcp_all_headers(skb);
> > +	} else {
> > +		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
> > +			gso_hs = skb_transport_offset(skb) +
> > +				 sizeof(struct udphdr);
> > +		} else {
> > +			gso_hs = skb_tcp_all_headers(skb);
> > +		}
> > +	}
> > +
> > +	return gso_hs;
> > +}
> > +
> >  netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  {
> >  	enum mana_tx_pkt_format pkt_fmt = MANA_SHORT_PKT_FMT;
> >  	struct mana_port_context *apc = netdev_priv(ndev);
> > +	int gso_hs = 0; /* zero for non-GSO pkts */
> >  	u16 txq_idx = skb_get_queue_mapping(skb);
> >  	struct gdma_dev *gd = apc->ac->gdma_dev;
> >  	bool ipv4 = false, ipv6 = false;
> > @@ -159,7 +232,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  	struct mana_txq *txq;
> >  	struct mana_cq *cq;
> >  	int err, len;
> > -	u16 ihs;
> >
> >  	if (unlikely(!apc->port_is_up))
> >  		goto tx_drop;
> > @@ -209,19 +281,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  	pkg.wqe_req.client_data_unit = 0;
> >
> >  	pkg.wqe_req.num_sge = 1 + skb_shinfo(skb)->nr_frags;
> > -	WARN_ON_ONCE(pkg.wqe_req.num_sge > MAX_TX_WQE_SGL_ENTRIES);
> > -
> > -	if (pkg.wqe_req.num_sge <= ARRAY_SIZE(pkg.sgl_array)) {
> > -		pkg.wqe_req.sgl = pkg.sgl_array;
> > -	} else {
> > -		pkg.sgl_ptr = kmalloc_array(pkg.wqe_req.num_sge,
> > -					    sizeof(struct gdma_sge),
> > -					    GFP_ATOMIC);
> > -		if (!pkg.sgl_ptr)
> > -			goto tx_drop_count;
> > -
> > -		pkg.wqe_req.sgl = pkg.sgl_ptr;
> > -	}
>
> It is unclear to me why this logic has moved from here to further
> down in this function. Is it to avoid some cases where
> allocation has to be unwound on error (when mana_fix_skb_head() fails)?
> If so, this feels more like an optimisation than a fix.

mana_fix_skb_head() may add one more SGE (in the success case), so the
SGL allocation should be done after it. Otherwise, we would need to free
and re-allocate the array when the SGE count changes.
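For v2, with the num_sge rework you suggested above, the helper could
look roughly like this (just a sketch, not the final code; it folds
nr_frags into the return value so the caller gets the total SGE count):

static int mana_fix_skb_head(struct net_device *ndev,
			     struct sk_buff *skb, int gso_hs)
{
	int num_sge = 1 + skb_shinfo(skb)->nr_frags;
	int skb_hlen = skb_headlen(skb);

	if (gso_hs < skb_hlen) {
		/* The linear part needs a second SGE for the payload
		 * bytes that follow the headers.
		 */
		num_sge++;
	} else if (gso_hs > skb_hlen) {
		if (net_ratelimit())
			netdev_err(ndev,
				   "TX nonlinear head: hs:%d, skb_hlen:%d\n",
				   gso_hs, skb_hlen);

		return -EINVAL;
	}

	return num_sge;
}

Then the caller can do something like (num_sge being a new local there):

	num_sge = mana_fix_skb_head(ndev, skb, gso_hs);
	if (num_sge < 0)
		goto tx_drop_count;
	pkg.wqe_req.num_sge = num_sge;

and size/allocate the SGL only after that.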
> >
> >  	if (skb->protocol == htons(ETH_P_IP))
> >  		ipv4 = true;
> > @@ -229,6 +288,23 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  		ipv6 = true;
> >
> >  	if (skb_is_gso(skb)) {
> > +		gso_hs = mana_get_gso_hs(skb);
> > +
> > +		if (mana_fix_skb_head(ndev, skb, gso_hs, &pkg.wqe_req.num_sge))
> > +			goto tx_drop_count;
> > +
> > +		if (skb->encapsulation) {
> > +			u64_stats_update_begin(&tx_stats->syncp);
> > +			tx_stats->tso_inner_packets++;
> > +			tx_stats->tso_inner_bytes += skb->len - gso_hs;
> > +			u64_stats_update_end(&tx_stats->syncp);
> > +		} else {
> > +			u64_stats_update_begin(&tx_stats->syncp);
> > +			tx_stats->tso_packets++;
> > +			tx_stats->tso_bytes += skb->len - gso_hs;
> > +			u64_stats_update_end(&tx_stats->syncp);
> > +		}
>
> nit: I wonder if this could be slightly more succinctly written as:
>
> 	u64_stats_update_begin(&tx_stats->syncp);
> 	if (skb->encapsulation) {
> 		tx_stats->tso_inner_packets++;
> 		tx_stats->tso_inner_bytes += skb->len - gso_hs;
> 	} else {
> 		tx_stats->tso_packets++;
> 		tx_stats->tso_bytes += skb->len - gso_hs;
> 	}
> 	u64_stats_update_end(&tx_stats->syncp);

Yes, it can be written this way. :)

> Also, it is unclear to me why the stats logic is moved here from
> further down in the same block. It feels more like a clean-up than a fix
> (as, btw, is my suggestion immediately above).

Since gso_hs needs to be calculated, and the head fixed, before the stats
update and some other work, I moved that code to immediately after the
skb_is_gso(skb) check. The gso_hs calculation used to be part of the
tx_stats block, so the tx_stats update is moved together with it to stay
close to the gso_hs calculation and keep readability.

> > +
> >  		pkg.tx_oob.s_oob.is_outer_ipv4 = ipv4;
> >  		pkg.tx_oob.s_oob.is_outer_ipv6 = ipv6;
> >
> > @@ -252,26 +328,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  						   &ipv6_hdr(skb)->daddr, 0,
> >  						   IPPROTO_TCP, 0);
> >  		}
> > -
> > -		if (skb->encapsulation) {
> > -			ihs = skb_inner_tcp_all_headers(skb);
> > -			u64_stats_update_begin(&tx_stats->syncp);
> > -			tx_stats->tso_inner_packets++;
> > -			tx_stats->tso_inner_bytes += skb->len - ihs;
> > -			u64_stats_update_end(&tx_stats->syncp);
> > -		} else {
> > -			if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
> > -				ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
> > -			} else {
> > -				ihs = skb_tcp_all_headers(skb);
> > -			}
> > -
> > -			u64_stats_update_begin(&tx_stats->syncp);
> > -			tx_stats->tso_packets++;
> > -			tx_stats->tso_bytes += skb->len - ihs;
> > -			u64_stats_update_end(&tx_stats->syncp);
> > -		}
> > -
> >  	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> >  		csum_type = mana_checksum_info(skb);
> >
> > @@ -294,11 +350,25 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >  		} else {
> >  			/* Can't do offload of this type of checksum */
> >  			if (skb_checksum_help(skb))
> > -				goto free_sgl_ptr;
> > +				goto tx_drop_count;
> >  		}
> >  	}
> >
> > -	if (mana_map_skb(skb, apc, &pkg)) {
> > +	WARN_ON_ONCE(pkg.wqe_req.num_sge > MAX_TX_WQE_SGL_ENTRIES);
> > +
> > +	if (pkg.wqe_req.num_sge <= ARRAY_SIZE(pkg.sgl_array)) {
> > +		pkg.wqe_req.sgl = pkg.sgl_array;
> > +	} else {
> > +		pkg.sgl_ptr = kmalloc_array(pkg.wqe_req.num_sge,
> > +					    sizeof(struct gdma_sge),
> > +					    GFP_ATOMIC);
> > +		if (!pkg.sgl_ptr)
> > +			goto tx_drop_count;
> > +
> > +		pkg.wqe_req.sgl = pkg.sgl_ptr;
> > +	}
> > +
> > +	if (mana_map_skb(skb, apc, &pkg, gso_hs)) {
> >  		u64_stats_update_begin(&tx_stats->syncp);
> >  		tx_stats->mana_map_err++;
> >  		u64_stats_update_end(&tx_stats->syncp);
> > @@ -1255,12 +1325,18 @@ static void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc)
> >  {
> >  	struct mana_skb_head *ash = (struct mana_skb_head *)skb->head;
> >  	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
> > +	int hsg = 1; /* num of SGEs of linear part */
> >  	struct device *dev = gc->dev;
> >  	int i;
> >
> > -	dma_unmap_single(dev, ash->dma_handle[0], ash->size[0], DMA_TO_DEVICE);
> > +	if (skb_is_gso(skb) && skb_headlen(skb) > ash->size[0])
> > +		hsg = 2;
>
> nit: Maybe this is nicer?
>
> 	/* num of SGEs of linear part */
> 	hsg = (skb_is_gso(skb) && skb_headlen(skb) > ash->size[0]) ? 2 : 1;

Will do.

Thanks,
- Haiyang
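P.S. With your suggested ternary folded in, the linear-part unmap in
mana_unmap_skb() would look roughly like the below (a sketch of the v2
shape; the unmap loop is the part cut off from the diff above):

	/* num of SGEs of linear part */
	hsg = (skb_is_gso(skb) && skb_headlen(skb) > ash->size[0]) ? 2 : 1;

	for (i = 0; i < hsg; i++)
		dma_unmap_single(dev, ash->dma_handle[i], ash->size[i],
				 DMA_TO_DEVICE);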