On Thu, Feb 13, 2025 at 11:01:38AM +0530, Suman Ghosh wrote:
> This patch adds support to AF_XDP zero copy for CN10K.
> This patch specifically adds receive side support. In this approach,
> once an XDP program with zero-copy support is enabled on a specific rx
> queue, that receive queue is disabled/detached from the existing
> kernel queue and re-assigned to the umem memory.
>
> Signed-off-by: Suman Ghosh <sumang@xxxxxxxxxxx>
> ---
>  .../ethernet/marvell/octeontx2/nic/Makefile   |   2 +-
>  .../ethernet/marvell/octeontx2/nic/cn10k.c    |   7 +-
>  .../marvell/octeontx2/nic/otx2_common.c       | 114 ++++++++---
>  .../marvell/octeontx2/nic/otx2_common.h       |   6 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |  25 ++-
>  .../marvell/octeontx2/nic/otx2_txrx.c         |  73 +++++--
>  .../marvell/octeontx2/nic/otx2_txrx.h         |   6 +
>  .../ethernet/marvell/octeontx2/nic/otx2_vf.c  |  12 +-
>  .../ethernet/marvell/octeontx2/nic/otx2_xsk.c | 182 ++++++++++++++++++
>  .../ethernet/marvell/octeontx2/nic/otx2_xsk.h |  21 ++
>  .../ethernet/marvell/octeontx2/nic/qos_sq.c   |   2 +-
>  11 files changed, 389 insertions(+), 61 deletions(-)
>  create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c
>  create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.h
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> index cb6513ab35e7..69e0778f9ac1 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> @@ -9,7 +9,7 @@ obj-$(CONFIG_RVU_ESWITCH) += rvu_rep.o
>
>  rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
>                 otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
> -               otx2_devlink.o qos_sq.o qos.o
> +               otx2_devlink.o qos_sq.o qos.o otx2_xsk.o
>  rvu_nicvf-y := otx2_vf.o
>  rvu_rep-y := rep.o
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> index a15cc86635d6..c3b6e0f60a79 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> @@ -112,9 +112,12 @@ int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
>  	struct otx2_nic *pfvf = dev;
>  	int cnt = cq->pool_ptrs;
>  	u64 ptrs[NPA_MAX_BURST];
> +	struct otx2_pool *pool;
>  	dma_addr_t bufptr;
>  	int num_ptrs = 1;
>
> +	pool = &pfvf->qset.pool[cq->cq_idx];
> +
>  	/* Refill pool with new buffers */
>  	while (cq->pool_ptrs) {
>  		if (otx2_alloc_buffer(pfvf, cq, &bufptr)) {
> @@ -124,7 +127,9 @@ int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
>  			break;
>  		}
>  		cq->pool_ptrs--;
> -		ptrs[num_ptrs] = (u64)bufptr + OTX2_HEAD_ROOM;
> +		ptrs[num_ptrs] = pool->xsk_pool ?
> +				 (u64)bufptr : (u64)bufptr + OTX2_HEAD_ROOM;
> +
>  		num_ptrs++;
>  		if (num_ptrs == NPA_MAX_BURST || cq->pool_ptrs == 0) {
>  			__cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs,
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> index 161cf33ef89e..92b0dba07853 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> @@ -17,6 +17,7 @@
>  #include "otx2_common.h"
>  #include "otx2_struct.h"
>  #include "cn10k.h"
> +#include "otx2_xsk.h"
>
>  static bool otx2_is_pfc_enabled(struct otx2_nic *pfvf)
>  {
> @@ -549,10 +550,13 @@ static int otx2_alloc_pool_buf(struct otx2_nic *pfvf, struct otx2_pool *pool,
>  }
>
>  static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> -			     dma_addr_t *dma)
> +			     dma_addr_t *dma, int qidx, int idx)
>  {
>  	u8 *buf;
>
> +	if (pool->xsk_pool)
> +		return otx2_xsk_pool_alloc_buf(pfvf, pool, dma, idx);
> +
>  	if (pool->page_pool)
>  		return otx2_alloc_pool_buf(pfvf, pool, dma);
>
> @@ -571,12 +575,12 @@ static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
>  }
>
>  int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> -		    dma_addr_t *dma)
> +		    dma_addr_t *dma, int qidx, int idx)
>  {
>  	int ret;
>
>  	local_bh_disable();
> -	ret = __otx2_alloc_rbuf(pfvf, pool, dma);
> +	ret = __otx2_alloc_rbuf(pfvf, pool, dma, qidx, idx);
>  	local_bh_enable();
>  	return ret;
>  }
> @@ -584,7 +588,8 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
>  int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq,
>  		      dma_addr_t *dma)
>  {
> -	if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma)))
> +	if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma,
> +				       cq->cq_idx, cq->pool_ptrs - 1)))
>  		return -ENOMEM;
>  	return 0;
>  }
> @@ -884,7 +889,7 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
>  #define RQ_PASS_LVL_AURA (255 - ((95 * 256) / 100)) /* RED when 95% is full */
>  #define RQ_DROP_LVL_AURA (255 - ((99 * 256) / 100)) /* Drop when 99% is full */
>
> -static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
> +int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
>  {
>  	struct otx2_qset *qset = &pfvf->qset;
>  	struct nix_aq_enq_req *aq;
> @@ -1041,7 +1046,7 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
>
>  }
>
> -static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
> +int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
>  {
>  	struct otx2_qset *qset = &pfvf->qset;
>  	int err, pool_id, non_xdp_queues;
> @@ -1057,11 +1062,18 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
>  		cq->cint_idx = qidx;
>  		cq->cqe_cnt = qset->rqe_cnt;
>  		if (pfvf->xdp_prog) {
> -			pool = &qset->pool[qidx];
>  			xdp_rxq_info_reg(&cq->xdp_rxq, pfvf->netdev, qidx, 0);
> -			xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> -						   MEM_TYPE_PAGE_POOL,
> -						   pool->page_pool);
> +			pool = &qset->pool[qidx];
> +			if (pool->xsk_pool) {
> +				xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> +							   MEM_TYPE_XSK_BUFF_POOL,
> +							   NULL);
> +				xsk_pool_set_rxq_info(pool->xsk_pool, &cq->xdp_rxq);
> +			} else if (pool->page_pool) {
> +				xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> +							   MEM_TYPE_PAGE_POOL,
> +							   pool->page_pool);
> +			}
>  		}
>  	} else if (qidx < non_xdp_queues) {
>  		cq->cq_type = CQ_TX;
> @@ -1281,9 +1293,10 @@ void otx2_free_bufs(struct otx2_nic *pfvf, struct otx2_pool *pool,
>
>  	pa = otx2_iova_to_phys(pfvf->iommu_domain, iova);
>  	page = virt_to_head_page(phys_to_virt(pa));
> -
>  	if (pool->page_pool) {
>  		page_pool_put_full_page(pool->page_pool, page, true);
> +	} else if (pool->xsk_pool) {
> +		/* Note: No way of identifying xdp_buff */
>  	} else {
>  		dma_unmap_page_attrs(pfvf->dev, iova, size,
>  				     DMA_FROM_DEVICE,
> @@ -1298,6 +1311,7 @@ void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type)
>  	int pool_id, pool_start = 0, pool_end = 0, size = 0;
>  	struct otx2_pool *pool;
>  	u64 iova;
> +	int idx;
>
>  	if (type == AURA_NIX_SQ) {
>  		pool_start = otx2_get_pool_idx(pfvf, type, 0);
> @@ -1312,16 +1326,21 @@ void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type)
>
>  	/* Free SQB and RQB pointers from the aura pool */
>  	for (pool_id = pool_start; pool_id < pool_end; pool_id++) {
> -		iova = otx2_aura_allocptr(pfvf, pool_id);
>  		pool = &pfvf->qset.pool[pool_id];
> +		iova = otx2_aura_allocptr(pfvf, pool_id);
>  		while (iova) {
>  			if (type == AURA_NIX_RQ)
>  				iova -= OTX2_HEAD_ROOM;
> -
>  			otx2_free_bufs(pfvf, pool, iova, size);
> -
>  			iova = otx2_aura_allocptr(pfvf, pool_id);
>  		}
> +
> +		for (idx = 0 ; idx < pool->xdp_cnt; idx++) {
> +			if (!pool->xdp[idx])
> +				continue;
> +
> +			xsk_buff_free(pool->xdp[idx]);
> +		}
>  	}
>  }
>
> @@ -1338,7 +1357,8 @@ void otx2_aura_pool_free(struct otx2_nic *pfvf)
>  		qmem_free(pfvf->dev, pool->stack);
>  		qmem_free(pfvf->dev, pool->fc_addr);
>  		page_pool_destroy(pool->page_pool);
> -		pool->page_pool = NULL;
> +		devm_kfree(pfvf->dev, pool->xdp);
> +		pool->xsk_pool = NULL;
>  	}
>  	devm_kfree(pfvf->dev, pfvf->qset.pool);
>  	pfvf->qset.pool = NULL;
> @@ -1425,6 +1445,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
>  		   int stack_pages, int numptrs, int buf_size, int type)
>  {
>  	struct page_pool_params pp_params = { 0 };
> +	struct xsk_buff_pool *xsk_pool;
>  	struct npa_aq_enq_req *aq;
>  	struct otx2_pool *pool;
>  	int err;
> @@ -1468,21 +1489,35 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
>  	aq->ctype = NPA_AQ_CTYPE_POOL;
>  	aq->op = NPA_AQ_INSTOP_INIT;
>
> -	if (type != AURA_NIX_RQ) {
> -		pool->page_pool = NULL;
> +	if (type != AURA_NIX_RQ)
> +		return 0;
> +
> +	if (!test_bit(pool_id, pfvf->af_xdp_zc_qidx)) {
> +		pp_params.order = get_order(buf_size);
> +		pp_params.flags = PP_FLAG_DMA_MAP;
> +		pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
> +		pp_params.nid = NUMA_NO_NODE;
> +		pp_params.dev = pfvf->dev;
> +		pp_params.dma_dir = DMA_FROM_DEVICE;
> +		pool->page_pool = page_pool_create(&pp_params);
> +		if (IS_ERR(pool->page_pool)) {
> +			netdev_err(pfvf->netdev, "Creation of page pool failed\n");
> +			return PTR_ERR(pool->page_pool);
> +		}
>  		return 0;
>  	}
>
> -	pp_params.order = get_order(buf_size);
> -	pp_params.flags = PP_FLAG_DMA_MAP;
> -	pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
> -	pp_params.nid = NUMA_NO_NODE;
> -	pp_params.dev = pfvf->dev;
> -	pp_params.dma_dir = DMA_FROM_DEVICE;
> -	pool->page_pool = page_pool_create(&pp_params);
> -	if (IS_ERR(pool->page_pool)) {
> -		netdev_err(pfvf->netdev, "Creation of page pool failed\n");
> -		return PTR_ERR(pool->page_pool);
> +	/* Set XSK pool to support AF_XDP zero-copy */
> +	xsk_pool = xsk_get_pool_from_qid(pfvf->netdev, pool_id);
> +	if (xsk_pool) {
> +		pool->xsk_pool = xsk_pool;
> +		pool->xdp_cnt = numptrs;
> +		pool->xdp = devm_kcalloc(pfvf->dev,
> +					 numptrs, sizeof(struct xdp_buff *), GFP_KERNEL);

What is the rationale behind having a buffer pool within your driver
while you have this very same thing within xsk_buff_pool? You're
doubling your work. Just use xsk_buff_alloc_batch() and have a simpler
ZC implementation in your driver.
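For reference, a minimal sketch of what a batched refill could look
like. otx2_xsk_refill_batch() is a hypothetical helper name;
NPA_MAX_BURST and hw_ops->aura_freeptr() are taken from this patch,
while xsk_buff_alloc_batch() and xsk_buff_xdp_get_dma() come from
<net/xdp_sock_drv.h>:

	/* Refill an RQ aura straight from the xsk_buff_pool in bursts,
	 * with no driver-side pool->xdp[] bookkeeping. Returns the
	 * number of buffers actually handed to the aura.
	 */
	static u32 otx2_xsk_refill_batch(struct otx2_nic *pfvf,
					 struct otx2_pool *pool,
					 int pool_id, u32 count)
	{
		struct xdp_buff *xdp_bufs[NPA_MAX_BURST];
		u32 done = 0, i, n;

		while (done < count) {
			n = xsk_buff_alloc_batch(pool->xsk_pool, xdp_bufs,
						 min_t(u32, count - done,
						       NPA_MAX_BURST));
			if (!n)
				break;

			/* XSK frames carry no OTX2_HEAD_ROOM offset */
			for (i = 0; i < n; i++)
				pfvf->hw_ops->aura_freeptr(pfvf, pool_id,
						xsk_buff_xdp_get_dma(xdp_bufs[i]));
			done += n;
		}

		return done;
	}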
> +		if (IS_ERR(pool->xdp)) {
> +			netdev_err(pfvf->netdev, "Creation of xsk pool failed\n");
> +			return PTR_ERR(pool->xdp);
> +		}
>  	}
>
>  	return 0;
> @@ -1543,9 +1578,18 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf)
>  		}
>
>  		for (ptr = 0; ptr < num_sqbs; ptr++) {
> -			err = otx2_alloc_rbuf(pfvf, pool, &bufptr);
> -			if (err)
> +			err = otx2_alloc_rbuf(pfvf, pool, &bufptr, pool_id, ptr);
> +			if (err) {
> +				if (pool->xsk_pool) {
> +					ptr--;
> +					while (ptr >= 0) {
> +						xsk_buff_free(pool->xdp[ptr]);
> +						ptr--;
> +					}
> +				}
>  				goto err_mem;
> +			}
> +
>  			pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr);
>  			sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr;
>  		}
> @@ -1595,11 +1639,19 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf)
>  	/* Allocate pointers and free them to aura/pool */
>  	for (pool_id = 0; pool_id < hw->rqpool_cnt; pool_id++) {
>  		pool = &pfvf->qset.pool[pool_id];
> +
>  		for (ptr = 0; ptr < num_ptrs; ptr++) {
> -			err = otx2_alloc_rbuf(pfvf, pool, &bufptr);
> -			if (err)
> +			err = otx2_alloc_rbuf(pfvf, pool, &bufptr, pool_id, ptr);
> +			if (err) {
> +				if (pool->xsk_pool) {
> +					while (ptr)
> +						xsk_buff_free(pool->xdp[--ptr]);
> +				}
>  				return -ENOMEM;
> +			}
> +
>  			pfvf->hw_ops->aura_freeptr(pfvf, pool_id,
> +						   pool->xsk_pool ? bufptr :
>  						   bufptr + OTX2_HEAD_ROOM);
>  		}
>  	}
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> index d5fbccb289df..60508971b62f 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> @@ -532,6 +532,8 @@ struct otx2_nic {
>
>  	/* Inline ipsec */
>  	struct cn10k_ipsec ipsec;
> +	/* af_xdp zero-copy */
> +	unsigned long *af_xdp_zc_qidx;
>  };
>
>  static inline bool is_otx2_lbkvf(struct pci_dev *pdev)
> @@ -1003,7 +1005,7 @@ void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
>  void otx2_free_pending_sqe(struct otx2_nic *pfvf);
>  void otx2_sqb_flush(struct otx2_nic *pfvf);
>  int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> -		    dma_addr_t *dma);
> +		    dma_addr_t *dma, int qidx, int idx);
>  int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable);
>  void otx2_ctx_disable(struct mbox *mbox, int type, bool npa);
>  int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable);
> @@ -1033,6 +1035,8 @@ void otx2_pfaf_mbox_destroy(struct otx2_nic *pf);
>  void otx2_disable_mbox_intr(struct otx2_nic *pf);
>  void otx2_disable_napi(struct otx2_nic *pf);
>  irqreturn_t otx2_cq_intr_handler(int irq, void *cq_irq);
> +int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura);
> +int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx);
>
>  /* RSS configuration APIs*/
>  int otx2_rss_init(struct otx2_nic *pfvf);
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> index 4347a3c95350..50a42cd5d50a 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> @@ -27,6 +27,7 @@
>  #include "qos.h"
>  #include <rvu_trace.h>
>  #include "cn10k_ipsec.h"
> +#include "otx2_xsk.h"
>
>  #define DRV_NAME	"rvu_nicpf"
>  #define DRV_STRING	"Marvell RVU NIC Physical Function Driver"
> @@ -1662,9 +1663,7 @@ void otx2_free_hw_resources(struct otx2_nic *pf)
>  	struct nix_lf_free_req *free_req;
>  	struct mbox *mbox = &pf->mbox;
>  	struct otx2_cq_queue *cq;
> -	struct otx2_pool *pool;
>  	struct msg_req *req;
> -	int pool_id;
>  	int qidx;
>
>  	/* Ensure all SQE are processed */
> @@ -1705,13 +1704,6 @@ void otx2_free_hw_resources(struct otx2_nic *pf)
>  	/* Free RQ buffer pointers*/
>  	otx2_free_aura_ptr(pf, AURA_NIX_RQ);
>
> -	for (qidx = 0; qidx < pf->hw.rx_queues; qidx++) {
> -		pool_id = otx2_get_pool_idx(pf, AURA_NIX_RQ, qidx);
> -		pool = &pf->qset.pool[pool_id];
> -		page_pool_destroy(pool->page_pool);
> -		pool->page_pool = NULL;
> -	}
> -
>  	otx2_free_cq_res(pf);
>
>  	/* Free all ingress bandwidth profiles allocated */
> @@ -2788,6 +2780,8 @@ static int otx2_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
>  	switch (xdp->command) {
>  	case XDP_SETUP_PROG:
>  		return otx2_xdp_setup(pf, xdp->prog);
> +	case XDP_SETUP_XSK_POOL:
> +		return otx2_xsk_pool_setup(pf, xdp->xsk.pool, xdp->xsk.queue_id);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -2865,6 +2859,7 @@ static const struct net_device_ops otx2_netdev_ops = {
>  	.ndo_set_vf_vlan = otx2_set_vf_vlan,
>  	.ndo_get_vf_config = otx2_get_vf_config,
>  	.ndo_bpf = otx2_xdp,
> +	.ndo_xsk_wakeup = otx2_xsk_wakeup,
>  	.ndo_xdp_xmit = otx2_xdp_xmit,
>  	.ndo_setup_tc = otx2_setup_tc,
>  	.ndo_set_vf_trust = otx2_ndo_set_vf_trust,
> @@ -3203,16 +3198,26 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	/* Enable link notifications */
>  	otx2_cgx_config_linkevents(pf, true);
>
> +	pf->af_xdp_zc_qidx = bitmap_zalloc(qcount, GFP_KERNEL);

If this is taken from the ice driver, be aware that we got rid of the
bitmap tracking ZC-enabled queues; see adbf5a42341f ("ice: remove
af_xdp_zc_qps bitmap"). In case you still need the bitmap after going
through the referenced commit, please provide us some justification
why (a sketch of the bitmap-free check follows at the end of this
mail).

> +	if (!pf->af_xdp_zc_qidx) {
> +		err = -ENOMEM;
> +		goto err_sriov_cleannup;
> +	}
> +
>  #ifdef CONFIG_DCB
>  	err = otx2_dcbnl_set_ops(netdev);
>  	if (err)
> -		goto err_pf_sriov_init;
> +		goto err_free_zc_bmap;
>  #endif
>
>  	otx2_qos_init(pf, qos_txqs);
>
>  	return 0;
>
> +err_free_zc_bmap:
> +	bitmap_free(pf->af_xdp_zc_qidx);
> +err_sriov_cleannup:
> +	otx2_sriov_vfcfg_cleanup(pf);
>  err_pf_sriov_init:
>  	otx2_shutdown_tc(pf);
>  err_mcam_flow_del:

[...]
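For the record, a minimal sketch of the bitmap-free variant after
adbf5a42341f. otx2_xsk_pool_enabled() is a hypothetical helper name;
xsk_get_pool_from_qid() is from <net/xdp_sock_drv.h> and is already
used elsewhere in this patch:

	/* A queue is ZC-enabled exactly when an XSK pool is registered
	 * for it, so no af_xdp_zc_qidx bitmap (and none of its
	 * allocation/error-path handling in otx2_probe()) is needed.
	 */
	static bool otx2_xsk_pool_enabled(struct otx2_nic *pfvf, u16 qidx)
	{
		return !!xsk_get_pool_from_qid(pfvf->netdev, qidx);
	}

otx2_pool_init() could then branch on the xsk_get_pool_from_qid()
result it already fetches, instead of test_bit(pool_id,
pfvf->af_xdp_zc_qidx).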