From: Kaike Wan <kaike.wan@xxxxxxxxx> Ipoib netdev will share receive contexts with existing VNIC netdev. To achieve that, a dummy netdev is allocated with hfi1_devdata to own the receive contexts, and ipoib and VNIC netdevs will be put on top of it. Each receive context is associated with a single NAPI object. This patch adds the functions to receive incoming packets for accelerated ipoib. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> Signed-off-by: Sadanand Warrier <sadanand.warrier@xxxxxxxxx> Signed-off-by: Grzegorz Andrejczuk <grzegorz.andrejczuk@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/Makefile | 2 + drivers/infiniband/hw/hfi1/driver.c | 92 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/hfi.h | 5 +- drivers/infiniband/hw/hfi1/ipoib.h | 18 ++++++ drivers/infiniband/hw/hfi1/ipoib_rx.c | 71 +++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/netdev.h | 90 +++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/netdev_rx.c | 79 +++++++++++++++++++++++++++ 7 files changed, 355 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/ipoib_rx.c create mode 100644 drivers/infiniband/hw/hfi1/netdev.h create mode 100644 drivers/infiniband/hw/hfi1/netdev_rx.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 0b25713..2e89ec1 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -23,10 +23,12 @@ hfi1-y := \ intr.o \ iowait.o \ ipoib_main.o \ + ipoib_rx.o \ ipoib_tx.o \ mad.o \ mmu_rb.o \ msix.o \ + netdev_rx.o \ opfn.o \ pcie.o \ pio.o \ diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 049d15b..c5ed6ed 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2018 Intel Corporation. + * Copyright(c) 2015-2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -54,6 +54,7 @@ #include <linux/module.h> #include <linux/prefetch.h> #include <rdma/ib_verbs.h> +#include <linux/etherdevice.h> #include "hfi.h" #include "trace.h" @@ -63,6 +64,9 @@ #include "vnic.h" #include "fault.h" +#include "ipoib.h" +#include "netdev.h" + #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1550,6 +1554,81 @@ void handle_eflags(struct hfi1_packet *packet) show_eflags_errs(packet); } +static void hfi1_ipoib_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + struct net_device *netdev; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct napi_struct *napi = rcd->napi; + struct sk_buff *skb; + struct hfi1_netdev_rxq *rxq = container_of(napi, + struct hfi1_netdev_rxq, napi); + u32 extra_bytes; + u32 tlen, qpnum; + bool do_work, do_cnp; + struct hfi1_ipoib_dev_priv *priv; + + trace_hfi1_rcvhdr(packet); + + hfi1_setup_ib_header(packet); + + packet->ohdr = &((struct ib_header *)packet->hdr)->u.oth; + packet->grh = NULL; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return; + } + + qpnum = ib_bth_get_qpn(packet->ohdr); + netdev = hfi1_netdev_get_data(rcd->dd, qpnum); + if (!netdev) + goto drop_no_nd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + /* handle congestion notifications */ + do_work = hfi1_may_ecn(packet); + if (unlikely(do_work)) { + do_cnp = (packet->opcode != IB_OPCODE_CNP); + (void)hfi1_process_ecn_slowpath(hfi1_ipoib_priv(netdev)->qp, + packet, do_cnp); + } + + /* + * We have split point after last byte of DETH + * lets strip padding and CRC and ICRC. + * tlen is whole packet len so we need to + * subtract header size as well. + */ + tlen = packet->tlen; + extra_bytes = ib_bth_get_pad(packet->ohdr) + (SIZE_OF_CRC << 2) + + packet->hlen; + if (unlikely(tlen < extra_bytes)) + goto drop; + + tlen -= extra_bytes; + + skb = hfi1_ipoib_prepare_skb(rxq, tlen, packet->ebuf); + if (unlikely(!skb)) + goto drop; + + priv = hfi1_ipoib_priv(netdev); + hfi1_ipoib_update_rx_netstats(priv, 1, skb->len); + + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + + return; + +drop: + ++netdev->stats.rx_dropped; +drop_no_nd: + ibp = rcd_to_iport(packet->rcd); + ++ibp->rvp.n_pkt_drops; +} + /* * The following functions are called by the interrupt handler. They are type * specific handlers for each packet type. @@ -1757,3 +1836,14 @@ void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, }; + +const rhf_rcv_function_ptr netdev_rhf_rcv_functions[] = { + [RHF_RCV_TYPE_EXPECTED] = process_receive_invalid, + [RHF_RCV_TYPE_EAGER] = process_receive_invalid, + [RHF_RCV_TYPE_IB] = hfi1_ipoib_ib_rcv, + [RHF_RCV_TYPE_ERROR] = process_receive_error, + [RHF_RCV_TYPE_BYPASS] = hfi1_vnic_bypass_rcv, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, +}; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 5a9276c..c7d0aad 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -233,6 +233,8 @@ struct hfi1_ctxtdata { intr_handler fast_handler; /** slow handler */ intr_handler slow_handler; + /* napi pointer assiociated with netdev */ + struct napi_struct *napi; /* verbs rx_stats per rcd */ struct hfi1_opcode_stats_perctx *opstats; /* clear interrupt mask */ @@ -985,7 +987,7 @@ typedef void (*hfi1_make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct rvt_swqe *wqe); extern const rhf_rcv_function_ptr normal_rhf_rcv_functions[]; - +extern const rhf_rcv_function_ptr netdev_rhf_rcv_functions[]; /* return values for the RHF receive functions */ #define RHF_RCV_CONTINUE 0 /* keep going */ @@ -1417,6 +1419,7 @@ struct hfi1_devdata { struct hfi1_vnic_data vnic; /* Lock to protect IRQ SRC register access */ spinlock_t irq_src_lock; + struct net_device *dummy_netdev; /* Keeps track of IPoIB RSM rule users */ atomic_t ipoib_rsm_usr_num; diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index c2e63ca..ca00f6c 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -22,6 +22,7 @@ #include "hfi.h" #include "iowait.h" +#include "netdev.h" #include <rdma/ib_verbs.h> @@ -29,6 +30,7 @@ #define HFI1_IPOIB_TXREQ_NAME_LEN 32 +#define HFI1_IPOIB_PSEUDO_LEN 20 #define HFI1_IPOIB_ENCAP_LEN 4 struct hfi1_ipoib_dev_priv; @@ -119,6 +121,19 @@ struct hfi1_ipoib_rdma_netdev { } static inline void +hfi1_ipoib_update_rx_netstats(struct hfi1_ipoib_dev_priv *priv, + u64 packets, + u64 bytes) +{ + struct pcpu_sw_netstats *netstats = this_cpu_ptr(priv->netstats); + + u64_stats_update_begin(&netstats->syncp); + netstats->rx_packets += packets; + netstats->rx_bytes += bytes; + u64_stats_update_end(&netstats->syncp); +} + +static inline void hfi1_ipoib_update_tx_netstats(struct hfi1_ipoib_dev_priv *priv, u64 packets, u64 bytes) @@ -142,6 +157,9 @@ int hfi1_ipoib_send_dma(struct net_device *dev, void hfi1_ipoib_napi_tx_enable(struct net_device *dev); void hfi1_ipoib_napi_tx_disable(struct net_device *dev); +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data); + int hfi1_ipoib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c new file mode 100644 index 0000000..2485663 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#include "netdev.h" +#include "ipoib.h" + +#define HFI1_IPOIB_SKB_PAD ((NET_SKB_PAD) + (NET_IP_ALIGN)) + +static void copy_ipoib_buf(struct sk_buff *skb, void *data, int size) +{ + void *dst_data; + + skb_checksum_none_assert(skb); + skb->protocol = *((__be16 *)data); + + dst_data = skb_put(skb, size); + memcpy(dst_data, data, size); + skb->mac_header = HFI1_IPOIB_PSEUDO_LEN; + skb_pull(skb, HFI1_IPOIB_ENCAP_LEN); +} + +static struct sk_buff *prepare_frag_skb(struct napi_struct *napi, int size) +{ + struct sk_buff *skb; + int skb_size = SKB_DATA_ALIGN(size + HFI1_IPOIB_SKB_PAD); + void *frag; + + skb_size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + skb_size = SKB_DATA_ALIGN(skb_size); + frag = napi_alloc_frag(skb_size); + + if (unlikely(!frag)) + return napi_alloc_skb(napi, size); + + skb = build_skb(frag, skb_size); + + if (unlikely(!skb)) { + skb_free_frag(frag); + return NULL; + } + + skb_reserve(skb, HFI1_IPOIB_SKB_PAD); + return skb; +} + +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data) +{ + struct napi_struct *napi = &rxq->napi; + int skb_size = size + HFI1_IPOIB_ENCAP_LEN; + struct sk_buff *skb; + + /* + * For smaller(4k + skb overhead) allocations we will go using + * napi cache. Otherwise we will try to use napi frag cache. + */ + if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE)) + skb = napi_alloc_skb(napi, skb_size); + else + skb = prepare_frag_skb(napi, skb_size); + + if (unlikely(!skb)) + return NULL; + + copy_ipoib_buf(skb, data, size); + + return skb; +} diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h new file mode 100644 index 0000000..8992dfe --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#ifndef HFI1_NETDEV_H +#define HFI1_NETDEV_H + +#include "hfi.h" + +#include <linux/netdevice.h> +#include <linux/xarray.h> + +/** + * struct hfi1_netdev_rxq - Receive Queue for HFI + * dummy netdev. Both IPoIB and VNIC netdevices will be working on + * top of this device. + * @napi: napi object + * @priv: ptr to netdev_priv + * @rcd: ptr to receive context data + */ +struct hfi1_netdev_rxq { + struct napi_struct napi; + struct hfi1_netdev_priv *priv; + struct hfi1_ctxtdata *rcd; +}; + +/* + * Number of netdev contexts used. Ensure it is less than or equal to + * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). + */ +#define HFI1_MAX_NETDEV_CTXTS 8 + +/* Number of NETDEV RSM entries */ +#define NUM_NETDEV_MAP_ENTRIES HFI1_MAX_NETDEV_CTXTS + +/** + * struct hfi1_netdev_priv: data required to setup and run HFI netdev. + * @dd: hfi1_devdata + * @rxq: pointer to dummy netdev receive queues. + * @num_rx_q: number of receive queues + * @rmt_index: first free index in RMT Array + * @msix_start: first free MSI-X interrupt vector. + * @dev_tbl: netdev table for unique identifier VNIC and IPoIb VLANs. + * @enabled: atomic counter of netdevs enabling receive queues. + * When 0 NAPI will be disabled. + * @netdevs: atomic counter of netdevs using dummy netdev. + * When 0 receive queues will be freed. + */ +struct hfi1_netdev_priv { + struct hfi1_devdata *dd; + struct hfi1_netdev_rxq *rxq; + int num_rx_q; + int rmt_start; + struct xarray dev_tbl; + /* count of enabled napi polls */ + atomic_t enabled; + /* count of netdevs on top */ + atomic_t netdevs; +}; + +static inline +struct hfi1_netdev_priv *hfi1_netdev_priv(struct net_device *dev) +{ + return (struct hfi1_netdev_priv *)&dev[1]; +} + +static inline +int hfi1_netdev_ctxt_count(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return priv->num_rx_q; +} + +static inline +struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return priv->rxq[ctxt].rcd; +} + +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data); +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id); + +#endif /* HFI1_NETDEV_H */ diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c new file mode 100644 index 0000000..3e286cb --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for netdev RX functionality + */ + +#include "sdma.h" +#include "verbs.h" +#include "netdev.h" +#include "hfi.h" + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <rdma/ib_verbs.h> + +/** + * hfi1_netdev_add_data - Registers data with unique identifier + * to be requested later this is needed for VNIC and IPoIB VLANs + * implementations. + * This call is protected by mutex idr_lock. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + * @data: data to be associated with index + */ +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_insert(&priv->dev_tbl, id, data, GFP_NOWAIT); +} + +/** + * hfi1_netdev_remove_data - Removes data with previously given id. + * Returns the reference to removed entry. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_erase(&priv->dev_tbl, id); +} + +/** + * hfi1_netdev_get_data - Gets data with given id + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_load(&priv->dev_tbl, id); +} + +/** + * hfi1_netdev_get_first_dat - Gets first entry with greater or equal id. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + unsigned long index = *start_id; + void *ret; + + ret = xa_find(&priv->dev_tbl, &index, UINT_MAX, XA_PRESENT); + *start_id = (int)index; + return ret; +}