From: Kaike Wan <kaike.wan@xxxxxxxxx> The hfi1 hardware flow is a hardware flow-control mechanism for a KDETH data packet that is received on a hfi1 port. It validates the packet by checking both the generation and sequence. Each QP that uses the TID RDMA mechanism will allocate a hardware flow from its receiving context for any incoming KDETH data packets. This patch implements: (1) functions to allocate hardware flow (2) functions to free hardware flow (3) a function to initialize hardware flow generation for a receiving context (4) functions to allocate software flows for requests when qpriv is initialized. Software flows are used to track hardware flow information for each data segment of a TID RDMA request (5) functions to free software flows for requests when qpriv is freed Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@xxxxxxxxx> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/init.c | 11 +- drivers/infiniband/hw/hfi1/tid_rdma.c | 238 +++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 758d273..bdd2ad6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -67,7 +67,7 @@ #include "aspm.h" #include "affinity.h" #include "vnic.h" -#include "exp_rcv.h" +#include "user_exp_rcv.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -369,6 +369,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -471,6 +474,9 @@ 
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, GFP_KERNEL, numa); if (!rcd->opstats) goto bail; + + /* Initialize TID flow generations for the context */ + hfi1_kern_init_ctxt_generations(rcd); } *context = rcd; @@ -773,6 +779,8 @@ static void enable_chip(struct hfi1_devdata *dd) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; hfi1_rcvctrl(dd, rcvmask, rcd); sc_enable(rcd->sc); hfi1_rcd_put(rcd); @@ -1474,6 +1482,7 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index aefa23c..c8e2c40 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -53,6 +53,7 @@ #include "tid_rdma.h" #include "user_exp_rcv.h" #include "trace.h" +#include <rdma/ib_umem.h> /** * DOC: TID RDMA READ protocol */ @@ -149,6 +150,8 @@ static u32 mask_generation(u32 a) * C - Capcode */ +static u32 tid_rdma_flow_wt; + static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, u32 psn, u16 *fidx) { @@ -319,14 +322,161 @@ static void tid_rdma_trigger_resume(struct work_struct *work) { } +void hfi1_compute_tid_rdma_flow_wt(void) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. Rather than a computationally expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points), assuming PMTU of 4K. The rationale + * for this is that flows are released and recycled at each sync point. 
+ */ + tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / + TID_RDMA_MAX_SEGMENT_SIZE; +} + +/** + * kern_reserve_flow - allocate a hardware flow + * @rcd - the context to use for allocation + * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to + * signify "don't care". + * + * Use a bit mask based allocation to reserve a hardware + * flow for use in receiving KDETH data packets. If a preferred flow is + * specified the function will attempt to reserve that flow again, if + * available. + * + * The exp_lock must be held. + * + * Return: + * On success: a positive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On failure: -EAGAIN + */ +static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) + __must_hold(&rcd->exp_lock) +{ + int nr; + + /* Attempt to reserve the preferred flow index */ + if (last >= 0 && last < RXE_NUM_TID_FLOWS && + !test_and_set_bit(last, &rcd->flow_mask)) + return last; + + nr = ffz(rcd->flow_mask); + BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= + (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); + if (nr > (RXE_NUM_TID_FLOWS - 1)) + return -EAGAIN; + set_bit(nr, &rcd->flow_mask); + return nr; +} + +static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, + u32 flow_idx) +{ + u64 reg; + + reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | + RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | + RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; + + if (generation != KERN_GENERATION_RESERVED) + reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; + + write_uctxt_csr(rcd->dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); +} + +static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + u32 generation = rcd->flows[flow_idx].generation; + + kern_set_hw_flow(rcd, generation, flow_idx); + return generation; +} + +static u32 
kern_flow_generation_next(u32 gen) +{ + u32 generation = mask_generation(gen + 1); + + if (generation == KERN_GENERATION_RESERVED) + generation = mask_generation(generation + 1); + return generation; +} + +static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + rcd->flows[flow_idx].generation = + kern_flow_generation_next(rcd->flows[flow_idx].generation); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); +} + static int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) { + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + unsigned long flags; + int ret = 0; + + /* The QP already has an allocated flow */ + if (fs->index != RXE_NUM_TID_FLOWS) + return ret; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + ret = kern_reserve_flow(rcd, fs->last_index); + if (ret < 0) + goto queue; + fs->index = ret; + fs->last_index = fs->index; + + /* Generation received in a RESYNC overrides default flow generation */ + if (fs->generation != KERN_GENERATION_RESERVED) + rcd->flows[fs->index].generation = fs->generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + fs->psn = 0; + fs->flags = 0; + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return 0; +queue: + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; } void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) { + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + unsigned long flags; + + if (fs->index >= RXE_NUM_TID_FLOWS) + return; + spin_lock_irqsave(&rcd->exp_lock, flags); + kern_clear_hw_flow(rcd, fs->index); + clear_bit(fs->index, &rcd->flow_mask); + fs->index = RXE_NUM_TID_FLOWS; + fs->psn = 0; + fs->generation = KERN_GENERATION_RESERVED; + + spin_unlock_irqrestore(&rcd->exp_lock, flags); +} + +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) +{ + int i; 
+ + for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { + rcd->flows[i].generation = mask_generation(prandom_u32()); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); + } } static int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, @@ -346,6 +496,71 @@ void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) { } +static void hfi1_kern_exp_rcv_dealloc(struct tid_rdma_flow *flow) +{ + kfree(flow->fstate); + flow->fstate = NULL; +} + +static int hfi1_kern_exp_rcv_alloc(struct tid_rdma_flow *flow) +{ + flow->fstate = kzalloc(sizeof(*flow->fstate), GFP_ATOMIC); + if (!flow->fstate) + goto nomem; + + return 0; +nomem: + hfi1_kern_exp_rcv_dealloc(flow); + return -ENOMEM; +} + +/* Called at QP destroy time to free TID RDMA resources */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + int i; + + for (i = 0; req->flows && i < req->n_max_flows; i++) + hfi1_kern_exp_rcv_dealloc(&req->flows[i]); + + kfree(req->flows); + req->flows = NULL; + req->n_max_flows = 0; + req->n_flows = 0; +} + +/* + * This is called at QP create time to allocate resources for TID RDMA + * segments/flows. This is done to keep all required memory pre-allocated and + * avoid memory allocation in the data path. + */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req) +{ + struct tid_rdma_flow *flows; + int i, ret; + u16 nflows; + + /* Size of the flow circular buffer is the next higher power of 2 */ + nflows = max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, + TID_RDMA_MAX_WRITE_SEGS_PER_REQ); + req->n_max_flows = roundup_pow_of_two(nflows + 1); + flows = kcalloc(req->n_max_flows, sizeof(*flows), GFP_KERNEL); + if (!flows) { + ret = -ENOMEM; + goto err; + } + req->flows = flows; + + for (i = 0; i < req->n_max_flows; i++) { + ret = hfi1_kern_exp_rcv_alloc(&req->flows[i]); + if (ret) + goto err; + } + return 0; +err: + hfi1_kern_exp_rcv_free_flows(req); + return ret; +} + /* * Validate and accept the TID RDMA READ request parameters. 
* Return 0 if the request is accepted successfully; @@ -884,9 +1099,14 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; - int i; + int i, ret; + BUILD_BUG_ON(TID_RDMA_MAX_SEGMENT_SIZE / PAGE_SIZE > U8_MAX); qpriv->rcd = qp_to_rcd(rdi, qp); + qpriv->flow_state.psn = 0; + qpriv->flow_state.index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.generation = KERN_GENERATION_RESERVED; spin_lock_init(&qpriv->opfn.lock); INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); @@ -912,6 +1132,12 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, if (!priv) return -ENOMEM; + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req); + if (ret) { + kfree(priv); + return ret; + } + /* * Initialize various TID RDMA request variables. * These variables are "static", which is why they @@ -934,6 +1160,12 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, if (!priv) return -ENOMEM; + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req); + if (ret) { + kfree(priv); + return ret; + } + priv->tid_req.qp = qp; priv->tid_req.rcd = qpriv->rcd; priv->tid_req.e.ack = &qp->s_ack_queue[i]; @@ -956,12 +1188,16 @@ void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) wqe = rvt_get_swqe_ptr(qp, i); priv = wqe->priv; + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); kfree(priv); wqe->priv = NULL; } for (i = 0; i < rvt_max_atomic(rdi); i++) { struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); kfree(priv); qp->s_ack_queue[i].priv = NULL; } -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html