From: Kaike Wan <kaike.wan@xxxxxxxxx> TID entries are used by the hfi1 hardware to receive the data payload of incoming packets directly into a user buffer, thus avoiding data copying by software. This patch implements the functions to allocate and free TIDs and to program TID RcvArray entries in hardware for kernel clients. TID entries are managed via lists of TID groups, similar to PSM. Furthermore, to track TID resource allocation for each request, software flows are also allocated and freed as needed. Since software flows consume a large amount of memory for tracking TID allocation and freeing, they are allocated dynamically on the send queue, and only for TID RDMA requests, but are pre-allocated for the receive queue, because the send queue could have thousands of entries while the receive queue has only a limited number of entries. Signed-off-by: Mitko Haralanov <mitko.haralanov@xxxxxxxxx> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@xxxxxxxxx> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Kaike Wan <kaike.wan@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/hfi.h | 2 drivers/infiniband/hw/hfi1/init.c | 3 drivers/infiniband/hw/hfi1/tid_rdma.c | 877 +++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 101 +++ drivers/infiniband/hw/hfi1/user_exp_rcv.h | 1 drivers/infiniband/hw/hfi1/verbs.c | 29 + drivers/infiniband/hw/hfi1/verbs.h | 34 + drivers/infiniband/sw/rdmavt/qp.c | 2 include/rdma/rdmavt_qp.h | 2 9 files changed, 1033 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 78aa344..1412ed1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -303,6 +303,8 @@ struct hfi1_ctxtdata { spinlock_t exp_lock; /* Queue for QP's waiting for HW TID flows */ struct tid_queue flow_queue; + /* Queue for QP's waiting for HW receive array entries */ + struct tid_queue rarr_queue; /* when waiting for rcv or pioavail */ wait_queue_head_t wait; /* uuid from PSM */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index da970ed..9aa28f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -372,6 +372,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, mutex_init(&rcd->exp_mutex); spin_lock_init(&rcd->exp_lock); INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -1596,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; if (rcd) { - hfi1_clear_tids(rcd); + hfi1_free_ctxt_rcv_groups(rcd); hfi1_free_ctxt(rcd); } } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 88bf208..f5eba5d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -8,6 +8,7 @@ #include "qp.h" #include "verbs.h" #include "tid_rdma.h" +#include "exp_rcv.h" #include "trace.h" #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) @@ -35,8 +36,14 @@ static u32 mask_generation(u32 a) #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) +/* Maximum number of segments in flight per QP request.
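 * MAX_FLOWS below rounds the larger of the two limits, plus one, up to a power of two so that the per-request flow array can be managed as a circular buffer (setup_head/clear_tail) with a simple mask.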
*/ #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 +#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ + TID_RDMA_MAX_WRITE_SEGS_PER_REQ) +#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) + +#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 @@ -79,6 +86,11 @@ static u32 mask_generation(u32 a) */ static void tid_rdma_trigger_resume(struct work_struct *work); +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp); +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -230,7 +242,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); rcd->jkey = TID_RDMA_JKEY; hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); - return 0; + return hfi1_alloc_ctxt_rcv_groups(rcd); } /** @@ -266,6 +278,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; + int i, ret; qpriv->rcd = qp_to_rcd(rdi, qp); @@ -278,15 +291,75 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.generation = KERN_GENERATION_RESERVED; INIT_LIST_HEAD(&qpriv->tid_wait); + if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct hfi1_devdata *dd = qpriv->rcd->dd; + + qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * + sizeof(*qpriv->pages), + GFP_KERNEL, dd->node); + if (!qpriv->pages) + return -ENOMEM; + for (i = 0; i < qp->s_size; i++) { + struct hfi1_swqe_priv *priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.swqe = wqe; + wqe->priv = priv; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.ack = &qp->s_ack_queue[i]; + + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, + GFP_KERNEL); + if (ret) { + kfree(priv); + return ret; + } + qp->s_ack_queue[i].priv = priv; + } + } + return 0; } void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { - struct hfi1_qp_priv *priv = qp->priv; - - if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) - cancel_work_sync(&priv->opfn.opfn_work); + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + u32 i; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + for (i = 0; i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->priv); + wqe->priv = NULL; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); + kfree(priv); + qp->s_ack_queue[i].priv = NULL; + } + cancel_work_sync(&qpriv->opfn.opfn_work); + kfree(qpriv->pages); + qpriv->pages = NULL; + } } /* Flow and tid waiter functions */ @@ -540,6 +613,7 @@ void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); + _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); } /* Flow functions */ @@ -702,3 +776,796 @@ void 
hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); } } + +/* TID allocation functions */ +static u8 trdma_pset_order(struct tid_rdma_pageset *s) +{ + u8 count = s->count; + + return ilog2(count) + 1; +} + +/** + * tid_rdma_find_phys_blocks_4k - get groups base on mr info + * @npages - number of pages + * @pages - pointer to an array of page structs + * @list - page set array to return + * + * This routine returns the number of groups associated with + * the current sge information. This implementation is based + * on the expected receive find_phys_blocks() adjusted to + * use the MR information vs. the pfn. + * + * Return: + * the number of RcvArray entries + */ +static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 pagecount, pageidx, setcount = 0, i; + void *vaddr, *this_vaddr; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + vaddr = page_address(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_vaddr = i < npages ? page_address(pages[i]) : NULL; + /* + * If the vaddr's are not sequential, pages are not physically + * contiguous. + */ + if (this_vaddr != (vaddr + PAGE_SIZE)) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + vaddr = this_vaddr; + } else { + vaddr += PAGE_SIZE; + pagecount++; + } + } + /* insure we always return an even number of sets */ + if (setcount & 1) + list[setcount++].count = 0; + return setcount; +} + +/** + * tid_flush_pages - dump out pages into pagesets + * @list - list of pagesets + * @idx - pointer to current page index + * @pages - number of pages to dump + * @sets - current number of pagesset + * + * This routine flushes out accumuated pages. + * + * To insure an even number of sets the + * code may add a filler. + * + * This can happen with when pages is not + * a power of 2 or pages is a power of 2 + * less than the maximum pages. 
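 * For example, seven contiguous pages flush as 4-, 2- and 1-page sets, and a zero-count filler set is then appended to make the set count even.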
+ * + * Return: + * The new number of sets + */ + +static u32 tid_flush_pages(struct tid_rdma_pageset *list, + u32 *idx, u32 pages, u32 sets) +{ + while (pages) { + u32 maxpages = pages; + + if (maxpages > MAX_EXPECTED_PAGES) + maxpages = MAX_EXPECTED_PAGES; + else if (!is_power_of_2(maxpages)) + maxpages = rounddown_pow_of_two(maxpages); + list[sets].idx = *idx; + list[sets++].count = maxpages; + *idx += maxpages; + pages -= maxpages; + } + /* might need a filler */ + if (sets & 1) + list[sets++].count = 0; + return sets; +} + +/** + * tid_rdma_find_phys_blocks_8k - get groups base on mr info + * @pages - pointer to an array of page structs + * @npages - number of pages + * @list - page set array to return + * + * This routine parses an array of pages to compute pagesets + * in an 8k compatible way. + * + * pages are tested two at a time, i, i + 1 for contiguous + * pages and i - 1 and i contiguous pages. + * + * If any condition is false, any accumlated pages are flushed and + * v0,v1 are emitted as separate PAGE_SIZE pagesets + * + * Otherwise, the current 8k is totaled for a future flush. + * + * Return: + * The number of pagesets + * list set with the returned number of pagesets + * + */ +static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 idx, sets = 0, i; + u32 pagecnt = 0; + void *v0, *v1, *vm1; + + if (!npages) + return 0; + for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { + /* get a new v0 */ + v0 = page_address(pages[i]); + v1 = i + 1 < npages ? + page_address(pages[i + 1]) : NULL; + /* compare i, i + 1 vaddr */ + if (v1 != (v0 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + /* output v0,v1 as two pagesets */ + list[sets].idx = idx++; + list[sets++].count = 1; + if (v1) { + list[sets].count = 1; + list[sets++].idx = idx++; + } else { + list[sets++].count = 0; + } + vm1 = NULL; + pagecnt = 0; + continue; + } + /* i,i+1 consecutive, look at i-1,i */ + if (vm1 && v0 != (vm1 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + pagecnt = 0; + } + /* pages will always be a multiple of 8k */ + pagecnt += 2; + /* save i-1 */ + vm1 = v1; + /* move to next pair */ + } + /* dump residual pages at end */ + sets = tid_flush_pages(list, &idx, npages - idx, sets); + /* by design cannot be odd sets */ + WARN_ON(sets & 1); + return sets; +} + +/** + * Find pages for one segment of a sge array represented by @ss. The function + * does not check the sge, the sge must have been checked for alignment with a + * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of + * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge + * copy maintained in @ss->sge, the original sge is not modified. + * + * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not + * releasing the MR reference count at the same time. Otherwise, we'll "leak" + * references to the MR. This difference requires that we keep track of progress + * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request + * structure. 
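 * The pages[] array is supplied by the caller (qpriv->pages) and is sized for TID_RDMA_MAX_PAGES entries, i.e. one maximum-sized (TID_RDMA_MAX_SEGMENT_SIZE) segment.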
+ */ +static u32 kern_find_pages(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + struct tid_rdma_request *req = flow->req; + struct rvt_sge *sge = &ss->sge; + u32 length = flow->req->seg_len; + u32 len = PAGE_SIZE; + u32 i = 0; + + while (length && req->isge < ss->num_sge) { + pages[i++] = virt_to_page(sge->vaddr); + + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (!sge->sge_length) { + if (++req->isge < ss->num_sge) + *sge = ss->sg_list[req->isge - 1]; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + ++sge->m; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + + flow->length = flow->req->seg_len - length; + *last = req->isge == ss->num_sge ? false : true; + return i; +} + +static void dma_unmap_flow(struct tid_rdma_flow *flow) +{ + struct hfi1_devdata *dd; + int i; + struct tid_rdma_pageset *pset; + + dd = flow->req->rcd->dd; + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count && pset->addr) { + dma_unmap_page(&dd->pcidev->dev, + pset->addr, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + pset->mapped = 0; + } + } +} + +static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) +{ + int i; + struct hfi1_devdata *dd = flow->req->rcd->dd; + struct tid_rdma_pageset *pset; + + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count) { + pset->addr = dma_map_page(&dd->pcidev->dev, + pages[pset->idx], + 0, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + + if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { + dma_unmap_flow(flow); + return -ENOMEM; + } + pset->mapped = 1; + } + } + return 0; +} + +static inline bool dma_mapped(struct tid_rdma_flow *flow) +{ + return !!flow->pagesets[0].mapped; +} + +/* + * Get pages pointers and identify contiguous physical memory chunks for a + * segment. All segments are of length flow->req->seg_len. + */ +static int kern_get_phys_blocks(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + u8 npages; + + /* Reuse previously computed pagesets, if any */ + if (flow->npagesets) { + if (!dma_mapped(flow)) + return dma_map_flow(flow, pages); + return 0; + } + + npages = kern_find_pages(flow, pages, ss, last); + + if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) + flow->npagesets = + tid_rdma_find_phys_blocks_4k(flow, pages, npages, + flow->pagesets); + else + flow->npagesets = + tid_rdma_find_phys_blocks_8k(flow, pages, npages, + flow->pagesets); + + return dma_map_flow(flow, pages); +} + +static inline void kern_add_tid_node(struct tid_rdma_flow *flow, + struct hfi1_ctxtdata *rcd, char *s, + struct tid_group *grp, u8 cnt) +{ + struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; + + WARN_ON_ONCE(flow->tnode_cnt >= + (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); + if (WARN_ON_ONCE(cnt & 1)) + dd_dev_err(rcd->dd, + "unexpected odd allocation cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + + node->grp = grp; + node->map = grp->map; + node->cnt = cnt; +} + +/* + * Try to allocate pageset_count TID's from TID groups for a context + * + * This function allocates TID's without moving groups between lists or + * modifying grp->map. This is done as follows, being cogizant of the lists + * between which the TID groups will move: + * 1. 
First allocate complete groups of 8 TID's since this is more efficient, + * these groups will move from group->full without affecting used + * 2. If more TID's are needed allocate from used (will move from used->full or + * stay in used) + * 3. If we still don't have the required number of TID's go back and look again + * at a complete group (will move from group->used) + */ +static int kern_alloc_tids(struct tid_rdma_flow *flow) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + u32 ngroups, pageidx = 0; + struct tid_group *group = NULL, *used; + u8 use; + + flow->tnode_cnt = 0; + ngroups = flow->npagesets / dd->rcv_entries.group_size; + if (!ngroups) + goto used_list; + + /* First look at complete groups */ + list_for_each_entry(group, &rcd->tid_group_list.list, list) { + kern_add_tid_node(flow, rcd, "complete groups", group, + group->size); + + pageidx += group->size; + if (!--ngroups) + break; + } + + if (pageidx >= flow->npagesets) + goto ok; + +used_list: + /* Now look at partially used groups */ + list_for_each_entry(used, &rcd->tid_used_list.list, list) { + use = min_t(u32, flow->npagesets - pageidx, + used->size - used->used); + kern_add_tid_node(flow, rcd, "used groups", used, use); + + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; + } + + /* + * Look again at a complete group, continuing from where we left. + * However, if we are at the head, we have reached the end of the + * complete groups list from the first loop above + */ + if (group && &group->list == &rcd->tid_group_list.list) + goto bail_eagain; + group = list_prepare_entry(group, &rcd->tid_group_list.list, + list); + if (list_is_last(&group->list, &rcd->tid_group_list.list)) + goto bail_eagain; + group = list_next_entry(group, list); + use = min_t(u32, flow->npagesets - pageidx, group->size); + kern_add_tid_node(flow, rcd, "complete continue", group, use); + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; +bail_eagain: + return -EAGAIN; +ok: + return 0; +} + +static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, + u32 *pset_idx) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + struct tid_rdma_pageset *pset; + u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; + u32 rcventry, npages = 0, pair = 0, tidctrl; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + pset = &flow->pagesets[(*pset_idx)++]; + if (pset->count) { + hfi1_put_tid(dd, rcventry, PT_EXPECTED, + pset->addr, trdma_pset_order(pset)); + } else { + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + } + npages += pset->count; + + rcventry -= rcd->expected_base; + tidctrl = pair ? 0x3 : rcventry & 0x1 ? 
0x2 : 0x1; + /* + * A single TID entry will be used to use a rcvarr pair (with + * tidctrl 0x3), if ALL these are true (a) the bit pos is even + * (b) the group map shows current and the next bits as free + * indicating two consecutive rcvarry entries are available (c) + * we actually need 2 more entries + */ + pair = !(i & 0x1) && !((node->map >> i) & 0x3) && + node->cnt >= cnt + 2; + if (!pair) { + if (!pset->count) + tidctrl = 0x1; + flow->tid_entry[flow->tidcnt++] = + EXP_TID_SET(IDX, rcventry >> 1) | + EXP_TID_SET(CTRL, tidctrl) | + EXP_TID_SET(LEN, npages); + /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ + flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); + npages = 0; + } + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_full_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_group_list, + &rcd->tid_used_list); + + grp->used++; + grp->map |= BIT(i); + cnt++; + } +} + +static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + u32 rcventry; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + + grp->used--; + grp->map &= ~BIT(i); + cnt++; + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_full_list, + &rcd->tid_used_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_group_list); + } + if (WARN_ON_ONCE(cnt & 1)) { + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + + dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + } +} + +static void kern_program_rcvarray(struct tid_rdma_flow *flow) +{ + u32 pset_idx = 0; + int i; + + flow->npkts = 0; + flow->tidcnt = 0; + for (i = 0; i < flow->tnode_cnt; i++) + kern_program_rcv_group(flow, i, &pset_idx); +} + +/** + * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a + * TID RDMA request + * + * @req: TID RDMA request for which the segment/flow is being set up + * @ss: sge state, maintains state across successive segments of a sge + * @last: set to true after the last sge segment has been processed + * + * This function + * (1) finds a free flow entry in the flow circular buffer + * (2) finds pages and continuous physical chunks constituing one segment + * of an sge + * (3) allocates TID group entries for those chunks + * (4) programs rcvarray entries in the hardware corresponding to those + * TID's + * (5) computes a tidarray with formatted TID entries which can be sent + * to the sender + * (6) Reserves and programs HW flows. + * (7) It also manages queing the QP when TID/flow resources are not + * available. + * + * @req points to struct tid_rdma_request of which the segments are a part. The + * function uses qp, rcd and seg_len members of @req. In the absence of errors, + * req->flow_idx is the index of the flow which has been prepared in this + * invocation of function call. With flow = &req->flows[req->flow_idx], + * flow->tid_entry contains the TID array which the sender can use for TID RDMA + * sends and flow->npkts contains number of packets required to send the + * segment. 
+ * + * hfi1_check_sge_align should be called prior to calling this function and if + * it signals error TID RDMA cannot be used for this sge and this function + * should not be called. + * + * For the queuing, caller must hold the flow->req->qp s_lock from the send + * engine and the function will procure the exp_lock. + * + * Return: + * The function returns -EAGAIN if sufficient number of TID/flow resources to + * map the segment could not be allocated. In this case the function should be + * called again with previous arguments to retry the TID allocation. There are + * no other error returns. The function returns 0 on success. + */ +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->setup_head]; + struct hfi1_ctxtdata *rcd = req->rcd; + struct hfi1_qp_priv *qpriv = req->qp->priv; + unsigned long flags; + struct rvt_qp *fqp; + u16 clear_tail = req->clear_tail; + + lockdep_assert_held(&req->qp->s_lock); + /* + * We return error if either (a) we don't have space in the flow + * circular buffer, or (b) we already have max entries in the buffer. + * Max entries depend on the type of request we are processing and the + * negotiated TID RDMA parameters. + */ + if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || + CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= + req->n_flows) + return -EINVAL; + + /* + * Get pages, identify contiguous physical memory chunks for the segment + * If we can not determine a DMA address mapping we will treat it just + * like if we ran out of space above. + */ + if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { + hfi1_wait_kmem(flow->req->qp); + return -ENOMEM; + } + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) + goto queue; + + /* + * At this point we know the number of pagesets and hence the number of + * TID's to map the segment. Allocate the TID's from the TID groups. If + * we cannot allocate the required number we exit and try again later + */ + if (kern_alloc_tids(flow)) + goto queue; + /* + * Finally program the TID entries with the pagesets, compute the + * tidarray and enable the HW flow + */ + kern_program_rcvarray(flow); + + /* + * Setup the flow state with relevant information. + * This information is used for tracking the sequence of data packets + * for the segment. + * The flow is setup here as this is the most accurate time and place + * to do so. Doing at a later time runs the risk of the flow data in + * qpriv getting out of sync. 
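 * Note that spsn and lpsn below are PSNs in the TID (KDETH) flow space, derived from the per-context HW flow state, not IB PSNs.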
+ */ + memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); + flow->idx = qpriv->flow_state.index; + flow->flow_state.generation = qpriv->flow_state.generation; + flow->flow_state.spsn = qpriv->flow_state.psn; + flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, flow->flow_state.spsn); + qpriv->flow_state.psn += flow->npkts; + + dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + tid_rdma_schedule_tid_wakeup(fqp); + + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) +{ + flow->npagesets = 0; +} + +/* + * This function is called after one segment has been successfully sent to + * release the flow and TID HW/SW resources for that segment. The segments for a + * TID RDMA request are setup and cleared in FIFO order which is managed using a + * circular buffer. + */ +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct hfi1_ctxtdata *rcd = req->rcd; + unsigned long flags; + int i; + struct rvt_qp *fqp; + + lockdep_assert_held(&req->qp->s_lock); + /* Exit if we have nothing in the flow circular buffer */ + if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) + return -EINVAL; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + for (i = 0; i < flow->tnode_cnt; i++) + kern_unprogram_rcv_group(flow, i); + /* To prevent double unprogramming */ + flow->tnode_cnt = 0; + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + dma_unmap_flow(flow); + + hfi1_tid_rdma_reset_flow(flow); + req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); + + if (fqp == req->qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } + + return 0; +} + +/* + * This function is called to release all the tid entries for + * a request. + */ +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + /* Use memory barrier for proper ordering */ + while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { + if (hfi1_kern_exp_rcv_clear(req)) + break; + } +} + +/** + * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information + * @req - the tid rdma request to be cleaned + */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + kfree(req->flows); + req->flows = NULL; +} + +/** + * __trdma_clean_swqe - clean up for large sized QPs + * @qp: the queue patch + * @wqe: the send wqe + */ +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_swqe_priv *p = wqe->priv; + + hfi1_kern_exp_rcv_free_flows(&p->tid_req); +} + +/* + * This can be called at QP create time or in the data path. 
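 * QP creation passes GFP_KERNEL; the gfp argument lets a data-path caller use a stricter allocation mask.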
+ */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp) +{ + struct tid_rdma_flow *flows; + int i; + + if (likely(req->flows)) + return 0; + flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, + req->rcd->numa_id); + if (!flows) + return -ENOMEM; + /* mini init */ + for (i = 0; i < MAX_FLOWS; i++) { + flows[i].req = req; + flows[i].npagesets = 0; + flows[i].pagesets[0].mapped = 0; + } + req->flows = flows; + return 0; +} + +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + /* + * Initialize various TID RDMA request variables. + * These variables are "static", which is why they + * can be pre-initialized here before the WRs has + * even been submitted. + * However, non-NULL values for these variables do not + * imply that this WQE has been enabled for TID RDMA. + * Drivers should check the WQE's opcode to determine + * if a request is a TID RDMA one or not. + */ + req->qp = qp; + req->rcd = qpriv->rcd; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3bc0aaf..524baf8 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -6,7 +6,16 @@ #ifndef HFI1_TID_RDMA_H #define HFI1_TID_RDMA_H +#include <linux/circ_buf.h> +#include "common.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ +#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) struct tid_rdma_params { struct rcu_head rcu_head; @@ -36,6 +45,81 @@ struct tid_flow_state { u8 flags; }; +struct tid_rdma_request { + struct rvt_qp *qp; + struct hfi1_ctxtdata *rcd; + union { + struct rvt_swqe *swqe; + struct rvt_ack_entry *ack; + } e; + + struct tid_rdma_flow *flows; /* array of tid flows */ + u16 n_flows; /* size of the flow buffer window */ + u16 setup_head; /* flow index we are setting up */ + u16 clear_tail; /* flow index we are clearing */ + u16 flow_idx; /* flow index most recently set up */ + + u32 seg_len; + + u32 isge; /* index of "current" sge */ +}; + +/* + * When header suppression is used, PSNs associated with a "flow" are + * relevant (and not the PSNs maintained by verbs). Track per-flow + * PSNs here for a TID RDMA segment. + * + */ +struct flow_state { + u32 flags; + u32 resp_ib_psn; /* The IB PSN of the response for this flow */ + u32 generation; /* generation of flow */ + u32 spsn; /* starting PSN in TID space */ + u32 lpsn; /* last PSN in TID space */ + u32 r_next_psn; /* next PSN to be received (in TID space) */ +}; + +struct tid_rdma_pageset { + dma_addr_t addr : 48; /* Only needed for the first page */ + u8 idx: 8; + u8 count : 7; + u8 mapped: 1; +}; + +/** + * kern_tid_node - used for managing TID's in TID groups + * + * @grp_idx: rcd relative index to tid_group + * @map: grp->map captured prior to programming this TID group in HW + * @cnt: Only @cnt of available group entries are actually programmed + */ +struct kern_tid_node { + struct tid_group *grp; + u8 map; + u8 cnt; +}; + +/* Overall info for a TID RDMA segment */ +struct tid_rdma_flow { + /* + * While a TID RDMA segment is being transferred, it uses a QP number + * from the "KDETH section of QP numbers" (which is different from the + * QP number that originated the request). 
Bits 11-15 of these QP + * numbers identify the "TID flow" for the segment. + */ + struct flow_state flow_state; + struct tid_rdma_request *req; + u32 length; + u8 tnode_cnt; + u8 tidcnt; + u8 idx; + u8 npagesets; + u8 npkts; + struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; + struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; + u32 tid_entry[TID_RDMA_MAX_PAGES]; +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -43,6 +127,23 @@ struct tid_flow_state { void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last); +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req); +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req); +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +/** + * trdma_clean_swqe - clean flows for swqe if large send queue + * @qp: the qp + * @wqe: the send wqe + */ +static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + if (!wqe->priv) + return; + __trdma_clean_swqe(qp, wqe); +} int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index e383cc0..43b105d 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -48,7 +48,6 @@ */ #include "hfi.h" - #include "exp_rcv.h" struct tid_pageset { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 571bfd5..02c1873 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -504,11 +504,28 @@ static void verbs_sdma_complete( hfi1_put_txreq(tx); } +void hfi1_wait_kmem(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct ib_device *ibdev = ibqp->device; + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } +} + static int wait_kmem(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { - struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; @@ -517,15 +534,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, &ps->wait->tx_head); - if (list_empty(&priv->s_iowait.list)) { - if (list_empty(&dev->memwait)) - mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= RVT_S_WAIT_KMEM; - list_add_tail(&priv->s_iowait.list, &dev->memwait); - priv->s_iowait.lock = &dev->iowait_lock; - trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); - rvt_get_qp(qp); - } + hfi1_wait_kmem(qp); write_sequnlock(&dev->iowait_lock); hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 9065e47..94f198b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -159,6 +159,7 @@ struct hfi1_qp_priv { struct sdma_engine *s_sde; /* current sde */ struct send_context *s_sendcontext; /* current 
sendcontext */ struct hfi1_ctxtdata *rcd; /* QP's receive context */ + struct page **pages; /* for TID page scan */ u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; @@ -173,6 +174,14 @@ struct hfi1_qp_priv { u8 timeout_shift; /* account for number of packets per segment */ }; +struct hfi1_swqe_priv { + struct tid_rdma_request tid_req; +}; + +struct hfi1_ack_priv { + struct tid_rdma_request tid_req; +}; + /* * This structure is used to hold commonly lookedup and computed values during * the send engine progress. @@ -321,6 +330,21 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +/* + * Look through all the active flows for a TID RDMA request and find + * the one (if it exists) that contains the specified PSN. + */ +static inline u32 __full_flow_psn(struct flow_state *state, u32 psn) +{ + return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + (psn & HFI1_KDETH_BTH_SEQ_MASK)); +} + +static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn) +{ + return __full_flow_psn(&flow->flow_state, psn); +} + struct verbs_txreq; void hfi1_put_txreq(struct verbs_txreq *tx); @@ -403,6 +427,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); } +void hfi1_wait_kmem(struct rvt_qp *qp); + +static inline void hfi1_trdma_send_complete(struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + trdma_clean_swqe(qp, wqe); + rvt_send_complete(qp, wqe, status); +} + extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; extern const u8 hdr_len_by_opcode[]; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 80c994c..9906e66 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp) kref_put(&qp->ip->ref, rvt_release_mmap_info); else vfree(qp->r_rq.wq); - vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + vfree(qp->s_wq); kfree(qp); return 0; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56a9221..9095a0b 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -174,6 +174,7 @@ struct rvt_swqe { u32 lpsn; /* last packet sequence number */ u32 ssn; /* send sequence number */ u32 length; /* total length of data in sg_list */ + void *priv; /* driver dependent field */ struct rvt_sge sg_list[0]; }; @@ -235,6 +236,7 @@ struct rvt_ack_entry { u32 lpsn; u8 opcode; u8 sent; + void *priv; }; #define RC_QP_SCALING_INTERVAL 5
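
For reference, the grouping policy shared by tid_rdma_find_phys_blocks_4k() and tid_flush_pages() (split each physically contiguous run of pages into chunks that are a power of two in size and no larger than the maximum expected-receive buffer) can be exercised outside the driver. The sketch below is a minimal user-space approximation and is not part of the patch; PAGE_SIZE and MAX_EXPECTED_BUFFER carry illustrative values, and flush_run() and rounddown_pow2() are stand-ins for the kernel helpers.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE           4096UL          /* illustrative */
#define MAX_EXPECTED_BUFFER (1024UL * 1024) /* illustrative per-entry limit */

static bool is_pow2(unsigned long x)
{
	return x && !(x & (x - 1));
}

static unsigned long rounddown_pow2(unsigned long x)
{
	while (!is_pow2(x))
		x &= x - 1;	/* clear the lowest set bit until one remains */
	return x;
}

/* Split a contiguous run of @pagecount pages into HW-friendly pagesets. */
static void flush_run(unsigned long pageidx, unsigned long pagecount)
{
	while (pagecount) {
		unsigned long maxpages = pagecount;
		unsigned long bufsize = pagecount * PAGE_SIZE;

		if (bufsize > MAX_EXPECTED_BUFFER)
			maxpages = MAX_EXPECTED_BUFFER / PAGE_SIZE;
		else if (!is_pow2(bufsize))
			maxpages = rounddown_pow2(bufsize) / PAGE_SIZE;

		printf("pageset: idx %lu count %lu\n", pageidx, maxpages);
		pageidx += maxpages;
		pagecount -= maxpages;
	}
}

int main(void)
{
	/* A 300-page run splits into 256-, 32-, 8- and 4-page sets */
	flush_run(0, 300);
	return 0;
}

The even-set padding done by the driver (a zero-count set appended when the count is odd) is omitted here; it matters in the driver only because RcvArray entries are consumed in pairs.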