On Wed, 2008-08-13 at 11:18 -0500, Tom Tucker wrote:
> Use FRMR when registering client memory if the memory registration
> strategy is FRMR.
>
> Signed-off-by: Tom Tucker <tom@xxxxxxxxxxxxxxxxxxxxx>
>
> ---
>  net/sunrpc/xprtrdma/verbs.c |  296 ++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 263 insertions(+), 33 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 8ea283e..ed401f9 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -423,6 +423,7 @@ rpcrdma_clean_cq(struct ib_cq *cq)
>  int
>  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>  {
> +	struct ib_device_attr devattr;
>  	int rc;
>  	struct rpcrdma_ia *ia = &xprt->rx_ia;
>
> @@ -443,6 +444,49 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>  	}
>
>  	/*
> +	 * Query the device to determine if the requested memory
> +	 * registration strategy is supported. If it isnt't set the
> +	 * strategy to a globally supported model.
> +	 */
> +	rc = ib_query_device(ia->ri_id->device, &devattr);
> +	if (rc) {
> +		dprintk("RPC: %s: ib_query_device failed %d\n",
> +			__func__, rc);
> +		goto out2;
> +	}
> +	switch (memreg) {
> +	case RPCRDMA_MEMWINDOWS:
> +	case RPCRDMA_MEMWINDOWS_ASYNC:
> +		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
> +			dprintk("RPC: %s: MEMWINDOWS specified but not "
> +				"supported, using RPCRDMA_ALLPHYSICAL",
> +				__func__);
> +			memreg = RPCRDMA_ALLPHYSICAL;
> +		}
> +		break;
> +	case RPCRDMA_MTHCAFMR:
> +		if (!ia->ri_id->device->alloc_fmr) {
> +			dprintk("RPC: %s: MTHCAFMR specified but not "
> +				"supported, using RPCRDMA_ALLPHYSICAL",
> +				__func__);
> +			memreg = RPCRDMA_ALLPHYSICAL;
> +		}
> +		break;
> +	case RPCRDMA_FASTREG:
> +		/* Requires both fast reg and global dma lkey */
> +		if ((0 ==
> +		     (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) ||
> +		    (0 == (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))) {
> +			dprintk("RPC: %s: FASTREG specified but not "
> +				"supported, using RPCRDMA_ALLPHYSICAL",
> +				__func__);
> +			memreg = RPCRDMA_ALLPHYSICAL;
> +		}
> +		break;
> +	}
> +	dprintk("RPC: memory registration strategy is %d\n", memreg);
> +
> +	/*
>  	 * Optionally obtain an underlying physical identity mapping in
>  	 * order to do a memory window-based bind. This base registration
>  	 * is protected from remote access - that is enabled only by binding
> @@ -450,7 +494,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>  	 * revoked after the corresponding completion similar to a storage
>  	 * adapter.
>  	 */
> -	if (memreg > RPCRDMA_REGISTER) {
> +	if ((memreg > RPCRDMA_REGISTER) && (memreg != RPCRDMA_FASTREG)) {
>  		int mem_priv = IB_ACCESS_LOCAL_WRITE;
>  		switch (memreg) {
>  #if RPCRDMA_PERSISTENT_REGISTRATION
> @@ -475,7 +519,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>  			memreg = RPCRDMA_REGISTER;
>  			ia->ri_bind_mem = NULL;
>  		}
> +		ia->ri_dma_lkey = ia->ri_bind_mem->lkey;
>  	}
> +	if (memreg == RPCRDMA_FASTREG)
> +		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
>
>  	/* Else will do memory reg/dereg for each chunk */
>  	ia->ri_memreg_strategy = memreg;
> @@ -541,6 +588,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>  	ep->rep_attr.srq = NULL;
>  	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>  	switch (ia->ri_memreg_strategy) {
> +	case RPCRDMA_FASTREG:
> +		/* Add room for fast reg and invalidate */
> +		ep->rep_attr.cap.max_send_wr *= 3;
> +		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
> +			return -EINVAL;
> +		break;
>  	case RPCRDMA_MEMWINDOWS_ASYNC:
>  	case RPCRDMA_MEMWINDOWS:
>  		/* Add room for mw_binds+unbinds - overkill! */
> @@ -623,6 +676,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>  		break;
>  	case RPCRDMA_MTHCAFMR:
>  	case RPCRDMA_REGISTER:
> +	case RPCRDMA_FASTREG:
>  		ep->rep_remote_cma.responder_resources = cdata->max_requests *
>  				(RPCRDMA_MAX_DATA_SEGS / 8);
>  		break;
> @@ -866,6 +920,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>
>  	buf->rb_max_requests = cdata->max_requests;
>  	spin_lock_init(&buf->rb_lock);
> +	spin_lock_init(&buf->rb_frs_lock);
>  	atomic_set(&buf->rb_credits, 1);
>
>  	/* Need to allocate:
> @@ -874,6 +929,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>  	 * 3. array of struct rpcrdma_rep for replies
>  	 * 4. padding, if any
>  	 * 5. mw's, if any
> +	 * 6. frmr's, if any
>  	 * Send/recv buffers in req/rep need to be registered
>  	 */
>
> @@ -881,6 +937,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>  		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
>  	len += cdata->padding;
>  	switch (ia->ri_memreg_strategy) {
> +	case RPCRDMA_FASTREG:
> +		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
> +				sizeof(struct rpcrdma_frmr);
> +		break;
>  	case RPCRDMA_MTHCAFMR:
>  		/* TBD we are perhaps overallocating here */
>  		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
> @@ -895,7 +955,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>  		break;
>  	}
>
> -	/* allocate 1, 4 and 5 in one shot */
> +	/* allocate 1, 4, 5 and 6 in one shot */
>  	p = kzalloc(len, GFP_KERNEL);
>  	if (p == NULL) {
>  		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
> @@ -927,7 +987,38 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>  	 * and also reduce unbind-to-bind collision.
>  	 */
>  	INIT_LIST_HEAD(&buf->rb_mws);
> +	INIT_LIST_HEAD(&buf->rb_frs);
>  	switch (ia->ri_memreg_strategy) {
> +	case RPCRDMA_FASTREG:
> +	{

Can we please get rid of this kind of ugliness, whereby we insert blocks
just in order to set up a temporary variable? I know Chuck is fond of it,
but as far as I'm concerned it just screws up readability of the code.

If you feel you need to isolate a variable to a particular block of code,
then make a function, and that's particularly true of these huge switch
statements that are trying to do 100 entirely unrelated things in one
function. Something along the lines of the (untested) helper sketched at
the end of this mail is what I have in mind.
> +		struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
> +		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
> +			fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
> +							 RPCRDMA_MAX_SEGS);
> +			if (IS_ERR(fr->fr_mr)) {
> +				rc = PTR_ERR(fr->fr_mr);
> +				printk("RPC: %s: ib_alloc_fast_reg_mr"
> +				       " failed %i\n", __func__, rc);
> +				goto out;
> +			}
> +			fr->fr_pgl =
> +				ib_alloc_fast_reg_page_list(ia->ri_id->device,
> +							    RPCRDMA_MAX_SEGS);
> +			if (IS_ERR(fr->fr_pgl)) {
> +				rc = PTR_ERR(fr->fr_pgl);
> +				printk("RPC: %s: "
> +				       "ib_alloc_fast_reg_page_list "
> +				       "failed %i\n", __func__, rc);
> +				goto out;
> +			}
> +			INIT_LIST_HEAD(&fr->fr_list);
> +			list_add(&fr->fr_list, &buf->rb_frs);
> +			dprintk("RPC: %s alloc fmr %p pgl %p\n", __func__,
> +				fr->fr_mr, fr->fr_pgl);
> +			++fr;
> +		}
> +	}

  ^urgh

> +		break;
>  	case RPCRDMA_MTHCAFMR:
>  		{
>  		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
> @@ -1056,6 +1147,49 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
>  	 */
>  	dprintk("RPC: %s: entering\n", __func__);
>
> +	while (!list_empty(&buf->rb_frs)) {
> +		struct rpcrdma_frmr *fr =
> +			list_entry(buf->rb_frs.next,
> +				   struct rpcrdma_frmr, fr_list);
> +		list_del(&fr->fr_list);
> +		rc = ib_dereg_mr(fr->fr_mr);
> +		if (rc)
> +			dprintk("RPC: %s:"
> +				" ib_dereg_mr"
> +				" failed %i\n",
> +				__func__, rc);
> +		ib_free_fast_reg_page_list(fr->fr_pgl);
> +	}
> +
> +	while (!list_empty(&buf->rb_mws)) {
> +		struct rpcrdma_mw *r;
> +		switch (ia->ri_memreg_strategy) {
> +		case RPCRDMA_MTHCAFMR:
> +			r = list_entry(buf->rb_mws.next,
> +				       struct rpcrdma_mw, mw_list);
> +			list_del(&r->mw_list);
> +			rc = ib_dealloc_fmr(r->r.fmr);
> +			if (rc)
> +				dprintk("RPC: %s:"
> +					" ib_dealloc_fmr"
> +					" failed %i\n",
> +					__func__, rc);
> +			break;
> +		case RPCRDMA_MEMWINDOWS_ASYNC:
> +		case RPCRDMA_MEMWINDOWS:
> +			r = list_entry(buf->rb_mws.next,
> +				       struct rpcrdma_mw, mw_list);
> +			list_del(&r->mw_list);
> +			rc = ib_dealloc_mw(r->r.mw);
> +			if (rc)
> +				dprintk("RPC: %s: ib_dealloc_mw "
> +					"failed %i\n", __func__, rc);
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
>  	for (i = 0; i < buf->rb_max_requests; i++) {
>  		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
>  			rpcrdma_deregister_internal(ia,
> @@ -1064,33 +1198,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
>  			kfree(buf->rb_recv_bufs[i]);
>  		}
>  		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
> -			while (!list_empty(&buf->rb_mws)) {
> -				struct rpcrdma_mw *r;
> -				r = list_entry(buf->rb_mws.next,
> -					struct rpcrdma_mw, mw_list);
> -				list_del(&r->mw_list);
> -				switch (ia->ri_memreg_strategy) {
> -				case RPCRDMA_MTHCAFMR:
> -					rc = ib_dealloc_fmr(r->r.fmr);
> -					if (rc)
> -						dprintk("RPC: %s:"
> -							" ib_dealloc_fmr"
> -							" failed %i\n",
> -							__func__, rc);
> -					break;
> -				case RPCRDMA_MEMWINDOWS_ASYNC:
> -				case RPCRDMA_MEMWINDOWS:
> -					rc = ib_dealloc_mw(r->r.mw);
> -					if (rc)
> -						dprintk("RPC: %s:"
> -							" ib_dealloc_mw"
> -							" failed %i\n",
> -							__func__, rc);
> -					break;
> -				default:
> -					break;
> -				}
> -			}
>  			rpcrdma_deregister_internal(ia,
>  				buf->rb_send_bufs[i]->rl_handle,
>  				&buf->rb_send_bufs[i]->rl_iov);
> @@ -1115,6 +1222,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
>  {
>  	struct rpcrdma_req *req;
>  	unsigned long flags;
> +	int i;
>
>  	spin_lock_irqsave(&buffers->rb_lock, flags);
>  	if (buffers->rb_send_index == buffers->rb_max_requests) {
> @@ -1134,8 +1242,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
>  		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
>  	}
>  	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
> +	for (i = 0; i < RPCRDMA_MAX_SEGS; i++)
> +		req->rl_segments[i].mr_chunk.rl_fr = NULL;
> +
>  	if (!list_empty(&buffers->rb_mws)) {
> -		int i = RPCRDMA_MAX_SEGS - 1;
> +		i = RPCRDMA_MAX_SEGS - 1;
>  		do {
>  			struct rpcrdma_mw *r;
>  			r = list_entry(buffers->rb_mws.next,
> @@ -1148,6 +1259,31 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
>  	return req;
>  }
>
> +static void
> +rpcrdma_free_frmr(struct rpcrdma_buffer *buf, struct rpcrdma_frmr *fr_mr)
> +{
> +	unsigned long flags;
> +	spin_lock_irqsave(&buf->rb_frs_lock, flags);
> +	list_add(&fr_mr->fr_list, &buf->rb_frs);
> +	spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
> +}
> +
> +static struct rpcrdma_frmr *
> +rpcrdma_alloc_frmr(struct rpcrdma_buffer *buf)
> +{
> +	unsigned long flags;
> +	struct rpcrdma_frmr *fr_mr = NULL;
> +
> +	spin_lock_irqsave(&buf->rb_frs_lock, flags);
> +	if (!list_empty(&buf->rb_frs)) {
> +		fr_mr = list_entry(buf->rb_frs.next,
> +				   struct rpcrdma_frmr, fr_list);
> +		list_del_init(&fr_mr->fr_list);
> +	}
> +	spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
> +	return fr_mr;
> +}
> +
>  /*
>   * Put request/reply buffers back into pool.
>   * Pre-decrement counter/array index.
> @@ -1252,9 +1388,10 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
>  			va, len, DMA_BIDIRECTIONAL);
>  	iov->length = len;
>
> -	if (ia->ri_bind_mem != NULL) {
> +	if (RPCRDMA_FASTREG == ia->ri_memreg_strategy ||
> +	    ia->ri_bind_mem) {
>  		*mrp = NULL;
> -		iov->lkey = ia->ri_bind_mem->lkey;
> +		iov->lkey = ia->ri_dma_lkey;
>  		return 0;
>  	}
>
> @@ -1302,6 +1439,43 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
>  /*
>   * Wrappers for chunk registration, shared by read/write chunk code.
>   */
> +static int
> +rpcrdma_fastreg_seg(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *mr,
> +		    int nsegs, u32 access)
> +{
> +	struct ib_send_wr invalidate_wr, fastreg_wr, *bad_wr;
> +	u8 key;
> +	u32 rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
> +	int ret;
> +
> +	/* Prepare INVALIDATE WR */
> +	memset(&invalidate_wr, 0, sizeof invalidate_wr);
> +	invalidate_wr.opcode = IB_WR_LOCAL_INV;
> +	invalidate_wr.send_flags = IB_SEND_SIGNALED;
> +	invalidate_wr.ex.invalidate_rkey = rkey;
> +	invalidate_wr.next = &fastreg_wr;
> +
> +	/* Bump the key */
> +	key = (u8)(mr->mr_chunk.rl_fr->fr_mr->rkey & 0x000000FF);
> +	ib_update_fast_reg_key(mr->mr_chunk.rl_fr->fr_mr, ++key);
> +
> +	/* Prepare FASTREG WR */
> +	memset(&fastreg_wr, 0, sizeof fastreg_wr);
> +	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
> +	fastreg_wr.send_flags = IB_SEND_SIGNALED;
> +	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)mr->mr_dma;
> +	fastreg_wr.wr.fast_reg.page_list = mr->mr_chunk.rl_fr->fr_pgl;
> +	fastreg_wr.wr.fast_reg.page_list_len = nsegs;
> +	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
> +	fastreg_wr.wr.fast_reg.length = nsegs << PAGE_SHIFT;
> +	fastreg_wr.wr.fast_reg.access_flags = access;
> +	fastreg_wr.wr.fast_reg.rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
> +	ret = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
> +	dprintk("RPC: %s fast reg rkey %08x kva %llx map_len "
> +		"%d page_list_len %d ret %d\n", __func__,
> +		rkey, mr->mr_dma, nsegs << PAGE_SHIFT, nsegs, ret);
> +	return ret;
> +}
>
>  static void
>  rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
> @@ -1353,6 +1527,53 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
>  #endif
>
>  	/* Registration using fast memory registration */
> +	case RPCRDMA_FASTREG:
> +	{
> +		int len, pageoff = offset_in_page(seg->mr_offset);

Ditto...
> +
> +		seg1->mr_chunk.rl_fr = rpcrdma_alloc_frmr(&r_xprt->rx_buf);
> +		if (!seg1->mr_chunk.rl_fr) {
> +			printk("RPC: Failed to allocate frmr\n");
> +			rc = -ENOMEM;
> +			break;
> +		}
> +		seg1->mr_offset -= pageoff;	/* start of page */
> +		seg1->mr_len += pageoff;
> +		len = -pageoff;
> +		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
> +			nsegs = RPCRDMA_MAX_DATA_SEGS;
> +		for (i = 0; i < nsegs;) {
> +			rpcrdma_map_one(ia, seg, writing);
> +			seg1->mr_chunk.rl_fr->fr_pgl->page_list[i] = seg->mr_dma;
> +			len += seg->mr_len;
> +			++seg;
> +			++i;
> +			/* Check for holes */
> +			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
> +			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
> +				break;
> +		}
> +		nsegs = i;
> +		dprintk("RPC: %s: Using fmr %p to map %d segments\n",
> +			__func__, seg1->mr_chunk.rl_fr, nsegs);
> +		rc = rpcrdma_fastreg_seg(ia, seg1, nsegs, mem_priv);
> +		if (rc) {
> +			printk("RPC: %s: failed ib_map_phys_fmr "
> +				"%u@0x%llx+%i (%d)... status %i\n", __func__,
> +				len, (unsigned long long)seg1->mr_dma,
> +				pageoff, nsegs, rc);
> +			while (nsegs--)
> +				rpcrdma_unmap_one(ia, --seg);
> +		} else {
> +			seg1->mr_rkey = seg1->mr_chunk.rl_fr->fr_mr->rkey;
> +			seg1->mr_base = seg1->mr_dma + pageoff;
> +			seg1->mr_nsegs = nsegs;
> +			seg1->mr_len = len;
> +		}
> +	}
> +	break;
> +
> +	/* Registration using MTHCA FMR */
>  	case RPCRDMA_MTHCAFMR:
>  		{
>  		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
> @@ -1486,6 +1707,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
>  		break;
>  #endif
>
> +	case RPCRDMA_FASTREG:
> +		while (seg1->mr_nsegs--) {
> +			if (seg1->mr_chunk.rl_fr) {
> +				rpcrdma_free_frmr(&r_xprt->rx_buf, seg1->mr_chunk.rl_fr);
> +				seg1->mr_chunk.rl_fr = NULL;
> +			}
> +			rpcrdma_unmap_one(ia, seg++);
> +		}
> +		break;
> +
>  	case RPCRDMA_MTHCAFMR:
>  		{
>  		LIST_HEAD(l);
> @@ -1590,7 +1821,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
>  		INIT_CQCOUNT(ep);
>  		send_wr.send_flags = IB_SEND_SIGNALED;
>  	}
> -
>  	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
>  	if (rc)
>  		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
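
To be concrete about what I mean by "make a function": here is a minimal,
untested sketch that just lifts the FASTREG arm of the switch in
rpcrdma_buffer_create() into a helper. The name rpcrdma_init_frmrs() and
the exact signature are mine, not something in the patch, and I've dropped
the per-frmr dprintk for brevity:

static int
rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf, void *p)
{
	struct rpcrdma_frmr *fr = p;
	int i, rc;

	/* One frmr per segment of every request, plus one spare set,
	 * exactly as in the loop quoted above. */
	for (i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; i; i--) {
		fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, RPCRDMA_MAX_SEGS);
		if (IS_ERR(fr->fr_mr)) {
			rc = PTR_ERR(fr->fr_mr);
			dprintk("RPC: %s: ib_alloc_fast_reg_mr failed %i\n",
				__func__, rc);
			return rc;
		}
		fr->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
							 RPCRDMA_MAX_SEGS);
		if (IS_ERR(fr->fr_pgl)) {
			rc = PTR_ERR(fr->fr_pgl);
			dprintk("RPC: %s: ib_alloc_fast_reg_page_list failed %i\n",
				__func__, rc);
			return rc;
		}
		INIT_LIST_HEAD(&fr->fr_list);
		list_add(&fr->fr_list, &buf->rb_frs);
		++fr;
	}
	return 0;
}

The switch arm in rpcrdma_buffer_create() then collapses to

	case RPCRDMA_FASTREG:
		rc = rpcrdma_init_frmrs(ia, buf, p);
		if (rc)
			goto out;
		break;

and the RPCRDMA_FASTREG arm of rpcrdma_register_external() deserves the
same treatment.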