> On Apr 26, 2016, at 4:26 PM, Sagi Grimberg <sagi@xxxxxxxxxxx> wrote:
>
>
>
> On 25/04/16 22:22, Chuck Lever wrote:
>> There needs to be a safe method of releasing registered memory
>> resources when an RPC terminates. Safe can mean a number of things:
>>
>> + Doesn't have to sleep
>>
>> + Doesn't rely on having a QP in RTS
>>
>> ro_unmap_safe will be that safe method. It can be used in cases
>> where synchronous memory invalidation can deadlock, or needs to have
>> an active QP.
>>
>> The important case is fencing an RPC's memory regions after it is
>> signaled (^C) and before it exits. If this is not done, there is a
>> window where the server can write an RPC reply into memory that the
>> client has released and re-used for some other purpose.
>>
>> Note that this is a full solution for FRWR, but FMR and physical
>> still have some gaps where a particularly bad server can wreak
>> some havoc on the client. These gaps are not made worse by this
>> patch and are expected to be exceptionally rare and timing-based.
>> They are noted in documenting comments.
>>
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>>  net/sunrpc/xprtrdma/fmr_ops.c      |  105 +++++++++++++++++++++++++++++++++---
>>  net/sunrpc/xprtrdma/frwr_ops.c     |   27 +++++++++
>>  net/sunrpc/xprtrdma/physical_ops.c |   20 +++++++
>>  net/sunrpc/xprtrdma/rpc_rdma.c     |    5 --
>>  net/sunrpc/xprtrdma/transport.c    |    9 +--
>>  net/sunrpc/xprtrdma/xprt_rdma.h    |    3 +
>>  6 files changed, 150 insertions(+), 19 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>> index 9d50f3a..a658dcf 100644
>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>> @@ -35,6 +35,64 @@
>>  /* Maximum scatter/gather per FMR */
>>  #define RPCRDMA_MAX_FMR_SGES	(64)
>>
>> +static struct workqueue_struct *fmr_recovery_wq;
>> +
>> +#define FMR_RECOVERY_WQ_FLAGS	(WQ_UNBOUND)
>> +
>> +int
>> +fmr_alloc_recovery_wq(void)
>> +{
>> +	fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
>> +	return !fmr_recovery_wq ? -ENOMEM : 0;
>> +}
>> +
>> +void
>> +fmr_destroy_recovery_wq(void)
>> +{
>> +	struct workqueue_struct *wq;
>> +
>> +	if (!fmr_recovery_wq)
>> +		return;
>> +
>> +	wq = fmr_recovery_wq;
>> +	fmr_recovery_wq = NULL;
>> +	destroy_workqueue(wq);
>> +}
>> +
>> +static int
>> +__fmr_unmap(struct rpcrdma_mw *mw)
>> +{
>> +	LIST_HEAD(l);
>> +
>> +	list_add(&mw->fmr.fmr->list, &l);
>> +	return ib_unmap_fmr(&l);
>> +}
>> +
>> +/* Deferred reset of a single FMR. Generate a fresh rkey by
>> + * replacing the MR. There's no recovery if this fails.
>> + */
>> +static void
>> +__fmr_recovery_worker(struct work_struct *work)
>> +{
>> +	struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
>> +					     mw_work);
>> +	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
>> +
>> +	__fmr_unmap(mw);
>> +	rpcrdma_put_mw(r_xprt, mw);
>> +	return;
>> +}
>> +
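[ Note: the recovery worker above leans on two struct rpcrdma_mw
fields that live in xprt_rdma.h and are not quoted in this excerpt.
A sketch reconstructed from how they are used here; their exact
placement within the struct is an assumption:

struct rpcrdma_mw {
	...
	struct work_struct	mw_work;	/* deferred recovery work item */
	struct rpcrdma_xprt	*mw_xprt;	/* back-pointer for the worker,
						 * set at MR init time */
	...
};
]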
>> +/* A broken MR was discovered in a context that can't sleep.
>> + * Defer recovery to the recovery worker.
>> + */
>> +static void
>> +__fmr_queue_recovery(struct rpcrdma_mw *mw)
>> +{
>> +	INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
>> +	queue_work(fmr_recovery_wq, &mw->mw_work);
>> +}
>> +
>>  static int
>>  fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
>>  	    struct rpcrdma_create_data_internal *cdata)
>> @@ -92,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
>>  		if (IS_ERR(r->fmr.fmr))
>>  			goto out_fmr_err;
>>
>> +		r->mw_xprt = r_xprt;
>>  		list_add(&r->mw_list, &buf->rb_mws);
>>  		list_add(&r->mw_all, &buf->rb_all);
>>  	}
>> @@ -107,15 +166,6 @@ out:
>>  	return rc;
>>  }
>>
>> -static int
>> -__fmr_unmap(struct rpcrdma_mw *r)
>> -{
>> -	LIST_HEAD(l);
>> -
>> -	list_add(&r->fmr.fmr->list, &l);
>> -	return ib_unmap_fmr(&l);
>> -}
>> -
>>  /* Use the ib_map_phys_fmr() verb to register a memory region
>>   * for remote access via RDMA READ or RDMA WRITE.
>>   */
>> @@ -242,6 +292,42 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
>>  	req->rl_nchunks = 0;
>>  }
>>
>> +/* Use a slow, safe mechanism to invalidate all memory regions
>> + * that were registered for "req".
>> + *
>> + * In the asynchronous case, DMA unmapping occurs first here
>> + * because the rpcrdma_mr_seg is released immediately after this
>> + * call. Its contents won't be available in __fmr_dma_unmap later.
>> + * FIXME.
>> + */
>> +static void
>> +fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
>> +		  bool sync)
>> +{
>> +	struct rpcrdma_mr_seg *seg;
>> +	struct rpcrdma_mw *mw;
>> +	unsigned int i;
>> +
>> +	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
>> +		seg = &req->rl_segments[i];
>> +		mw = seg->rl_mw;
>> +
>> +		if (sync) {
>> +			/* ORDER */
>> +			__fmr_unmap(mw);
>> +			__fmr_dma_unmap(r_xprt, seg);
>> +			rpcrdma_put_mw(r_xprt, mw);
>> +		} else {
>> +			__fmr_dma_unmap(r_xprt, seg);
>> +			__fmr_queue_recovery(mw);
>> +		}
>> +
>> +		i += seg->mr_nsegs;
>> +		seg->mr_nsegs = 0;
>> +		seg->rl_mw = NULL;
>> +	}
>> +}
>> +
>>  /* Use the ib_unmap_fmr() verb to prevent further remote
>>   * access via RDMA READ or RDMA WRITE.
>>   */
>> @@ -295,6 +381,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
>>  const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
>>  	.ro_map				= fmr_op_map,
>>  	.ro_unmap_sync			= fmr_op_unmap_sync,
>> +	.ro_unmap_safe			= fmr_op_unmap_safe,
>>  	.ro_unmap			= fmr_op_unmap,
>>  	.ro_open			= fmr_op_open,
>>  	.ro_maxpages			= fmr_op_maxpages,
>> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
>> index 1251a1d..79ba323 100644
>> --- a/net/sunrpc/xprtrdma/frwr_ops.c
>> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
>> @@ -614,6 +614,32 @@ reset_mrs:
>>  	goto unmap;
>>  }
>>
>> +/* Use a slow, safe mechanism to invalidate all memory regions
>> + * that were registered for "req".
>> + */
>> +static void
>> +frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
>> +		   bool sync)
>> +{
>> +	struct rpcrdma_mr_seg *seg;
>> +	struct rpcrdma_mw *mw;
>> +	unsigned int i;
>> +
>> +	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
>> +		seg = &req->rl_segments[i];
>> +		mw = seg->rl_mw;
>> +
>> +		if (sync)
>> +			__frwr_reset_and_unmap(r_xprt, mw);
>> +		else
>> +			__frwr_queue_recovery(mw);
>> +
>> +		i += seg->mr_nsegs;
>> +		seg->mr_nsegs = 0;
>> +		seg->rl_mw = NULL;
>> +	}
>> +}
>> +
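[ Note: the transport.c hunk that consumes ro_unmap_safe appears in
the diffstat but is not quoted in this excerpt. A minimal sketch of
that caller follows; req_from_buffer(), req_to_xprt(), and rl_task
are assumed names standing in for however the transport recovers the
rpcrdma_req and its owning RPC task:

/* Called when an RPC's buffers are freed. A synchronous RPC may
 * sleep here, so it can afford inline invalidation; an asynchronous
 * RPC must take the deferred (workqueue) path instead.
 */
static void
xprt_rdma_free(void *buffer)
{
	struct rpcrdma_req *req = req_from_buffer(buffer);
	struct rpcrdma_xprt *r_xprt = req_to_xprt(req);

	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
					    !RPC_IS_ASYNC(req->rl_task));

	rpcrdma_buffer_put(req);
}
]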
>>  /* Post a LOCAL_INV Work Request to prevent further remote access
>>   * via RDMA READ or RDMA WRITE.
>>   */
>> @@ -675,6 +701,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
>>  const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
>>  	.ro_map				= frwr_op_map,
>>  	.ro_unmap_sync			= frwr_op_unmap_sync,
>> +	.ro_unmap_safe			= frwr_op_unmap_safe,
>>  	.ro_unmap			= frwr_op_unmap,
>>  	.ro_open			= frwr_op_open,
>>  	.ro_maxpages			= frwr_op_maxpages,
>> diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
>> index 2dc6ec2..95ef3a7 100644
>> --- a/net/sunrpc/xprtrdma/physical_ops.c
>> +++ b/net/sunrpc/xprtrdma/physical_ops.c
>> @@ -97,6 +97,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
>>  		rpcrdma_unmap_one(device, &req->rl_segments[i++]);
>>  }
>>
>> +/* Use a slow, safe mechanism to invalidate all memory regions
>> + * that were registered for "req".
>> + *
>> + * For physical memory registration, there is no good way to
>> + * fence a single MR that has been advertised to the server. The
>> + * client has already handed the server an R_key that cannot be
>> + * invalidated and is shared by all MRs on this connection.
>> + * Tearing down the PD might be the only safe choice, but it's
>> + * not clear that a freshly acquired DMA R_key would be different
>> + * than the one used by the PD that was just destroyed.
>> + * FIXME.
>> + */
>> +static void
>> +physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
>> +		       bool sync)
>> +{
>> +	physical_op_unmap_sync(r_xprt, req);
>> +}
>> +
>
> So physical has no async mode?

Nope. There's no way to fence memory once a client has exposed its
whole-memory R_key. The client could drop its connection and delete
the PD.

> Is there a device that makes you resort to physical memreg?

I'm not aware of one.

> It's an awful lot of maintenance on what looks to be an esoteric (at
> best) code path.

It's never chosen as a fallback; a user has to select that mode
explicitly.

physical has long been on the chopping block. The last time I
suggested removing it, I got a complaint. But there's no in-kernel
device that requires this mode, so it seems like it should go sooner
rather than later.

> The rest looks fine to me.

--
Chuck Lever