From: Chuck Lever <chuck.lever@xxxxxxxxxx> xprtiod_workqueue is a MEM_RECLAIM-enabled workqueue. However, the RDMA core API functions are not memory reclaim-safe. Avoiding the invocation of RDMA core API functions from a MEM_RECLAIM context was partially accomplished by commit 6b1eb3b22272 ("SUNRPC: Replace the use of the xprtiod WQ in rpcrdma"). This commit addressed the issue in the connect path, but not in the disconnect path. Thus sometimes a transport disconnect results in this splat: workqueue: WQ_MEM_RECLAIM xprtiod:xprt_autoclose [sunrpc] is flushing !WQ_MEM_RECLAIM events_highpri:rpcrdma_mr_refresh_worker [rpcrdma] WARNING: CPU: 1 PID: 20378 at kernel/workqueue.c:3728 check_flush_dependency+0x101/0x120 ? check_flush_dependency+0x101/0x120 ? report_bug+0x175/0x1a0 ? handle_bug+0x44/0x90 ? exc_invalid_op+0x1c/0x70 ? asm_exc_invalid_op+0x1f/0x30 ? __pfx_rpcrdma_mr_refresh_worker+0x10/0x10 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac] ? check_flush_dependency+0x101/0x120 __flush_work.isra.0+0x20a/0x290 __cancel_work_sync+0x129/0x1c0 cancel_work_sync+0x14/0x20 rpcrdma_xprt_disconnect+0x229/0x3f0 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac] xprt_rdma_close+0x16/0x40 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac] xprt_autoclose+0x63/0x110 [sunrpc a04d701bce94b5a8fb541cafbe1a489d6b1ab5b3] process_one_work+0x19e/0x3f0 worker_thread+0x340/0x510 ? __pfx_worker_thread+0x10/0x10 kthread+0xf7/0x130 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x41/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Create a context, invoked during transport disconnect, in which it is safe to manage resources that are not memory reclaim-safe. Essentially this means that releasing an rpcrdma_ep is now done completely asynchronously. Subsequent patches will move the release of transport resources into this new context. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218704 Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> --- net/sunrpc/xprtrdma/transport.c | 20 +++++++++++++- net/sunrpc/xprtrdma/verbs.c | 46 ++++++++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++- 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 29b0562d62e7..237d78c1ec54 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -761,8 +761,12 @@ static struct xprt_class xprt_rdma = { .netid = { "rdma", "rdma6", "" }, }; +struct workqueue_struct *rpcrdma_release_wq __read_mostly; + void xprt_rdma_cleanup(void) { + struct workqueue_struct *wq; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); @@ -772,18 +776,32 @@ void xprt_rdma_cleanup(void) xprt_unregister_transport(&xprt_rdma); xprt_unregister_transport(&xprt_rdma_bc); + + wq = rpcrdma_release_wq; + rpcrdma_release_wq = NULL; + destroy_workqueue(wq); } int xprt_rdma_init(void) { + struct workqueue_struct *wq; int rc; + /* provision a WQ that is always unbound and !mem_reclaim */ + wq = alloc_workqueue("rpcrdma_release", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + rpcrdma_release_wq = wq; + rc = xprt_register_transport(&xprt_rdma); - if (rc) + if (rc) { + destroy_workqueue(wq); return rc; + } rc = xprt_register_transport(&xprt_rdma_bc); if (rc) { + destroy_workqueue(wq); xprt_unregister_transport(&xprt_rdma); return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c6d9d94c28ba..f1e4a28325fa 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -73,7 +73,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); -static int 
rpcrdma_ep_put(struct rpcrdma_ep *ep); +static void rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, int node); @@ -234,15 +234,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_ROUTE_RESOLVED: ep->re_async_rc = 0; complete(&ep->re_done); - return 0; + break; case RDMA_CM_EVENT_ADDR_ERROR: ep->re_async_rc = -EPROTO; complete(&ep->re_done); - return 0; + break; case RDMA_CM_EVENT_ROUTE_ERROR: ep->re_async_rc = -ENETUNREACH; complete(&ep->re_done); - return 0; + break; case RDMA_CM_EVENT_DEVICE_REMOVAL: pr_info("rpcrdma: removing device %s for %pISpc\n", ep->re_id->device->name, sap); @@ -269,12 +269,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ENOTCONN; wake_connect_worker: wake_up_all(&ep->re_connect_wait); - return 0; + break; case RDMA_CM_EVENT_DISCONNECTED: ep->re_connect_status = -ECONNABORTED; disconnected: rpcrdma_force_disconnect(ep); - return rpcrdma_ep_put(ep); + rpcrdma_ep_put(ep); + fallthrough; default: break; } @@ -328,9 +329,13 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, return ERR_PTR(rc); } -static void rpcrdma_ep_destroy(struct kref *kref) +/* Delayed release of a connection's hardware resources. Releasing + * RDMA hardware resources is done in a !MEM_RECLAIM context because + * the RDMA core API functions are generally not reclaim-safe. 
+ */ +static void rpcrdma_ep_destroy(struct work_struct *work) { - struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, re_worker); if (ep->re_id->qp) { rdma_destroy_qp(ep->re_id); @@ -348,22 +353,30 @@ static void rpcrdma_ep_destroy(struct kref *kref) ib_dealloc_pd(ep->re_pd); ep->re_pd = NULL; + if (ep->re_id) + rdma_destroy_id(ep->re_id); + ep->re_id = NULL; + kfree(ep); module_put(THIS_MODULE); } +static void rpcrdma_ep_release(struct kref *kref) +{ + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); + + INIT_WORK(&ep->re_worker, rpcrdma_ep_destroy); + queue_work(rpcrdma_release_wq, &ep->re_worker); +} + static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep) { kref_get(&ep->re_kref); } -/* Returns: - * %0 if @ep still has a positive kref count, or - * %1 if @ep was destroyed successfully. - */ -static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep) +static noinline void rpcrdma_ep_put(struct rpcrdma_ep *ep) { - return kref_put(&ep->re_kref, rpcrdma_ep_destroy); + kref_put(&ep->re_kref, rpcrdma_ep_release); } static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) @@ -475,7 +488,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) out_destroy: rpcrdma_ep_put(ep); - rdma_destroy_id(id); return rc; } @@ -566,10 +578,8 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); - if (rpcrdma_ep_put(ep)) - rdma_destroy_id(id); - r_xprt->rx_ep = NULL; + rpcrdma_ep_put(ep); } /* Fixed-size circular FIFO queue. 
This implementation is wait-free and diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 08bda29ed953..048d2e329384 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,7 +70,6 @@ */ struct rpcrdma_mr; struct rpcrdma_ep { - struct kref re_kref; struct rdma_cm_id *re_id; struct ib_pd *re_pd; unsigned int re_max_rdma_segs; @@ -100,6 +99,9 @@ struct rpcrdma_ep { atomic_t re_completion_ids; char re_write_pad[XDR_UNIT]; + + struct kref re_kref; + struct work_struct re_worker; }; /* Pre-allocate extra Work Requests for handling reverse-direction @@ -583,6 +585,7 @@ void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); void xprt_rdma_close(struct rpc_xprt *xprt); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); +extern struct workqueue_struct *rpcrdma_release_wq; int xprt_rdma_init(void); void xprt_rdma_cleanup(void); -- 2.44.0