NFSd's DRC key contains peer port, but rpcrdma does't support port resue now. This patch supports resvport and resueport as tcp/udp for rpcrdma. Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx> --- net/sunrpc/xprtrdma/transport.c | 36 ++++++++++++ net/sunrpc/xprtrdma/verbs.c | 100 ++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++ 3 files changed, 141 insertions(+) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9a8ce5df83ca..fee3b562932b 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; static struct xprt_class xprt_rdma; +static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT; +unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT; +unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "rdma_min_resvport", + .data = &xprt_rdma_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xprt_rdma_min_resvport_limit, + .extra2 = &xprt_rdma_max_resvport_limit + }, + { + .procname = "rdma_max_resvport", + .data = &xprt_rdma_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xprt_rdma_min_resvport_limit, + .extra2 = &xprt_rdma_max_resvport_limit + }, }; #endif @@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); + + if (args->srcaddr) + memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen); + else { + rc = rpc_init_anyaddr(args->dstaddr->sa_family, + (struct sockaddr *)&new_xprt->rx_srcaddr); + if (rc != 0) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } + } + rc = rpcrdma_buffer_create(new_xprt); if (rc) { xprt_rdma_free_addresses(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 63262ef0c2e3..0ce5123d799b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) xprt_force_disconnect(ep->re_xprt); } +static int rpcrdma_get_random_port(void) +{ + unsigned short min = xprt_rdma_min_resvport, max = xprt_rdma_max_resvport; + unsigned short range; + unsigned short rand; + + if (max < min) + return -EADDRINUSE; + range = max - min + 1; + rand = get_random_u32_below(range); + return rand + min; +} + +static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id) +{ + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr; + + if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport) { + switch (sap->sa_family) { + case AF_INET6: + r_xprt->rx_srcport = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + break; + case AF_INET: + r_xprt->rx_srcport = ntohs(((struct sockaddr_in *)sap)->sin_port); + break; + } + } +} + +static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt) +{ + int port = r_xprt->rx_srcport; + + if (port == 0 && r_xprt->rx_xprt.resvport) + port = rpcrdma_get_random_port(); + return port; +} + +static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port) +{ + if (r_xprt->rx_srcport != 0) + r_xprt->rx_srcport = 0; + if (!r_xprt->rx_xprt.resvport) + return 0; + if (port <= xprt_rdma_min_resvport || port > xprt_rdma_max_resvport) + return xprt_rdma_max_resvport; + return --port; +} + +static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id) +{ + struct sockaddr_storage myaddr; + int err, nloop = 0; + int port = rpcrdma_get_srcport(r_xprt); + unsigned short last; + + /* + * If we are asking for any ephemeral port (i.e. port == 0 && + * r_xprt->rx_xprt.resvport == 0), don't bind. Let the local + * port selection happen implicitly when the socket is used + * (for example at connect time). + * + * This ensures that we can continue to establish TCP + * connections even when all local ephemeral ports are already + * a part of some TCP connection. This makes no difference + * for UDP sockets, but also doesn't harm them. + * + * If we're asking for any reserved port (i.e. port == 0 && + * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport above will + * ensure that port is non-zero and we will bind as needed. + */ + if (port <= 0) + return port; + + memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen); + do { + rpc_set_port((struct sockaddr *)&myaddr, port); + err = rdma_bind_addr(id, (struct sockaddr *)&myaddr); + if (err == 0) { + if (r_xprt->rx_xprt.reuseport) + r_xprt->rx_srcport = port; + break; + } + last = port; + port = rpcrdma_next_srcport(r_xprt, port); + if (port > last) + nloop++; + } while (err == -EADDRINUSE && nloop != 2); + + return err; +} + static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep) { @@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (IS_ERR(id)) return id; + rc = rpcrdma_bind(r_xprt, id); + if (rc) { + rc = -ENOTCONN; + goto out; + } + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); @@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (rc) goto out; + rpcrdma_set_srcport(r_xprt, id); + return id; out: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8147d2b41494..9c7bcb541267 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -433,6 +433,9 @@ struct rpcrdma_xprt { struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; struct rpcrdma_stats rx_stats; + + struct sockaddr_storage rx_srcaddr; + unsigned short rx_srcport; }; #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) @@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) */ extern unsigned int xprt_rdma_max_inline_read; extern unsigned int xprt_rdma_max_inline_write; +extern unsigned int xprt_rdma_min_resvport; +extern unsigned int xprt_rdma_max_resvport; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); void xprt_rdma_close(struct rpc_xprt *xprt); -- 2.47.0