Hi Chuck, On 2024/10/16 9:57 PM, Chuck Lever wrote: > On Wed, Oct 16, 2024 at 07:48:25PM +0800, Kinglong Mee wrote: >> NFSd's DRC key contains peer port, but rpcrdma does't support port resue now. >> This patch supports resvport and resueport as tcp/udp for rpcrdma. > > An RDMA consumer is not in control of the "source port" in an RDMA > connection, thus the port number is meaningless. This is why the > Linux NFS client does not already support setting the source port on > RDMA mounts, and why NFSD sets the source port value to zero on > incoming connections; the DRC then always sees a zero port value in > its lookup key tuple. > > See net/sunrpc/xprtrdma/svc_rdma_transport.c :: handle_connect_req() : > > 259 /* The remote port is arbitrary and not under the control of the > 260 * client ULP. Set it to a fixed value so that the DRC continues > 261 * to be effective after a reconnect. > 262 */ > 263 rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0); > > > As a general comment, your patch descriptions need to explain /why/ > each change is being made. For example, the initial patches in this > series, although they properly split the changes into clean easily > digestible hunks, are somewhat baffling until the reader gets to > this one. This patch jumps right to announcing a solution. Thanks for your comment. > > There's no cover letter tying these changes together with a problem > statement. What problematic behavior did you see that motivated this > approach? We have a private nfs server, it's DRC checks the src port, but rpcrdma doesnot support resueport now, so we try to add it as tcp/udp. Maybe someone needs the src port at rpcrdma connect, I made those patches. For the knfsd and nfs client, I don't meet any problem. Thanks, Kinglong Mee > > >> Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx> >> --- >> net/sunrpc/xprtrdma/transport.c | 36 ++++++++++++ >> net/sunrpc/xprtrdma/verbs.c | 100 ++++++++++++++++++++++++++++++++ >> net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++ >> 3 files changed, 141 insertions(+) >> >> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c >> index 9a8ce5df83ca..fee3b562932b 100644 >> --- a/net/sunrpc/xprtrdma/transport.c >> +++ b/net/sunrpc/xprtrdma/transport.c >> @@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; >> unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; >> int xprt_rdma_pad_optimize; >> static struct xprt_class xprt_rdma; >> +static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT; >> +static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT; >> +unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT; >> +unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT; >> >> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) >> >> @@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = { >> .mode = 0644, >> .proc_handler = proc_dointvec, >> }, >> + { >> + .procname = "rdma_min_resvport", >> + .data = &xprt_rdma_min_resvport, >> + .maxlen = sizeof(unsigned int), >> + .mode = 0644, >> + .proc_handler = proc_dointvec_minmax, >> + .extra1 = &xprt_rdma_min_resvport_limit, >> + .extra2 = &xprt_rdma_max_resvport_limit >> + }, >> + { >> + .procname = "rdma_max_resvport", >> + .data = &xprt_rdma_max_resvport, >> + .maxlen = sizeof(unsigned int), >> + .mode = 0644, >> + .proc_handler = proc_dointvec_minmax, >> + .extra1 = &xprt_rdma_min_resvport_limit, >> + .extra2 = &xprt_rdma_max_resvport_limit >> + }, >> }; >> >> #endif >> @@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args) >> xprt_rdma_format_addresses(xprt, sap); >> >> new_xprt = rpcx_to_rdmax(xprt); >> + >> + if (args->srcaddr) >> + memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen); >> + else { >> + rc = rpc_init_anyaddr(args->dstaddr->sa_family, >> + (struct sockaddr *)&new_xprt->rx_srcaddr); >> + if (rc != 0) { >> + xprt_rdma_free_addresses(xprt); >> + xprt_free(xprt); >> + module_put(THIS_MODULE); >> + return ERR_PTR(rc); >> + } >> + } >> + >> rc = rpcrdma_buffer_create(new_xprt); >> if (rc) { >> xprt_rdma_free_addresses(xprt); >> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c >> index 63262ef0c2e3..0ce5123d799b 100644 >> --- a/net/sunrpc/xprtrdma/verbs.c >> +++ b/net/sunrpc/xprtrdma/verbs.c >> @@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) >> xprt_force_disconnect(ep->re_xprt); >> } >> >> +static int rpcrdma_get_random_port(void) >> +{ >> + unsigned short min = xprt_rdma_min_resvport, max = xprt_rdma_max_resvport; >> + unsigned short range; >> + unsigned short rand; >> + >> + if (max < min) >> + return -EADDRINUSE; >> + range = max - min + 1; >> + rand = get_random_u32_below(range); >> + return rand + min; >> +} >> + >> +static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id) >> +{ >> + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr; >> + >> + if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport) { >> + switch (sap->sa_family) { >> + case AF_INET6: >> + r_xprt->rx_srcport = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); >> + break; >> + case AF_INET: >> + r_xprt->rx_srcport = ntohs(((struct sockaddr_in *)sap)->sin_port); >> + break; >> + } >> + } >> +} >> + >> +static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt) >> +{ >> + int port = r_xprt->rx_srcport; >> + >> + if (port == 0 && r_xprt->rx_xprt.resvport) >> + port = rpcrdma_get_random_port(); >> + return port; >> +} >> + >> +static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port) >> +{ >> + if (r_xprt->rx_srcport != 0) >> + r_xprt->rx_srcport = 0; >> + if (!r_xprt->rx_xprt.resvport) >> + return 0; >> + if (port <= xprt_rdma_min_resvport || port > xprt_rdma_max_resvport) >> + return xprt_rdma_max_resvport; >> + return --port; >> +} >> + >> +static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id) >> +{ >> + struct sockaddr_storage myaddr; >> + int err, nloop = 0; >> + int port = rpcrdma_get_srcport(r_xprt); >> + unsigned short last; >> + >> + /* >> + * If we are asking for any ephemeral port (i.e. port == 0 && >> + * r_xprt->rx_xprt.resvport == 0), don't bind. Let the local >> + * port selection happen implicitly when the socket is used >> + * (for example at connect time). >> + * >> + * This ensures that we can continue to establish TCP >> + * connections even when all local ephemeral ports are already >> + * a part of some TCP connection. This makes no difference >> + * for UDP sockets, but also doesn't harm them. >> + * >> + * If we're asking for any reserved port (i.e. port == 0 && >> + * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport above will >> + * ensure that port is non-zero and we will bind as needed. >> + */ >> + if (port <= 0) >> + return port; >> + >> + memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen); >> + do { >> + rpc_set_port((struct sockaddr *)&myaddr, port); >> + err = rdma_bind_addr(id, (struct sockaddr *)&myaddr); >> + if (err == 0) { >> + if (r_xprt->rx_xprt.reuseport) >> + r_xprt->rx_srcport = port; >> + break; >> + } >> + last = port; >> + port = rpcrdma_next_srcport(r_xprt, port); >> + if (port > last) >> + nloop++; >> + } while (err == -EADDRINUSE && nloop != 2); >> + >> + return err; >> +} >> + >> static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, >> struct rpcrdma_ep *ep) >> { >> @@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, >> if (IS_ERR(id)) >> return id; >> >> + rc = rpcrdma_bind(r_xprt, id); >> + if (rc) { >> + rc = -ENOTCONN; >> + goto out; >> + } >> + >> ep->re_async_rc = -ETIMEDOUT; >> rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, >> RDMA_RESOLVE_TIMEOUT); >> @@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, >> if (rc) >> goto out; >> >> + rpcrdma_set_srcport(r_xprt, id); >> + >> return id; >> >> out: >> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h >> index 8147d2b41494..9c7bcb541267 100644 >> --- a/net/sunrpc/xprtrdma/xprt_rdma.h >> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h >> @@ -433,6 +433,9 @@ struct rpcrdma_xprt { >> struct delayed_work rx_connect_worker; >> struct rpc_timeout rx_timeout; >> struct rpcrdma_stats rx_stats; >> + >> + struct sockaddr_storage rx_srcaddr; >> + unsigned short rx_srcport; >> }; >> >> #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) >> @@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) >> */ >> extern unsigned int xprt_rdma_max_inline_read; >> extern unsigned int xprt_rdma_max_inline_write; >> +extern unsigned int xprt_rdma_min_resvport; >> +extern unsigned int xprt_rdma_max_resvport; >> void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); >> void xprt_rdma_free_addresses(struct rpc_xprt *xprt); >> void xprt_rdma_close(struct rpc_xprt *xprt); >> -- >> 2.47.0 >> >