Re: [RFC PATCH 4/5] SUNRPC: support resvport and reuseport for rpcrdma

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 17, 2024 at 10:52:19AM +0800, Kinglong Mee wrote:
> Hi Chuck,
> 
> On 2024/10/16 9:57 PM, Chuck Lever wrote:
> > On Wed, Oct 16, 2024 at 07:48:25PM +0800, Kinglong Mee wrote:
> >> NFSd's DRC key contains peer port, but rpcrdma does't support port resue now.
> >> This patch supports resvport and resueport as tcp/udp for rpcrdma.
> > 
> > An RDMA consumer is not in control of the "source port" in an RDMA
> > connection, thus the port number is meaningless. This is why the
> > Linux NFS client does not already support setting the source port on
> > RDMA mounts, and why NFSD sets the source port value to zero on
> > incoming connections; the DRC then always sees a zero port value in
> > its lookup key tuple.
> > 
> > See net/sunrpc/xprtrdma/svc_rdma_transport.c :: handle_connect_req() :
> > 
> > 259         /* The remote port is arbitrary and not under the control of the
> > 260          * client ULP. Set it to a fixed value so that the DRC continues
> > 261          * to be effective after a reconnect.
> > 262          */
> > 263         rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0);
> > 
> > 
> > As a general comment, your patch descriptions need to explain /why/
> > each change is being made. For example, the initial patches in this
> > series, although they properly split the changes into clean easily
> > digestible hunks, are somewhat baffling until the reader gets to
> > this one. This patch jumps right to announcing a solution.
> 
> Thanks for your comment.
> 
> > 
> > There's no cover letter tying these changes together with a problem
> > statement. What problematic behavior did you see that motivated this
> > approach?
> 
> We have a private nfs server, it's DRC checks the src port, but rpcrdma doesnot
> support resueport now, so we try to add it as tcp/udp.

Thank you for clarifying!

It's common for a DRC to key on the source port. Unfortunately,
IIRC, the Linux RDMA Connection Manager does not provide an API for
an RDMA consumer (such as the Linux NFS client) to set an arbitrary
source port value on the active side. rdma_bind_addr() works on the
listen side only.

But perhaps my recollection is stale.


> Maybe someone needs the src port at rpcrdma connect, I made those patches. 
> 
> For the knfsd and nfs client, I don't meet any problem.
> 
> Thanks,
> Kinglong Mee
> > 
> > 
> >> Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx>
> >> ---
> >>  net/sunrpc/xprtrdma/transport.c |  36 ++++++++++++
> >>  net/sunrpc/xprtrdma/verbs.c     | 100 ++++++++++++++++++++++++++++++++
> >>  net/sunrpc/xprtrdma/xprt_rdma.h |   5 ++
> >>  3 files changed, 141 insertions(+)
> >>
> >> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> >> index 9a8ce5df83ca..fee3b562932b 100644
> >> --- a/net/sunrpc/xprtrdma/transport.c
> >> +++ b/net/sunrpc/xprtrdma/transport.c
> >> @@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
> >>  unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
> >>  int xprt_rdma_pad_optimize;
> >>  static struct xprt_class xprt_rdma;
> >> +static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT;
> >> +static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT;
> >> +unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT;
> >> +unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT;
> >>  
> >>  #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
> >>  
> >> @@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = {
> >>  		.mode		= 0644,
> >>  		.proc_handler	= proc_dointvec,
> >>  	},
> >> +	{
> >> +		.procname	= "rdma_min_resvport",
> >> +		.data		= &xprt_rdma_min_resvport,
> >> +		.maxlen		= sizeof(unsigned int),
> >> +		.mode		= 0644,
> >> +		.proc_handler	= proc_dointvec_minmax,
> >> +		.extra1		= &xprt_rdma_min_resvport_limit,
> >> +		.extra2		= &xprt_rdma_max_resvport_limit
> >> +	},
> >> +	{
> >> +		.procname	= "rdma_max_resvport",
> >> +		.data		= &xprt_rdma_max_resvport,
> >> +		.maxlen		= sizeof(unsigned int),
> >> +		.mode		= 0644,
> >> +		.proc_handler	= proc_dointvec_minmax,
> >> +		.extra1		= &xprt_rdma_min_resvport_limit,
> >> +		.extra2		= &xprt_rdma_max_resvport_limit
> >> +	},
> >>  };
> >>  
> >>  #endif
> >> @@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args)
> >>  	xprt_rdma_format_addresses(xprt, sap);
> >>  
> >>  	new_xprt = rpcx_to_rdmax(xprt);
> >> +
> >> +	if (args->srcaddr)
> >> +		memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen);
> >> +	else {
> >> +		rc = rpc_init_anyaddr(args->dstaddr->sa_family,
> >> +					(struct sockaddr *)&new_xprt->rx_srcaddr);
> >> +		if (rc != 0) {
> >> +			xprt_rdma_free_addresses(xprt);
> >> +			xprt_free(xprt);
> >> +			module_put(THIS_MODULE);
> >> +			return ERR_PTR(rc);
> >> +		}
> >> +	}
> >> +
> >>  	rc = rpcrdma_buffer_create(new_xprt);
> >>  	if (rc) {
> >>  		xprt_rdma_free_addresses(xprt);
> >> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> >> index 63262ef0c2e3..0ce5123d799b 100644
> >> --- a/net/sunrpc/xprtrdma/verbs.c
> >> +++ b/net/sunrpc/xprtrdma/verbs.c
> >> @@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
> >>  	xprt_force_disconnect(ep->re_xprt);
> >>  }
> >>  
> >> +static int rpcrdma_get_random_port(void)
> >> +{
> >> +	unsigned short min = xprt_rdma_min_resvport, max = xprt_rdma_max_resvport;
> >> +	unsigned short range;
> >> +	unsigned short rand;
> >> +
> >> +	if (max < min)
> >> +		return -EADDRINUSE;
> >> +	range = max - min + 1;
> >> +	rand = get_random_u32_below(range);
> >> +	return rand + min;
> >> +}
> >> +
> >> +static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
> >> +{
> >> +        struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr;
> >> +
> >> +	if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport) {
> >> +		switch (sap->sa_family) {
> >> +		case AF_INET6:
> >> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
> >> +			break;
> >> +		case AF_INET:
> >> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in *)sap)->sin_port);
> >> +			break;
> >> +		}
> >> +	}
> >> +}
> >> +
> >> +static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt)
> >> +{
> >> +	int port = r_xprt->rx_srcport;
> >> +
> >> +	if (port == 0 && r_xprt->rx_xprt.resvport)
> >> +		port = rpcrdma_get_random_port();
> >> +	return port;
> >> +}
> >> +
> >> +static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port)
> >> +{
> >> +	if (r_xprt->rx_srcport != 0)
> >> +		r_xprt->rx_srcport = 0;
> >> +	if (!r_xprt->rx_xprt.resvport)
> >> +		return 0;
> >> +	if (port <= xprt_rdma_min_resvport || port > xprt_rdma_max_resvport)
> >> +		return xprt_rdma_max_resvport;
> >> +	return --port;
> >> +}
> >> +
> >> +static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
> >> +{
> >> +	struct sockaddr_storage myaddr;
> >> +	int err, nloop = 0;
> >> +	int port = rpcrdma_get_srcport(r_xprt);
> >> +	unsigned short last;
> >> +
> >> +	/*
> >> +	 * If we are asking for any ephemeral port (i.e. port == 0 &&
> >> +	 * r_xprt->rx_xprt.resvport == 0), don't bind.  Let the local
> >> +	 * port selection happen implicitly when the socket is used
> >> +	 * (for example at connect time).
> >> +	 *
> >> +	 * This ensures that we can continue to establish TCP
> >> +	 * connections even when all local ephemeral ports are already
> >> +	 * a part of some TCP connection.  This makes no difference
> >> +	 * for UDP sockets, but also doesn't harm them.
> >> +	 *
> >> +	 * If we're asking for any reserved port (i.e. port == 0 &&
> >> +	 * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport above will
> >> +	 * ensure that port is non-zero and we will bind as needed.
> >> +	 */
> >> +	if (port <= 0)
> >> +		return port;
> >> +
> >> +	memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen);
> >> +	do {
> >> +		rpc_set_port((struct sockaddr *)&myaddr, port);
> >> +		err = rdma_bind_addr(id, (struct sockaddr *)&myaddr);
> >> +		if (err == 0) {
> >> +			if (r_xprt->rx_xprt.reuseport)
> >> +				r_xprt->rx_srcport = port;
> >> +			break;
> >> +		}
> >> +		last = port;
> >> +		port = rpcrdma_next_srcport(r_xprt, port);
> >> +		if (port > last)
> >> +			nloop++;
> >> +	} while (err == -EADDRINUSE && nloop != 2);
> >> +
> >> +	return err;
> >> +}
> >> +
> >>  static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
> >>  					    struct rpcrdma_ep *ep)
> >>  {
> >> @@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
> >>  	if (IS_ERR(id))
> >>  		return id;
> >>  
> >> +	rc = rpcrdma_bind(r_xprt, id);
> >> +	if (rc) {
> >> +		rc = -ENOTCONN;
> >> +		goto out;
> >> +	}
> >> +
> >>  	ep->re_async_rc = -ETIMEDOUT;
> >>  	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
> >>  			       RDMA_RESOLVE_TIMEOUT);
> >> @@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
> >>  	if (rc)
> >>  		goto out;
> >>  
> >> +	rpcrdma_set_srcport(r_xprt, id);
> >> +
> >>  	return id;
> >>  
> >>  out:
> >> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> >> index 8147d2b41494..9c7bcb541267 100644
> >> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> >> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> >> @@ -433,6 +433,9 @@ struct rpcrdma_xprt {
> >>  	struct delayed_work	rx_connect_worker;
> >>  	struct rpc_timeout	rx_timeout;
> >>  	struct rpcrdma_stats	rx_stats;
> >> +
> >> +	struct sockaddr_storage	rx_srcaddr;
> >> +	unsigned short		rx_srcport;
> >>  };
> >>  
> >>  #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
> >> @@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
> >>   */
> >>  extern unsigned int xprt_rdma_max_inline_read;
> >>  extern unsigned int xprt_rdma_max_inline_write;
> >> +extern unsigned int xprt_rdma_min_resvport;
> >> +extern unsigned int xprt_rdma_max_resvport;
> >>  void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
> >>  void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
> >>  void xprt_rdma_close(struct rpc_xprt *xprt);
> >> -- 
> >> 2.47.0
> >>
> > 
> 

-- 
Chuck Lever




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux