Re: [RFC PATCH 4/5] SUNRPC: support resvport and reuseport for rpcrdma

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Chuck,

On 2024/10/16 9:57 PM, Chuck Lever wrote:
> On Wed, Oct 16, 2024 at 07:48:25PM +0800, Kinglong Mee wrote:
>> NFSd's DRC key contains peer port, but rpcrdma does't support port resue now.
>> This patch supports resvport and resueport as tcp/udp for rpcrdma.
> 
> An RDMA consumer is not in control of the "source port" in an RDMA
> connection, thus the port number is meaningless. This is why the
> Linux NFS client does not already support setting the source port on
> RDMA mounts, and why NFSD sets the source port value to zero on
> incoming connections; the DRC then always sees a zero port value in
> its lookup key tuple.
> 
> See net/sunrpc/xprtrdma/svc_rdma_transport.c :: handle_connect_req() :
> 
> 259         /* The remote port is arbitrary and not under the control of the
> 260          * client ULP. Set it to a fixed value so that the DRC continues
> 261          * to be effective after a reconnect.
> 262          */
> 263         rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0);
> 
> 
> As a general comment, your patch descriptions need to explain /why/
> each change is being made. For example, the initial patches in this
> series, although they properly split the changes into clean easily
> digestible hunks, are somewhat baffling until the reader gets to
> this one. This patch jumps right to announcing a solution.

Thanks for your comment.

> 
> There's no cover letter tying these changes together with a problem
> statement. What problematic behavior did you see that motivated this
> approach?

We have a private nfs server, it's DRC checks the src port, but rpcrdma doesnot
support resueport now, so we try to add it as tcp/udp.
Maybe someone needs the src port at rpcrdma connect, I made those patches. 

For the knfsd and nfs client, I don't meet any problem.

Thanks,
Kinglong Mee
> 
> 
>> Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx>
>> ---
>>  net/sunrpc/xprtrdma/transport.c |  36 ++++++++++++
>>  net/sunrpc/xprtrdma/verbs.c     | 100 ++++++++++++++++++++++++++++++++
>>  net/sunrpc/xprtrdma/xprt_rdma.h |   5 ++
>>  3 files changed, 141 insertions(+)
>>
>> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
>> index 9a8ce5df83ca..fee3b562932b 100644
>> --- a/net/sunrpc/xprtrdma/transport.c
>> +++ b/net/sunrpc/xprtrdma/transport.c
>> @@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
>>  unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
>>  int xprt_rdma_pad_optimize;
>>  static struct xprt_class xprt_rdma;
>> +static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT;
>> +static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT;
>> +unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT;
>> +unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT;
>>  
>>  #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
>>  
>> @@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = {
>>  		.mode		= 0644,
>>  		.proc_handler	= proc_dointvec,
>>  	},
>> +	{
>> +		.procname	= "rdma_min_resvport",
>> +		.data		= &xprt_rdma_min_resvport,
>> +		.maxlen		= sizeof(unsigned int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &xprt_rdma_min_resvport_limit,
>> +		.extra2		= &xprt_rdma_max_resvport_limit
>> +	},
>> +	{
>> +		.procname	= "rdma_max_resvport",
>> +		.data		= &xprt_rdma_max_resvport,
>> +		.maxlen		= sizeof(unsigned int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &xprt_rdma_min_resvport_limit,
>> +		.extra2		= &xprt_rdma_max_resvport_limit
>> +	},
>>  };
>>  
>>  #endif
>> @@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args)
>>  	xprt_rdma_format_addresses(xprt, sap);
>>  
>>  	new_xprt = rpcx_to_rdmax(xprt);
>> +
>> +	if (args->srcaddr)
>> +		memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen);
>> +	else {
>> +		rc = rpc_init_anyaddr(args->dstaddr->sa_family,
>> +					(struct sockaddr *)&new_xprt->rx_srcaddr);
>> +		if (rc != 0) {
>> +			xprt_rdma_free_addresses(xprt);
>> +			xprt_free(xprt);
>> +			module_put(THIS_MODULE);
>> +			return ERR_PTR(rc);
>> +		}
>> +	}
>> +
>>  	rc = rpcrdma_buffer_create(new_xprt);
>>  	if (rc) {
>>  		xprt_rdma_free_addresses(xprt);
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index 63262ef0c2e3..0ce5123d799b 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
>>  	xprt_force_disconnect(ep->re_xprt);
>>  }
>>  
>> +static int rpcrdma_get_random_port(void)
>> +{
>> +	unsigned short min = xprt_rdma_min_resvport, max = xprt_rdma_max_resvport;
>> +	unsigned short range;
>> +	unsigned short rand;
>> +
>> +	if (max < min)
>> +		return -EADDRINUSE;
>> +	range = max - min + 1;
>> +	rand = get_random_u32_below(range);
>> +	return rand + min;
>> +}
>> +
>> +static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
>> +{
>> +        struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr;
>> +
>> +	if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport) {
>> +		switch (sap->sa_family) {
>> +		case AF_INET6:
>> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
>> +			break;
>> +		case AF_INET:
>> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in *)sap)->sin_port);
>> +			break;
>> +		}
>> +	}
>> +}
>> +
>> +static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt)
>> +{
>> +	int port = r_xprt->rx_srcport;
>> +
>> +	if (port == 0 && r_xprt->rx_xprt.resvport)
>> +		port = rpcrdma_get_random_port();
>> +	return port;
>> +}
>> +
>> +static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port)
>> +{
>> +	if (r_xprt->rx_srcport != 0)
>> +		r_xprt->rx_srcport = 0;
>> +	if (!r_xprt->rx_xprt.resvport)
>> +		return 0;
>> +	if (port <= xprt_rdma_min_resvport || port > xprt_rdma_max_resvport)
>> +		return xprt_rdma_max_resvport;
>> +	return --port;
>> +}
>> +
>> +static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
>> +{
>> +	struct sockaddr_storage myaddr;
>> +	int err, nloop = 0;
>> +	int port = rpcrdma_get_srcport(r_xprt);
>> +	unsigned short last;
>> +
>> +	/*
>> +	 * If we are asking for any ephemeral port (i.e. port == 0 &&
>> +	 * r_xprt->rx_xprt.resvport == 0), don't bind.  Let the local
>> +	 * port selection happen implicitly when the socket is used
>> +	 * (for example at connect time).
>> +	 *
>> +	 * This ensures that we can continue to establish TCP
>> +	 * connections even when all local ephemeral ports are already
>> +	 * a part of some TCP connection.  This makes no difference
>> +	 * for UDP sockets, but also doesn't harm them.
>> +	 *
>> +	 * If we're asking for any reserved port (i.e. port == 0 &&
>> +	 * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport above will
>> +	 * ensure that port is non-zero and we will bind as needed.
>> +	 */
>> +	if (port <= 0)
>> +		return port;
>> +
>> +	memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen);
>> +	do {
>> +		rpc_set_port((struct sockaddr *)&myaddr, port);
>> +		err = rdma_bind_addr(id, (struct sockaddr *)&myaddr);
>> +		if (err == 0) {
>> +			if (r_xprt->rx_xprt.reuseport)
>> +				r_xprt->rx_srcport = port;
>> +			break;
>> +		}
>> +		last = port;
>> +		port = rpcrdma_next_srcport(r_xprt, port);
>> +		if (port > last)
>> +			nloop++;
>> +	} while (err == -EADDRINUSE && nloop != 2);
>> +
>> +	return err;
>> +}
>> +
>>  static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>>  					    struct rpcrdma_ep *ep)
>>  {
>> @@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>>  	if (IS_ERR(id))
>>  		return id;
>>  
>> +	rc = rpcrdma_bind(r_xprt, id);
>> +	if (rc) {
>> +		rc = -ENOTCONN;
>> +		goto out;
>> +	}
>> +
>>  	ep->re_async_rc = -ETIMEDOUT;
>>  	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
>>  			       RDMA_RESOLVE_TIMEOUT);
>> @@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>>  	if (rc)
>>  		goto out;
>>  
>> +	rpcrdma_set_srcport(r_xprt, id);
>> +
>>  	return id;
>>  
>>  out:
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index 8147d2b41494..9c7bcb541267 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -433,6 +433,9 @@ struct rpcrdma_xprt {
>>  	struct delayed_work	rx_connect_worker;
>>  	struct rpc_timeout	rx_timeout;
>>  	struct rpcrdma_stats	rx_stats;
>> +
>> +	struct sockaddr_storage	rx_srcaddr;
>> +	unsigned short		rx_srcport;
>>  };
>>  
>>  #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
>> @@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
>>   */
>>  extern unsigned int xprt_rdma_max_inline_read;
>>  extern unsigned int xprt_rdma_max_inline_write;
>> +extern unsigned int xprt_rdma_min_resvport;
>> +extern unsigned int xprt_rdma_max_resvport;
>>  void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
>>  void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
>>  void xprt_rdma_close(struct rpc_xprt *xprt);
>> -- 
>> 2.47.0
>>
> 





[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux