Re: [RFC PATCH 4/5] SUNRPC: support resvport and reuseport for rpcrdma

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Oct 16, 2024 at 07:48:25PM +0800, Kinglong Mee wrote:
> NFSd's DRC key contains peer port, but rpcrdma does't support port resue now.
> This patch supports resvport and resueport as tcp/udp for rpcrdma.

An RDMA consumer is not in control of the "source port" in an RDMA
connection, thus the port number is meaningless. This is why the
Linux NFS client does not already support setting the source port on
RDMA mounts, and why NFSD sets the source port value to zero on
incoming connections; the DRC then always sees a zero port value in
its lookup key tuple.

See net/sunrpc/xprtrdma/svc_rdma_transport.c :: handle_connect_req() :

259         /* The remote port is arbitrary and not under the control of the
260          * client ULP. Set it to a fixed value so that the DRC continues
261          * to be effective after a reconnect.
262          */
263         rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0);


As a general comment, your patch descriptions need to explain /why/
each change is being made. For example, the initial patches in this
series, although they properly split the changes into clean easily
digestible hunks, are somewhat baffling until the reader gets to
this one. This patch jumps right to announcing a solution.

There's no cover letter tying these changes together with a problem
statement. What problematic behavior did you see that motivated this
approach?


> Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx>
> ---
>  net/sunrpc/xprtrdma/transport.c |  36 ++++++++++++
>  net/sunrpc/xprtrdma/verbs.c     | 100 ++++++++++++++++++++++++++++++++
>  net/sunrpc/xprtrdma/xprt_rdma.h |   5 ++
>  3 files changed, 141 insertions(+)
> 
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index 9a8ce5df83ca..fee3b562932b 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
>  unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
>  int xprt_rdma_pad_optimize;
>  static struct xprt_class xprt_rdma;
> +static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT;
> +static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT;
> +unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT;
> +unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT;
>  
>  #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
>  
> @@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec,
>  	},
> +	{
> +		.procname	= "rdma_min_resvport",
> +		.data		= &xprt_rdma_min_resvport,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &xprt_rdma_min_resvport_limit,
> +		.extra2		= &xprt_rdma_max_resvport_limit
> +	},
> +	{
> +		.procname	= "rdma_max_resvport",
> +		.data		= &xprt_rdma_max_resvport,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &xprt_rdma_min_resvport_limit,
> +		.extra2		= &xprt_rdma_max_resvport_limit
> +	},
>  };
>  
>  #endif
> @@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args)
>  	xprt_rdma_format_addresses(xprt, sap);
>  
>  	new_xprt = rpcx_to_rdmax(xprt);
> +
> +	if (args->srcaddr)
> +		memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen);
> +	else {
> +		rc = rpc_init_anyaddr(args->dstaddr->sa_family,
> +					(struct sockaddr *)&new_xprt->rx_srcaddr);
> +		if (rc != 0) {
> +			xprt_rdma_free_addresses(xprt);
> +			xprt_free(xprt);
> +			module_put(THIS_MODULE);
> +			return ERR_PTR(rc);
> +		}
> +	}
> +
>  	rc = rpcrdma_buffer_create(new_xprt);
>  	if (rc) {
>  		xprt_rdma_free_addresses(xprt);
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 63262ef0c2e3..0ce5123d799b 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
>  	xprt_force_disconnect(ep->re_xprt);
>  }
>  
> +static int rpcrdma_get_random_port(void)
> +{
> +	unsigned short min = xprt_rdma_min_resvport, max = xprt_rdma_max_resvport;
> +	unsigned short range;
> +	unsigned short rand;
> +
> +	if (max < min)
> +		return -EADDRINUSE;
> +	range = max - min + 1;
> +	rand = get_random_u32_below(range);
> +	return rand + min;
> +}
> +
> +static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
> +{
> +        struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr;
> +
> +	if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport) {
> +		switch (sap->sa_family) {
> +		case AF_INET6:
> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
> +			break;
> +		case AF_INET:
> +			r_xprt->rx_srcport = ntohs(((struct sockaddr_in *)sap)->sin_port);
> +			break;
> +		}
> +	}
> +}
> +
> +static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt)
> +{
> +	int port = r_xprt->rx_srcport;
> +
> +	if (port == 0 && r_xprt->rx_xprt.resvport)
> +		port = rpcrdma_get_random_port();
> +	return port;
> +}
> +
> +static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port)
> +{
> +	if (r_xprt->rx_srcport != 0)
> +		r_xprt->rx_srcport = 0;
> +	if (!r_xprt->rx_xprt.resvport)
> +		return 0;
> +	if (port <= xprt_rdma_min_resvport || port > xprt_rdma_max_resvport)
> +		return xprt_rdma_max_resvport;
> +	return --port;
> +}
> +
> +static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
> +{
> +	struct sockaddr_storage myaddr;
> +	int err, nloop = 0;
> +	int port = rpcrdma_get_srcport(r_xprt);
> +	unsigned short last;
> +
> +	/*
> +	 * If we are asking for any ephemeral port (i.e. port == 0 &&
> +	 * r_xprt->rx_xprt.resvport == 0), don't bind.  Let the local
> +	 * port selection happen implicitly when the socket is used
> +	 * (for example at connect time).
> +	 *
> +	 * This ensures that we can continue to establish TCP
> +	 * connections even when all local ephemeral ports are already
> +	 * a part of some TCP connection.  This makes no difference
> +	 * for UDP sockets, but also doesn't harm them.
> +	 *
> +	 * If we're asking for any reserved port (i.e. port == 0 &&
> +	 * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport above will
> +	 * ensure that port is non-zero and we will bind as needed.
> +	 */
> +	if (port <= 0)
> +		return port;
> +
> +	memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen);
> +	do {
> +		rpc_set_port((struct sockaddr *)&myaddr, port);
> +		err = rdma_bind_addr(id, (struct sockaddr *)&myaddr);
> +		if (err == 0) {
> +			if (r_xprt->rx_xprt.reuseport)
> +				r_xprt->rx_srcport = port;
> +			break;
> +		}
> +		last = port;
> +		port = rpcrdma_next_srcport(r_xprt, port);
> +		if (port > last)
> +			nloop++;
> +	} while (err == -EADDRINUSE && nloop != 2);
> +
> +	return err;
> +}
> +
>  static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>  					    struct rpcrdma_ep *ep)
>  {
> @@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>  	if (IS_ERR(id))
>  		return id;
>  
> +	rc = rpcrdma_bind(r_xprt, id);
> +	if (rc) {
> +		rc = -ENOTCONN;
> +		goto out;
> +	}
> +
>  	ep->re_async_rc = -ETIMEDOUT;
>  	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
>  			       RDMA_RESOLVE_TIMEOUT);
> @@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>  	if (rc)
>  		goto out;
>  
> +	rpcrdma_set_srcport(r_xprt, id);
> +
>  	return id;
>  
>  out:
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 8147d2b41494..9c7bcb541267 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -433,6 +433,9 @@ struct rpcrdma_xprt {
>  	struct delayed_work	rx_connect_worker;
>  	struct rpc_timeout	rx_timeout;
>  	struct rpcrdma_stats	rx_stats;
> +
> +	struct sockaddr_storage	rx_srcaddr;
> +	unsigned short		rx_srcport;
>  };
>  
>  #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
> @@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
>   */
>  extern unsigned int xprt_rdma_max_inline_read;
>  extern unsigned int xprt_rdma_max_inline_write;
> +extern unsigned int xprt_rdma_min_resvport;
> +extern unsigned int xprt_rdma_max_resvport;
>  void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
>  void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
>  void xprt_rdma_close(struct rpc_xprt *xprt);
> -- 
> 2.47.0
> 

-- 
Chuck Lever




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux