Signed-off-by: Kinglong Mee <kinglongmee@xxxxxxxxx>
---
net/sunrpc/xprtrdma/transport.c | 36 ++++++++++++
net/sunrpc/xprtrdma/verbs.c | 100 ++++++++++++++++++++++++++++++++
net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++
3 files changed, 141 insertions(+)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9a8ce5df83ca..fee3b562932b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -70,6 +70,10 @@ unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
int xprt_rdma_pad_optimize;
static struct xprt_class xprt_rdma;
+static unsigned int xprt_rdma_min_resvport_limit = RPC_MIN_RESVPORT; /* floor for resvport sysctls */
+static unsigned int xprt_rdma_max_resvport_limit = RPC_MAX_RESVPORT; /* ceiling for resvport sysctls */
+unsigned int xprt_rdma_min_resvport = RPC_DEF_MIN_RESVPORT; /* rdma_min_resvport sysctl */
+unsigned int xprt_rdma_max_resvport = RPC_DEF_MAX_RESVPORT; /* rdma_max_resvport sysctl */
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -137,6 +141,24 @@ static struct ctl_table xr_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "rdma_min_resvport",
+ .data = &xprt_rdma_min_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xprt_rdma_min_resvport_limit,
+ .extra2 = &xprt_rdma_max_resvport_limit
+ },
+ {
+ .procname = "rdma_max_resvport",
+ .data = &xprt_rdma_max_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xprt_rdma_min_resvport_limit,
+ .extra2 = &xprt_rdma_max_resvport_limit
+ },
};
#endif
@@ -346,6 +368,20 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
+
+ if (args->srcaddr)
+ memcpy(&new_xprt->rx_srcaddr, args->srcaddr, args->addrlen);
+ else {
+ rc = rpc_init_anyaddr(args->dstaddr->sa_family,
+ (struct sockaddr *)&new_xprt->rx_srcaddr);
+ if (rc != 0) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
+ }
+
rc = rpcrdma_buffer_create(new_xprt);
if (rc) {
xprt_rdma_free_addresses(xprt);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 63262ef0c2e3..0ce5123d799b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -285,6 +285,98 @@ static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
xprt_force_disconnect(ep->re_xprt);
}
+/*
+ * Choose a random source port between xprt_rdma_min_resvport and
+ * xprt_rdma_max_resvport, both inclusive.
+ *
+ * Returns the chosen port, or -EADDRINUSE when the configured range is
+ * empty (max below min).
+ */
+static int rpcrdma_get_random_port(void)
+{
+	unsigned short lo = xprt_rdma_min_resvport;
+	unsigned short hi = xprt_rdma_max_resvport;
+	unsigned short span;
+	unsigned short offset;
+
+	if (hi < lo)
+		return -EADDRINUSE;
+
+	span = hi - lo + 1;
+	offset = get_random_u32_below(span);
+
+	return lo + offset;
+}
+
+/*
+ * Record the port found in @id's source address as this transport's
+ * source port, so that rpcrdma_get_srcport() hands the same port back
+ * on a later bind.
+ *
+ * Nothing is recorded unless the transport has reuseport set and no
+ * source port has been recorded yet.  rpc_get_port() yields 0 for
+ * address families other than AF_INET and AF_INET6, so rx_srcport
+ * simply stays 0 for those.
+ */
+static void rpcrdma_set_srcport(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
+{
+	struct sockaddr *sap = (struct sockaddr *)&id->route.addr.src_addr;
+
+	if (r_xprt->rx_srcport == 0 && r_xprt->rx_xprt.reuseport)
+		r_xprt->rx_srcport = rpc_get_port(sap);
+}
+
+/*
+ * Decide which source port the next bind should ask for.
+ *
+ * Returns the recorded rx_srcport when one is set.  Otherwise, when the
+ * transport wants a reserved port, returns whatever
+ * rpcrdma_get_random_port() produces (a port or a negative errno).
+ * Otherwise returns 0.
+ */
+static int rpcrdma_get_srcport(struct rpcrdma_xprt *r_xprt)
+{
+	if (r_xprt->rx_srcport != 0)
+		return r_xprt->rx_srcport;
+
+	if (!r_xprt->rx_xprt.resvport)
+		return 0;
+
+	return rpcrdma_get_random_port();
+}
+
+/*
+ * Pick the port to try after @port.
+ *
+ * Any recorded rx_srcport is cleared first.  A transport that does not
+ * want a reserved port gets 0.  Otherwise the search steps down by one
+ * while @port lies above xprt_rdma_min_resvport and no higher than
+ * xprt_rdma_max_resvport; from anywhere else it restarts at
+ * xprt_rdma_max_resvport.
+ */
+static unsigned short rpcrdma_next_srcport(struct rpcrdma_xprt *r_xprt, unsigned short port)
+{
+	if (r_xprt->rx_srcport)
+		r_xprt->rx_srcport = 0;
+
+	if (!r_xprt->rx_xprt.resvport)
+		return 0;
+
+	if (port > xprt_rdma_min_resvport && port <= xprt_rdma_max_resvport)
+		return port - 1;
+
+	return xprt_rdma_max_resvport;
+}
+
+/*
+ * Bind @id to the transport's source address on an explicit source port.
+ *
+ * Returns 0 when @id was bound or when no explicit bind is wanted, or a
+ * negative errno: the value from rpcrdma_get_srcport() when no port
+ * could be chosen, otherwise the last rdma_bind_addr() failure.
+ */
+static int rpcrdma_bind(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id)
+{
+	struct sockaddr_storage myaddr;
+	int err, nloop = 0;
+	int port = rpcrdma_get_srcport(r_xprt);
+	unsigned short last;
+
+	/*
+	 * If we are asking for any ephemeral port (i.e. port == 0 &&
+	 * r_xprt->rx_xprt.resvport == 0), don't bind and leave @id
+	 * unbound.  Presumably the RDMA CM then picks a source port
+	 * on its own when the address is resolved -- TODO confirm.
+	 *
+	 * If we're asking for any reserved port (i.e. port == 0 &&
+	 * r_xprt->rx_xprt.resvport == 1) rpcrdma_get_srcport() above
+	 * will ensure that port is non-zero and we will bind as needed.
+	 * A negative port is an error from rpcrdma_get_srcport() and is
+	 * passed straight back to the caller.
+	 */
+	if (port <= 0)
+		return port;
+
+	memcpy(&myaddr, &r_xprt->rx_srcaddr, r_xprt->rx_xprt.addrlen);
+	do {
+		rpc_set_port((struct sockaddr *)&myaddr, port);
+		err = rdma_bind_addr(id, (struct sockaddr *)&myaddr);
+		if (err == 0) {
+			/* Remember the port so a reconnect can reuse it. */
+			if (r_xprt->rx_xprt.reuseport)
+				r_xprt->rx_srcport = port;
+			break;
+		}
+		last = port;
+		port = rpcrdma_next_srcport(r_xprt, port);
+		/*
+		 * A candidate that did not move downwards means the search
+		 * wrapped back to the top of the range.  Use >= rather than
+		 * > so that a one-port range (min == max), where the "next"
+		 * port equals the one just tried, still counts as a wrap;
+		 * otherwise the loop would never terminate while that port
+		 * stays busy.
+		 */
+		if (port >= last)
+			nloop++;
+		/*
+		 * NOTE(review): only -EADDRINUSE continues the search;
+		 * confirm that is the errno rdma_bind_addr() reports for
+		 * every kind of port collision.
+		 */
+	} while (err == -EADDRINUSE && nloop != 2);
+
+	return err;
+}
+
static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_ep *ep)
{
@@ -300,6 +392,12 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(id))
return id;
+ rc = rpcrdma_bind(r_xprt, id);
+ if (rc) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+
ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
@@ -328,6 +426,8 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
if (rc)
goto out;
+ rpcrdma_set_srcport(r_xprt, id);
+
return id;
out:
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8147d2b41494..9c7bcb541267 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -433,6 +433,9 @@ struct rpcrdma_xprt {
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
struct rpcrdma_stats rx_stats;
+
+ struct sockaddr_storage rx_srcaddr;
+ unsigned short rx_srcport;
};
#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
@@ -581,6 +584,8 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
*/
extern unsigned int xprt_rdma_max_inline_read;
extern unsigned int xprt_rdma_max_inline_write;
+extern unsigned int xprt_rdma_min_resvport;
+extern unsigned int xprt_rdma_max_resvport;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
void xprt_rdma_close(struct rpc_xprt *xprt);
--
2.47.0