[PATCH] IB/Core: Changes to the IB Core infrastructure for RoCEv2 support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



 Changes include:
 > Introduce a new 'rdma_network_type' enum in the IB Core
 > Add a new 'network_hdr_type' field to data structures such as the
 'ib_global_route', 'ib_wc' and 'ib_sa_path_rec' structs
 > Determine whether an IPvX address is routable and set hop_limit in the
 IP header accordingly
 > Repurpose rdma_addr_find_dmac_by_grh to also return the network header
 type

Signed-off-by: Somnath Kotur <somnath.kotur@xxxxxxxxxx>
Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@xxxxxxxxxx>
Signed-off-by: Devesh Sharma <devesh.sharma@xxxxxxxxxx>
---
PS: UDP header insertion has not been added yet. This patch lays the basic
foundation for IP routability. Filling in the UDP source port/header should be
relatively trivial and will follow this patch, depending on how the design
finally shapes up after review.

 drivers/infiniband/core/addr.c     |   13 +++++++++-
 drivers/infiniband/core/cm.c       |   15 +++++++----
 drivers/infiniband/core/cma.c      |    4 ++-
 drivers/infiniband/core/sa_query.c |    1 +
 drivers/infiniband/core/verbs.c    |   47 ++++++++++++++++++++++++++++++-----
 include/rdma/ib_addr.h             |    3 +-
 include/rdma/ib_sa.h               |    1 +
 include/rdma/ib_verbs.h            |   15 +++++++++++
 8 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 8172d37..77262e6 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -257,6 +257,9 @@ static int addr4_resolve(struct sockaddr_in *src_in,
 		goto put;
 	}
 
+	if (rt->rt_uses_gateway)
+		addr->network = RDMA_NETWORK_IPv4;
+
 	ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
 put:
 	ip_rt_put(rt);
@@ -271,6 +274,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 {
 	struct flowi6 fl6;
 	struct dst_entry *dst;
+	struct rt6_info *rt;
 	int ret;
 
 	memset(&fl6, 0, sizeof fl6);
@@ -282,6 +286,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 	if ((ret = dst->error))
 		goto put;
 
+	rt = (struct rt6_info *)dst;
 	if (ipv6_addr_any(&fl6.saddr)) {
 		ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
 					 &fl6.daddr, 0, &fl6.saddr);
@@ -305,6 +310,9 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 		goto put;
 	}
 
+	if (rt->rt6i_flags & RTF_GATEWAY)
+		addr->network = RDMA_NETWORK_IPv6;
+
 	ret = dst_fetch_ha(dst, addr, &fl6.daddr);
 put:
 	dst_release(dst);
@@ -458,7 +466,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
 }
 
 int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
-			       u16 *vlan_id)
+			       u16 *vlan_id, u8 *network_hdr_type)
 {
 	int ret = 0;
 	struct rdma_dev_addr dev_addr;
@@ -497,6 +505,9 @@ int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
 		return -ENODEV;
 	if (vlan_id)
 		*vlan_id = rdma_vlan_dev_vlan_id(dev);
+	if (network_hdr_type)
+		*network_hdr_type = dev_addr.network;
+
 	dev_put(dev);
 	return ret;
 }
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index e28a494..01986e8 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1233,8 +1233,8 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
 }
 
 static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
-					    struct ib_sa_path_rec *primary_path,
-					    struct ib_sa_path_rec *alt_path)
+				     struct ib_sa_path_rec *primary_path,
+				     struct ib_sa_path_rec *alt_path)
 {
 	memset(primary_path, 0, sizeof *primary_path);
 	primary_path->dgid = req_msg->primary_local_gid;
@@ -1520,9 +1520,10 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
 
 static int cm_req_handler(struct cm_work *work)
 {
-	struct ib_cm_id *cm_id;
 	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
 	struct cm_req_msg *req_msg;
+	struct ib_wc *wc;
+	struct ib_cm_id *cm_id;
 	int ret;
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
@@ -1531,10 +1532,10 @@ static int cm_req_handler(struct cm_work *work)
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 
+	wc = work->mad_recv_wc->wc;
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	cm_id_priv->id.remote_id = req_msg->local_comm_id;
-	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
-				work->mad_recv_wc->recv_buf.grh,
+	cm_init_av_for_response(work->port, wc, work->mad_recv_wc->recv_buf.grh,
 				&cm_id_priv->av);
 	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
 							    id.local_id);
@@ -1558,11 +1559,13 @@ static int cm_req_handler(struct cm_work *work)
 	cm_id_priv->id.service_id = req_msg->service_id;
 	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
 
-	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+	cm_process_routed_req(req_msg, wc);
 	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 
 	memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
 	work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+	work->path[0].network_hdr_type = wc->network_hdr_type;
+
 	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
 	if (ret) {
 		ib_get_cached_gid(work->port->cm_dev->ib_device,
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index d570030..285794a 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1924,7 +1924,9 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
 		    &route->path_rec->dgid);
 
-	route->path_rec->hop_limit = 1;
+	route->path_rec->network_hdr_type = addr->dev_addr.network;
+	if (route->path_rec->network_hdr_type != RDMA_NETWORK_IB)
+		route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT;
 	route->path_rec->reversible = 1;
 	route->path_rec->pkey = cpu_to_be16(0xffff);
 	route->path_rec->mtu_selector = IB_SA_EQ;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index c38f030..2377577 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -555,6 +555,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 		ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
 		ah_attr->grh.hop_limit     = rec->hop_limit;
 		ah_attr->grh.traffic_class = rec->traffic_class;
+		ah_attr->grh.network_hdr_type = rec->network_hdr_type;
 	}
 	if (force_grh) {
 		memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c2b89cc..68d7d23 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -195,9 +195,14 @@ EXPORT_SYMBOL(ib_create_ah);
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
 {
-	u32 flow_class;
-	u16 gid_index;
 	int ret;
+	u16 gid_index;
+	u32 flow_class;
+	u8 *network_hdr_type;
+	struct sockaddr_in  src_in;
+	struct sockaddr_in  dst_in;
+	union rdma_network_hdr *l3grh;
+	union ib_gid *sgid, *dgid, ipv4_sgid, ipv4_dgid;
 	int is_eth = (rdma_port_get_link_layer(device, port_num) ==
 			IB_LINK_LAYER_ETHERNET);
 
@@ -206,18 +211,42 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		if (!(wc->wc_flags & IB_WC_GRH))
 			return -EPROTOTYPE;
 
+		sgid = &grh->sgid;
+		dgid = &grh->dgid;
+
 		if (wc->wc_flags & IB_WC_WITH_SMAC &&
 		    wc->wc_flags & IB_WC_WITH_VLAN) {
 			memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
 			ah_attr->vlan_id = wc->vlan_id;
 		} else {
-			ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
-					ah_attr->dmac, &ah_attr->vlan_id);
+			ah_attr->grh.hop_limit = grh->hop_limit;
+			if (wc->wc_flags & IB_WC_WITH_L3_TYPE &&
+			    wc->network_hdr_type == RDMA_NETWORK_IPv4) {
+				l3grh = (union rdma_network_hdr *)grh;
+				memcpy(&src_in.sin_addr.s_addr,
+				       &l3grh->roce4grh.saddr, 4);
+				memcpy(&dst_in.sin_addr.s_addr,
+				       &l3grh->roce4grh.daddr, 4);
+				ipv6_addr_set_v4mapped(src_in.sin_addr.s_addr,
+						       (struct in6_addr *)
+						       &ipv4_sgid);
+				ipv6_addr_set_v4mapped(dst_in.sin_addr.s_addr,
+						       (struct in6_addr *)
+						       &ipv4_dgid);
+				dgid = &ipv4_dgid;
+				sgid = &ipv4_sgid;
+				ah_attr->grh.hop_limit = l3grh->roce4grh.ttl;
+			}
+			network_hdr_type = &ah_attr->grh.network_hdr_type;
+			ret = rdma_addr_find_dmac_by_grh(dgid, sgid,
+							 ah_attr->dmac,
+							 &ah_attr->vlan_id,
+							 network_hdr_type);
 			if (ret)
 				return ret;
 		}
 	} else {
-		ah_attr->vlan_id = 0xffff;
+			ah_attr->vlan_id = 0xffff;
 	}
 
 	ah_attr->dlid = wc->slid;
@@ -237,7 +266,6 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		ah_attr->grh.sgid_index = (u8) gid_index;
 		flow_class = be32_to_cpu(grh->version_tclass_flow);
 		ah_attr->grh.flow_label = flow_class & 0xFFFFF;
-		ah_attr->grh.hop_limit = 0xFF;
 		ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
 	}
 	return 0;
@@ -869,6 +897,7 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 {
 	int           ret = 0;
 	union ib_gid  sgid;
+	u8 *network_hdr_type;
 
 	if ((*qp_attr_mask & IB_QP_AV)  &&
 	    (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
@@ -881,8 +910,12 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 			rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
 			qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
 		} else {
+			network_hdr_type =
+					&qp_attr->ah_attr.grh.network_hdr_type;
 			ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
-					qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
+							 qp_attr->ah_attr.dmac,
+							 &qp_attr->vlan_id,
+							 network_hdr_type);
 			if (ret)
 				goto out;
 			ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index ce55906..242c0a9 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -71,6 +71,7 @@ struct rdma_dev_addr {
 	unsigned short dev_type;
 	int bound_dev_if;
 	enum rdma_transport_type transport;
+	enum rdma_network_type network;
 };
 
 /**
@@ -112,7 +113,7 @@ int rdma_addr_size(struct sockaddr *addr);
 
 int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
 int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac,
-			       u16 *vlan_id);
+			       u16 *vlan_id, u8 *network_hdr_type);
 
 static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
 {
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 7e071a6..3946a67 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -157,6 +157,7 @@ struct ib_sa_path_rec {
 	u8           smac[ETH_ALEN];
 	u8           dmac[ETH_ALEN];
 	u16	     vlan_id;
+	u8	     network_hdr_type;
 };
 
 #define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 470a011..3f5dc91 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
 
 #include <linux/atomic.h>
 #include <asm/uaccess.h>
@@ -83,6 +84,12 @@ enum rdma_transport_type {
 __attribute_const__ enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type);
 
+enum rdma_network_type {
+	RDMA_NETWORK_IB,
+	RDMA_NETWORK_IPv4,
+	RDMA_NETWORK_IPv6
+};
+
 enum rdma_link_layer {
 	IB_LINK_LAYER_UNSPECIFIED,
 	IB_LINK_LAYER_INFINIBAND,
@@ -418,6 +425,7 @@ struct ib_global_route {
 	u8		sgid_index;
 	u8		hop_limit;
 	u8		traffic_class;
+	u8		network_hdr_type;
 };
 
 struct ib_grh {
@@ -429,6 +437,11 @@ struct ib_grh {
 	union ib_gid	dgid;
 };
 
+union rdma_network_hdr {
+	struct ib_grh ibgrh;
+	struct iphdr roce4grh;
+};
+
 enum {
 	IB_MULTICAST_QPN = 0xffffff
 };
@@ -666,6 +679,7 @@ enum ib_wc_flags {
 	IB_WC_IP_CSUM_OK	= (1<<3),
 	IB_WC_WITH_SMAC		= (1<<4),
 	IB_WC_WITH_VLAN		= (1<<5),
+	IB_WC_WITH_L3_TYPE	= (1<<6)
 };
 
 struct ib_wc {
@@ -688,6 +702,7 @@ struct ib_wc {
 	u8			port_num;	/* valid only for DR SMPs on switches */
 	u8			smac[ETH_ALEN];
 	u16			vlan_id;
+	u8			network_hdr_type;
 };
 
 enum ib_cq_notify_flags {
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux