On 4/22/2020 3:50 PM, Jason Gunthorpe wrote:
On Wed, Apr 22, 2020 at 11:39:46AM +0300, Maor Gottlieb wrote:
Add support to get the RoCE LAG xmit slave by building skb
of the RoCE packet and call to master_get_xmit_slave.
If driver wants to get the slave assume all slaves are available,
then need to set RDMA_LAG_FLAGS_HASH_ALL_SLAVES in flags.
Signed-off-by: Maor Gottlieb <maorg@xxxxxxxxxxxx>
Reviewed-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/lag.c | 138 +++++++++++++++++++++++++++++++
include/rdma/ib_verbs.h | 2 +
include/rdma/lag.h | 22 +++++
4 files changed, 163 insertions(+), 1 deletion(-)
create mode 100644 drivers/infiniband/core/lag.c
create mode 100644 include/rdma/lag.h
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d1b14887960e..870f0fcd54d5 100644
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
multicast.o mad.o smi.o agent.o mad_rmpp.o \
nldev.o restrack.o counters.o ib_core_uverbs.o \
- trace.o
+ trace.o lag.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c
new file mode 100644
index 000000000000..3036fb3dc43a
+++ b/drivers/infiniband/core/lag.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/lag.h>
+
+static struct sk_buff *rdma_build_skb(struct ib_device *device,
+ struct net_device *netdev,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct ipv6hdr *ip6h;
+ struct sk_buff *skb;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ struct udphdr *uh;
+ u8 smac[ETH_ALEN];
+ bool is_ipv4;
+ int hdr_len;
+
+ is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw);
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev);
+ hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr);
+
+ skb = alloc_skb(hdr_len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb->dev = netdev;
+ skb_reserve(skb, hdr_len);
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+ uh->source = htons(0xC000);
+ uh->dest = htons(ROCE_V2_UDP_DPORT);
+ uh->len = htons(sizeof(struct udphdr));
+
+ if (is_ipv4) {
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->frag_off = 0;
+ iph->version = 4;
+ iph->protocol = IPPROTO_UDP;
+ iph->ihl = 0x5;
+ iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct
+ iphdr));
+ memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12,
+ sizeof(struct in_addr));
+ memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12,
+ sizeof(struct in_addr));
+ } else {
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_UDP;
+ memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label,
+ sizeof(*ip6h->flow_lbl));
+ memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw,
+ sizeof(struct in6_addr));
+ memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw,
+ sizeof(struct in6_addr));
+ }
What about setting up the UDP header? It looks like this needs to be
before the sport patch and the sport patch needs to modify here too.
Yeah, we will need to set the udp source port by calling
rdma_flow_label_to_udp_sport (Introduced in the UDP source port series).
+void rdma_lag_put_ah_roce_slave(struct rdma_ah_attr *ah_attr)
+{
+ if (ah_attr->roce.xmit_slave)
+ dev_put(ah_attr->roce.xmit_slave);
+}
+
+int rdma_lag_get_ah_roce_slave(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct net_device *master;
+ struct net_device *slave;
+
+ if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE &&
+ ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP))
+ return 0;
+
+ rcu_read_lock();
+ master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
+ if (IS_ERR(master)) {
+ rcu_read_unlock();
+ return PTR_ERR(master);
+ }
+ dev_hold(master);
+ rcu_read_unlock();
+
+ if (!netif_is_bond_master(master)) {
+ dev_put(master);
+ return 0;
+ }
+
+ slave = rdma_get_xmit_slave_udp(device, master, ah_attr);
+
+ dev_put(master);
+ if (!slave) {
+ ibdev_warn(device, "Failed to get lag xmit slave\n");
+ return -EINVAL;
+ }
+
+ ah_attr->roce.xmit_slave = slave;
Is xmit_slave is reliably NULL in the other return 0 cases?
It's hard to follow. Maybe it will be better to have this initialization
anyway.
Jason