On Fri, Dec 8, 2023 at 3:07 AM Bob Pearson <rpearsonhpe@xxxxxxxxx> wrote: > > > > On 12/6/23 19:47, Rain River wrote: > > On Tue, Dec 5, 2023 at 6:30 PM Zhu Yanjun <yanjun.zhu@xxxxxxxxx> wrote: > >> > >> > >> 在 2023/12/5 13:55, Zhu Yanjun 写道: > >>> Add David S. Miller and David Ahern. > >>> > >>> They are the maintainers in netdev and very familiar with mcast. > >>> > >>> Zhu Yanjun > >>> > >>> 在 2023/12/5 8:26, Bob Pearson 写道: > >>>> Currently the rdma_rxe driver does not receive mcast packets at all. > >>>> > >>>> Add code to rxe_mcast_add() and rxe_mcast_del() to register/deregister > >>>> the IP mcast address. This is required for mcast traffic to reach the > >>>> rxe driver when coming from an external source. > >>>> > >>>> Fixes: 8700e3e7c485 ("Soft RoCE driver") > >>>> Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx> > >>>> --- > >>>> drivers/infiniband/sw/rxe/rxe_mcast.c | 119 +++++++++++++++++++++----- > >>>> drivers/infiniband/sw/rxe/rxe_net.c | 2 +- > >>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 + > >>>> drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + > >>>> 4 files changed, 102 insertions(+), 21 deletions(-) > >>>> > >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c > >>>> b/drivers/infiniband/sw/rxe/rxe_mcast.c > >>>> index 86cc2e18a7fd..54735d07cee5 100644 > >>>> --- a/drivers/infiniband/sw/rxe/rxe_mcast.c > >>>> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c > >>>> @@ -19,38 +19,116 @@ > >>>> * mcast packets in the rxe receive path. > >>>> */ > >>>> +#include <linux/igmp.h> > >>>> + > >>>> #include "rxe.h" > >>>> -/** > >>>> - * rxe_mcast_add - add multicast address to rxe device > >>>> - * @rxe: rxe device object > >>>> - * @mgid: multicast address as a gid > >>>> - * > >>>> - * Returns 0 on success else an error > >>>> - */ > >>>> -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) > >>>> +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) > >>>> { > >>>> + struct in6_addr *addr6 = (struct in6_addr *)mgid; > >>>> + struct sock *sk = recv_sockets.sk6->sk; > >>>> unsigned char ll_addr[ETH_ALEN]; > >>>> + int err; > >>>> + > >>>> + spin_lock_bh(&sk->sk_lock.slock); > >>>> + rtnl_lock(); > >>>> + err = ipv6_sock_mc_join(sk, rxe->ndev->ifindex, addr6); > >> > >> > >> Normally sk_lock is used. Not sure if spin_lock_bh is correct or not. > > > > ./net/ipv6/addrconf.c-2915- lock_sock(sk); > > ./net/ipv6/addrconf.c-2916- if (join) > > ./net/ipv6/addrconf.c:2917: ret = ipv6_sock_mc_join(sk, > > ifindex, addr); > > ./net/ipv6/addrconf.c-2918- else > > ./net/ipv6/addrconf.c-2919- ret = ipv6_sock_mc_drop(sk, > > ifindex, addr); > > ./net/ipv6/addrconf.c-2920- release_sock(sk); > > > > Should be lock_sock? > > It works as well as spin_lock_bh() in preventing the RCU splat and > looks like the preferred way. I'll make this change. This is the implementation of lock_sock. lock_sock has not only spin_lock_bh, but also other protections for sk. You should use lock_sock, instead of spin_lock_bh. void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); > > Bob > > > >> > >> Please Jason or experts from netdev comment on this. > >> > >> Thanks, > >> > >> Zhu Yanjun > >> > >> > >>>> + rtnl_unlock(); > >>>> + spin_unlock_bh(&sk->sk_lock.slock); > >>>> + if (err && err != -EADDRINUSE) > >>>> + goto err_out; > >>>> ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); > >>>> + err = dev_mc_add(rxe->ndev, ll_addr); > >>>> + if (err) > >>>> + goto err_drop; > >>>> + > >>>> + return 0; > >>>> - return dev_mc_add(rxe->ndev, ll_addr); > >>>> +err_drop: > >>>> + spin_lock_bh(&sk->sk_lock.slock); > >>>> + rtnl_lock(); > >>>> + ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, addr6); > >>>> + rtnl_unlock(); > >>>> + spin_unlock_bh(&sk->sk_lock.slock); > >>>> +err_out: > >>>> + return err; > >>>> } > >>>> -/** > >>>> - * rxe_mcast_del - delete multicast address from rxe device > >>>> - * @rxe: rxe device object > >>>> - * @mgid: multicast address as a gid > >>>> - * > >>>> - * Returns 0 on success else an error > >>>> - */ > >>>> -static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) > >>>> +static int rxe_mcast_add(struct rxe_mcg *mcg) > >>>> { > >>>> + struct rxe_dev *rxe = mcg->rxe; > >>>> + union ib_gid *mgid = &mcg->mgid; > >>>> unsigned char ll_addr[ETH_ALEN]; > >>>> + struct ip_mreqn imr = {}; > >>>> + int err; > >>>> + > >>>> + if (mcg->is_ipv6) > >>>> + return rxe_mcast_add6(rxe, mgid); > >>>> + > >>>> + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); > >>>> + imr.imr_ifindex = rxe->ndev->ifindex; > >>>> + rtnl_lock(); > >>>> + err = ip_mc_join_group(recv_sockets.sk4->sk, &imr); > >>>> + rtnl_unlock(); > >>>> + if (err && err != -EADDRINUSE) > >>>> + goto err_out; > >>>> + > >>>> + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); > >>>> + err = dev_mc_add(rxe->ndev, ll_addr); > >>>> + if (err) > >>>> + goto err_leave; > >>>> + > >>>> + return 0; > >>>> + > >>>> +err_leave: > >>>> + rtnl_lock(); > >>>> + ip_mc_leave_group(recv_sockets.sk4->sk, &imr); > >>>> + rtnl_unlock(); > >>>> +err_out: > >>>> + return err; > >>>> +} > >>>> + > >>>> +static int rxe_mcast_del6(struct rxe_dev *rxe, union ib_gid *mgid) > >>>> +{ > >>>> + struct sock *sk = recv_sockets.sk6->sk; > >>>> + unsigned char ll_addr[ETH_ALEN]; > >>>> + int err, err2; > >>>> ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); > >>>> + err = dev_mc_del(rxe->ndev, ll_addr); > >>>> + > >>>> + spin_lock_bh(&sk->sk_lock.slock); > >>>> + rtnl_lock(); > >>>> + err2 = ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, > >>>> + (struct in6_addr *)mgid); > >>>> + rtnl_unlock(); > >>>> + spin_unlock_bh(&sk->sk_lock.slock); > >>>> + > >>>> + return err ?: err2; > >>>> +} > >>>> + > >>>> +static int rxe_mcast_del(struct rxe_mcg *mcg) > >>>> +{ > >>>> + struct rxe_dev *rxe = mcg->rxe; > >>>> + union ib_gid *mgid = &mcg->mgid; > >>>> + unsigned char ll_addr[ETH_ALEN]; > >>>> + struct ip_mreqn imr = {}; > >>>> + int err, err2; > >>>> + > >>>> + if (mcg->is_ipv6) > >>>> + return rxe_mcast_del6(rxe, mgid); > >>>> + > >>>> + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); > >>>> + imr.imr_ifindex = rxe->ndev->ifindex; > >>>> + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); > >>>> + err = dev_mc_del(rxe->ndev, ll_addr); > >>>> + > >>>> + rtnl_lock(); > >>>> + err2 = ip_mc_leave_group(recv_sockets.sk4->sk, &imr); > >>>> + rtnl_unlock(); > >>>> - return dev_mc_del(rxe->ndev, ll_addr); > >>>> + return err ?: err2; > >>>> } > >>>> /** > >>>> @@ -164,6 +242,7 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, > >>>> union ib_gid *mgid, > >>>> { > >>>> kref_init(&mcg->ref_cnt); > >>>> memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); > >>>> + mcg->is_ipv6 = !ipv6_addr_v4mapped((struct in6_addr *)mgid); > >>>> INIT_LIST_HEAD(&mcg->qp_list); > >>>> mcg->rxe = rxe; > >>>> @@ -225,7 +304,7 @@ static struct rxe_mcg *rxe_get_mcg(struct > >>>> rxe_dev *rxe, union ib_gid *mgid) > >>>> spin_unlock_bh(&rxe->mcg_lock); > >>>> /* add mcast address outside of lock */ > >>>> - err = rxe_mcast_add(rxe, mgid); > >>>> + err = rxe_mcast_add(mcg); > >>>> if (!err) > >>>> return mcg; > >>>> @@ -273,7 +352,7 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) > >>>> static void rxe_destroy_mcg(struct rxe_mcg *mcg) > >>>> { > >>>> /* delete mcast address outside of lock */ > >>>> - rxe_mcast_del(mcg->rxe, &mcg->mgid); > >>>> + rxe_mcast_del(mcg); > >>>> spin_lock_bh(&mcg->rxe->mcg_lock); > >>>> __rxe_destroy_mcg(mcg); > >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c > >>>> b/drivers/infiniband/sw/rxe/rxe_net.c > >>>> index 58c3f3759bf0..b481f8da2002 100644 > >>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c > >>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c > >>>> @@ -18,7 +18,7 @@ > >>>> #include "rxe_net.h" > >>>> #include "rxe_loc.h" > >>>> -static struct rxe_recv_sockets recv_sockets; > >>>> +struct rxe_recv_sockets recv_sockets; > >>>> static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, > >>>> struct net_device *ndev, > >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h > >>>> b/drivers/infiniband/sw/rxe/rxe_net.h > >>>> index 45d80d00f86b..89cee7d5340f 100644 > >>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h > >>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h > >>>> @@ -15,6 +15,7 @@ struct rxe_recv_sockets { > >>>> struct socket *sk4; > >>>> struct socket *sk6; > >>>> }; > >>>> +extern struct rxe_recv_sockets recv_sockets; > >>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); > >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h > >>>> b/drivers/infiniband/sw/rxe/rxe_verbs.h > >>>> index ccb9d19ffe8a..7be9e6232dd9 100644 > >>>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h > >>>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > >>>> @@ -352,6 +352,7 @@ struct rxe_mcg { > >>>> atomic_t qp_num; > >>>> u32 qkey; > >>>> u16 pkey; > >>>> + bool is_ipv6; > >>>> }; > >>>> struct rxe_mca { > >> >