Re: [PATCH for-next 4/9] RDMA/rxe: Fix delayed send packet handling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 7/24/23 12:59, Leon Romanovsky wrote:
> On Sun, Jul 23, 2023 at 09:03:34PM +0800, Zhu Yanjun wrote:
>> On Sat, Jul 22, 2023 at 4:51 AM Bob Pearson <rpearsonhpe@xxxxxxxxx> wrote:
>>>
>>> In cable pull testing some NICs can hold a send packet long enough
>>> to allow ulp protocol stacks to destroy the qp and the cleanup
>>> routines to timeout waiting for all qp references to be released.
>>> When the NIC driver finally frees the SKB the qp pointer is no longer
>>> valid and causes a seg fault in rxe_skb_tx_dtor().
>>
>> If a packet is held in some NICs, the later packets of this packet
>> will also be held by this NIC. So this will cause QP not to work well.
>> This is a serious problem. And the problem should be fixed in this
>> kind of NICs. We should not fix it in RXE.
>>
>> And a QP exists for at least several seconds. A packet can be held in
>> NIC for such a long time? This problem does exist in the real world,
>> or you imagine this scenario?
>>
>> I can not imagine this kind of scenario. Please Jason or Leon also check this.
> 
> I stopped to follow RXE changes for a long time. They don't make any sense to me.
> Even this patch, which moves existing IB reference counter from one place to another
> and does it for every SKB is beyond my understanding.

At the end of the day, we used to take a reference on the QP for each send packet and
drop it in the destructor function. (why? we are keeping a count of pending send packets
to keep from overloading the send queue.) But with link failures which can take many seconds
to recover this gets in the way of lustre cleaning up a bad connection when it times out
first. So instead we take a reference on the sock struct since if you destroy the QP you
also destroy the QP socket struct which drops its reference on the sock struct which gets it
freed before the packet is finally sent and the destructor gets called. That reference is dropped in
the destructor. We still need to get to the qp pointer in the destructor so we pass the
qp# in the sock->user_data instead of the qp pointer. Now if the qp has been destroyed the
qp lookup from the qp# fails and you just don't touch the counter. It really all just does work fine.

Bob
> 
> Thanks
> 
>>
>> Thanks.
>> Zhu Yanjun
>>>
>>> This patch passes the qp index instead of the qp to the skb destructor
>>> callback function. The call back is required to lookup the qp from the
>>> index and if it has been destroyed the lookup will return NULL and the
>>> qp will not be referenced avoiding the seg fault.
>>>
>>> Fixes: 8700e3e7c485 ("Soft RoCE driver")
>>> Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
>>> ---
>>>  drivers/infiniband/sw/rxe/rxe_net.c | 87 ++++++++++++++++++++++-------
>>>  drivers/infiniband/sw/rxe/rxe_qp.c  |  1 -
>>>  2 files changed, 67 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>> index cd59666158b1..10e4a752ff7c 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>> @@ -131,19 +131,28 @@ static struct dst_entry *rxe_find_route(struct net_device *ndev,
>>>         return dst;
>>>  }
>>>
>>> +static struct rxe_dev *get_rxe_from_skb(struct sk_buff *skb)
>>> +{
>>> +       struct rxe_dev *rxe;
>>> +       struct net_device *ndev = skb->dev;
>>> +
>>> +       rxe = rxe_get_dev_from_net(ndev);
>>> +       if (!rxe && is_vlan_dev(ndev))
>>> +               rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
>>> +
>>> +       return rxe;
>>> +}
>>> +
>>>  static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>  {
>>>         struct udphdr *udph;
>>>         struct rxe_dev *rxe;
>>> -       struct net_device *ndev = skb->dev;
>>>         struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
>>>
>>>         /* takes a reference on rxe->ib_dev
>>>          * drop when skb is freed
>>>          */
>>> -       rxe = rxe_get_dev_from_net(ndev);
>>> -       if (!rxe && is_vlan_dev(ndev))
>>> -               rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
>>> +       rxe = get_rxe_from_skb(skb);
>>>         if (!rxe)
>>>                 goto drop;
>>>
>>> @@ -345,46 +354,84 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
>>>
>>>  static void rxe_skb_tx_dtor(struct sk_buff *skb)
>>>  {
>>> -       struct sock *sk = skb->sk;
>>> -       struct rxe_qp *qp = sk->sk_user_data;
>>> -       int skb_out = atomic_dec_return(&qp->skb_out);
>>> +       struct rxe_dev *rxe;
>>> +       unsigned int index;
>>> +       struct rxe_qp *qp;
>>> +       int skb_out;
>>> +
>>> +       /* takes a ref on ib device if success */
>>> +       rxe = get_rxe_from_skb(skb);
>>> +       if (!rxe)
>>> +               goto out;
>>> +
>>> +       /* recover source qp index from sk->sk_user_data
>>> +        * free the reference taken in rxe_send
>>> +        */
>>> +       index = (int)(uintptr_t)skb->sk->sk_user_data;
>>> +       sock_put(skb->sk);
>>>
>>> +       /* lookup qp from index, takes a ref on success */
>>> +       qp = rxe_pool_get_index(&rxe->qp_pool, index);
>>> +       if (!qp)
>>> +               goto out_put_ibdev;
>>> +
>>> +       skb_out = atomic_dec_return(&qp->skb_out);
>>>         if (unlikely(qp->need_req_skb &&
>>>                      skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
>>>                 rxe_sched_task(&qp->req.task);
>>>
>>>         rxe_put(qp);
>>> +out_put_ibdev:
>>> +       ib_device_put(&rxe->ib_dev);
>>> +out:
>>> +       return;
>>>  }
>>>
>>>  static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
>>>  {
>>> +       struct rxe_qp *qp = pkt->qp;
>>> +       struct sock *sk;
>>>         int err;
>>>
>>> -       skb->destructor = rxe_skb_tx_dtor;
>>> -       skb->sk = pkt->qp->sk->sk;
>>> +       /* qp can be destroyed while this packet is waiting on
>>> +        * the tx queue. So need to protect sk.
>>> +        */
>>> +       sk = qp->sk->sk;
>>> +       sock_hold(sk);
>>> +       skb->sk = sk;
>>>
>>> -       rxe_get(pkt->qp);
>>>         atomic_inc(&pkt->qp->skb_out);
>>>
>>> +       sk->sk_user_data = (void *)(long)qp->elem.index;
>>> +       skb->destructor = rxe_skb_tx_dtor;
>>> +
>>>         if (skb->protocol == htons(ETH_P_IP)) {
>>> -               err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
>>> +               err = ip_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
>>>         } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>> -               err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
>>> +               err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
>>>         } else {
>>> -               rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n",
>>> -                               skb->protocol);
>>> -               atomic_dec(&pkt->qp->skb_out);
>>> -               rxe_put(pkt->qp);
>>> -               kfree_skb(skb);
>>> -               return -EINVAL;
>>> +               rxe_dbg_qp(qp, "Unknown layer 3 protocol: %d",
>>> +                          skb->protocol);
>>> +               err = -EINVAL;
>>> +               goto err_not_sent;
>>>         }
>>>
>>> +       /* IP consumed the packet, the destructor will handle cleanup */
>>>         if (unlikely(net_xmit_eval(err))) {
>>> -               rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err);
>>> -               return -EAGAIN;
>>> +               rxe_dbg_qp(qp, "Error sending packet: %d", err);
>>> +               err = -EAGAIN;
>>> +               goto err_out;
>>>         }
>>>
>>>         return 0;
>>> +
>>> +err_not_sent:
>>> +       skb->destructor = NULL;
>>> +       atomic_dec(&pkt->qp->skb_out);
>>> +       kfree_skb(skb);
>>> +       sock_put(sk);
>>> +err_out:
>>> +       return err;
>>>  }
>>>
>>>  /* fix up a send packet to match the packets
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
>>> index a569b111a9d2..dcbf71031453 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_qp.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_qp.c
>>> @@ -194,7 +194,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
>>>         err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
>>>         if (err < 0)
>>>                 return err;
>>> -       qp->sk->sk->sk_user_data = qp;
>>>
>>>         /* pick a source UDP port number for this QP based on
>>>          * the source QPN. this spreads traffic for different QPs
>>> --
>>> 2.39.2
>>>




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux