Hi,

Did you consider the option of holding netif_tx_lock_xxx() in the
ipoib_cm_rep_handler function (around the set_bit(IPOIB_FLAG_OPER_UP)
line) instead of taking the extra lock in the send data path? A rough
sketch of what I mean appears after the quoted patch below.

Thanks,

On Sat, Aug 18, 2018 at 12:31 AM, Aaron Knister <aaron.s.knister@xxxxxxxx> wrote:
> Inside of start_xmit(), the check that the connection is up and the
> queueing of packets for later transmission are not atomic, which
> leaves a window where cm_rep_handler can run, set the connection up,
> dequeue pending packets, and leave the packets subsequently queued by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example, and cause
> multi-minute delays in transmission for open connections.
>
> Here's the code in start_xmit where we check to see if the connection
> is up:
>
>         if (ipoib_cm_get(neigh)) {
>                 if (ipoib_cm_up(neigh)) {
>                         ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                         goto unref;
>                 }
>         }
>
> The race occurs if cm_rep_handler runs after the above connection
> check (specifically, if it gets to the point where it acquires
> priv->lock to dequeue pending skbs) but before the code snippet below
> in start_xmit where packets are queued.
>
>         if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                 push_pseudo_header(skb, phdr->hwaddr);
>                 spin_lock_irqsave(&priv->lock, flags);
>                 __skb_queue_tail(&neigh->queue, skb);
>                 spin_unlock_irqrestore(&priv->lock, flags);
>         } else {
>                 ++dev->stats.tx_dropped;
>                 dev_kfree_skb_any(skb);
>         }
>
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the connection will be up most of the
> time, we don't hold the lock for the first check, which avoids a
> slowdown from unnecessary locking for the majority of the packets
> transmitted during the connection's life.
>
> Signed-off-by: Aaron Knister <aaron.s.knister@xxxxxxxx>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
>  1 file changed, 39 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..a950c916 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
>          spin_unlock_irqrestore(&priv->lock, flags);
>  }
>
> +static bool defer_neigh_skb(struct sk_buff *skb,
> +                            struct net_device *dev,
> +                            struct ipoib_neigh *neigh,
> +                            struct ipoib_pseudo_header *phdr)
> +{
> +        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> +                push_pseudo_header(skb, phdr->hwaddr);
> +                __skb_queue_tail(&neigh->queue, skb);
> +                return true;
> +        }
> +
> +        return false;
> +}
> +
> +
>  static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>          struct ipoib_dev_priv *priv = ipoib_priv(dev);
> @@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>          struct ipoib_pseudo_header *phdr;
>          struct ipoib_header *header;
>          unsigned long flags;
> +        bool deferred_pkt = true;
>
>          phdr = (struct ipoib_pseudo_header *) skb->data;
>          skb_pull(skb, sizeof(*phdr));
> @@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>                          ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                          goto unref;
>                  }
> +                /*
> +                 * Re-check ipoib_cm_up with priv->lock held to avoid
> +                 * race condition between start_xmit and skb_dequeue in
> +                 * cm_rep_handler. Since odds are the conn should be up
> +                 * most of the time, we don't hold the lock for the
> +                 * first check above
> +                 */
> +                spin_lock_irqsave(&priv->lock, flags);
> +                if (ipoib_cm_up(neigh)) {
> +                        spin_unlock_irqrestore(&priv->lock, flags);
> +                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +                } else {
> +                        deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +                        spin_unlock_irqrestore(&priv->lock, flags);
> +                }
> +
> +                goto unref;
>          } else if (neigh->ah && neigh->ah->valid) {
>                  neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>                                                  IPOIB_QPN(phdr->hwaddr));
> @@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>                  neigh_refresh_path(neigh, phdr->hwaddr, dev);
>          }
>
> -        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> -                push_pseudo_header(skb, phdr->hwaddr);
> -                spin_lock_irqsave(&priv->lock, flags);
> -                __skb_queue_tail(&neigh->queue, skb);
> -                spin_unlock_irqrestore(&priv->lock, flags);
> -        } else {
> +        spin_lock_irqsave(&priv->lock, flags);
> +        deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +        spin_unlock_irqrestore(&priv->lock, flags);
> +
> +unref:
> +        if (!deferred_pkt) {
>                  ++dev->stats.tx_dropped;
>                  dev_kfree_skb_any(skb);
>          }
>
> -unref:
>          ipoib_neigh_put(neigh);
>
>          return NETDEV_TX_OK;
> --
> 2.12.3
>
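
To make my question above concrete, here is a rough, untested sketch of
the alternative: take the device tx lock in ipoib_cm_rep_handler around
the point where the connection is marked up and neigh->queue is drained,
so that it serializes with ipoib_start_xmit (which runs under the tx
lock) instead of adding a second ipoib_cm_up check to the data path.
This is only a sketch; the local names (dev, priv, p, skb, skqueue) are
assumed to match the existing locals in ipoib_cm_rep_handler and should
be checked against the actual code, as should whether taking the tx lock
is allowed in the CM event handler context:

        /*
         * Sketch only: serialize the "connection up" transition with the
         * xmit path.  Any start_xmit already past its unlocked
         * ipoib_cm_up() check finishes queueing its skb before we drain
         * neigh->queue here; any later start_xmit sees IPOIB_FLAG_OPER_UP
         * and sends directly, so no skb is left stranded on the queue.
         */
        netif_tx_lock_bh(dev);
        spin_lock_irq(&priv->lock);

        set_bit(IPOIB_FLAG_OPER_UP, &p->flags);

        if (p->neigh)
                while ((skb = __skb_dequeue(&p->neigh->queue)))
                        __skb_queue_tail(&skqueue, skb);

        spin_unlock_irq(&priv->lock);
        netif_tx_unlock_bh(dev);

The point is the lock ordering (tx lock taken before priv->lock, which
matches start_xmit), so the extra cost lands on the one-time REP handling
rather than on every transmitted packet.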