On Tue, Apr 13, 2021 at 10:29:03AM +0800, Jason Wang wrote: > > 在 2021/4/13 上午6:31, Michael S. Tsirkin 写道: > > On Mon, Apr 12, 2021 at 06:03:50PM -0400, Michael S. Tsirkin wrote: > > > I was working on the spurios interrupt problem and > > > I noticed something weird. > > > > > > static int virtnet_poll_tx(struct napi_struct *napi, int budget) > > > { > > > struct send_queue *sq = container_of(napi, struct send_queue, napi); > > > struct virtnet_info *vi = sq->vq->vdev->priv; > > > unsigned int index = vq2txq(sq->vq); > > > struct netdev_queue *txq; > > > > > > if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { > > > /* We don't need to enable cb for XDP */ > > > napi_complete_done(napi, 0); > > > return 0; > > > } > > > > > > txq = netdev_get_tx_queue(vi->dev, index); > > > __netif_tx_lock(txq, raw_smp_processor_id()); > > > free_old_xmit_skbs(sq, true); > > > __netif_tx_unlock(txq); > > > virtqueue_napi_complete(napi, sq->vq, 0); > > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > > > netif_tx_wake_queue(txq); > > > return 0; > > > } > > > > > > So virtqueue_napi_complete is called when txq is not locked, > > > thinkably start_xmit can happen right? > > > Yes, I think so. > > > > > > > > Now virtqueue_napi_complete > > > > > > static void virtqueue_napi_complete(struct napi_struct *napi, > > > struct virtqueue *vq, int processed) > > > { > > > int opaque; > > > > > > opaque = virtqueue_enable_cb_prepare(vq); > > > if (napi_complete_done(napi, processed)) { > > > if (unlikely(virtqueue_poll(vq, opaque))) > > > virtqueue_napi_schedule(napi, vq); > > > } else { > > > virtqueue_disable_cb(vq); > > > } > > > } > > > > > > > > > So it is calling virtqueue_enable_cb_prepare but tx queue > > > could be running and can process things in parallel ... > > > What gives? I suspect this corrupts the ring, and explains > > > why we are seeing weird hangs with vhost packed ring ... > > > > > > Jason? > > > It might cause the interrupt to be disabled unexpectedly I think. > > > > > > > > > > > -- > > > MST > > and wouldn't the following untested patch make sense then? > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 82e520d2cb12..c23341b18eb5 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -1504,6 +1505,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > > struct virtnet_info *vi = sq->vq->vdev->priv; > > unsigned int index = vq2txq(sq->vq); > > struct netdev_queue *txq; > > + int opaque; > > + bool done; > > if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { > > /* We don't need to enable cb for XDP */ > > @@ -1514,9 +1517,26 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > > txq = netdev_get_tx_queue(vi->dev, index); > > __netif_tx_lock(txq, raw_smp_processor_id()); > > free_old_xmit_skbs(sq, true); > > + > > + opaque = virtqueue_enable_cb_prepare(sq->vq); > > + > > + done = napi_complete_done(napi, 0); > > + > > + if (!done) > > + virtqueue_disable_cb(sq->vq); > > + > > __netif_tx_unlock(txq); > > - virtqueue_napi_complete(napi, sq->vq, 0); > > + if (done) { > > + if (unlikely(virtqueue_poll(vq, opaque))) { > > + if (napi_schedule_prep(napi)) { > > + __netif_tx_lock(txq, raw_smp_processor_id()); > > + virtqueue_disable_cb(sq->vq); > > + __netif_tx_unlock(txq); > > + __napi_schedule(napi); > > + } > > + } > > + } > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > > netif_tx_wake_queue(txq); > > > I wonder why not simply protect the whole poll_tx with tx lock in this case? > > Thanks > Well it takes __netif_tx_lock internally does it not? Sounds like a deadlock to me .., -- MST _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization