On 2021/5/20 12:28 AM, Eugenio Pérez wrote:
Initial version of the shadow virtqueue that actually forwards buffers. The
exposed addresses are QEMU's virtual addresses, so devices with an IOMMU
that does not allow full mapping of QEMU's address space do not work
at the moment.
Also, for simplicity, it only supports modern devices, which expect the
vring in little endian, with a split ring and no event idx or indirect
descriptors.
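For readers, that restriction boils down to a feature mask along these
lines (illustrative only, not code from the patch):

    /* Illustrative: a split-ring, modern-only shadow vring can only offer
     * VIRTIO_F_VERSION_1, which implies little-endian vrings. Event idx,
     * indirect descriptors and packed rings are deliberately absent. */
    static const uint64_t svq_features = 1ULL << VIRTIO_F_VERSION_1;
    /* i.e. no VIRTIO_RING_F_EVENT_IDX, no VIRTIO_RING_F_INDIRECT_DESC,
     * no VIRTIO_F_RING_PACKED */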
It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped-down functionality
and optimizations, so it's easier to review.
Later commits will solve some of these concerns.
It would be easier to review if you squashed those
"enhancements" into this patch.
The code also needs to map the used ring (the device part) as RW in, and
only in, vhost-net. Mapping it (or calling vhost_device_iotlb_miss)
unconditionally would print an error for vhost devices with their own
mapping (vdpa).
I think we should not depend on the IOTLB miss. Instead, we should
program the device IOTLB before starting the svq. Or is there anything
that prevents you from doing that?
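Something along these lines, for instance (just a sketch:
vhost_svq_map_rings is a hypothetical helper, and it assumes the 1:1
vaddr == iova mapping used here plus the contiguous desc/avail layout
that vring_init() produces):

    /* Hypothetical: push the shadow vring mappings into the device IOTLB
     * before the SVQ starts, instead of waiting for an IOTLB miss. */
    static int vhost_svq_map_rings(struct vhost_dev *dev,
                                   const struct vring *vring)
    {
        size_t desc_size = sizeof(vring_desc_t) * vring->num;
        size_t avail_size = sizeof(vring_avail_t) +
                            sizeof(uint16_t) * vring->num;
        size_t used_size = sizeof(vring_used_t) +
                           sizeof(vring_used_elem_t) * vring->num;
        uint64_t driver_addr = (uintptr_t)vring->desc; /* vaddr == iova */
        uint64_t device_addr = (uintptr_t)vring->used;
        int r;

        /* Driver area (desc + avail): the device only reads it */
        r = vhost_backend_update_device_iotlb(dev, driver_addr, driver_addr,
                                              desc_size + avail_size,
                                              IOMMU_RO);
        if (r < 0) {
            return r;
        }

        /* Device area (used ring): the device writes it */
        return vhost_backend_update_device_iotlb(dev, device_addr,
                                                 device_addr, used_size,
                                                 IOMMU_RW);
    }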
To know whether this call is needed, vhost_sw_live_migration_start_vq and
vhost_sw_live_migration_stop copy the test performed in
vhost_dev_start. Testing for the actual backend type could be cleaner,
as could checking for a non-NULL vhost_force_iommu, enable_custom_iommu,
or another vhostOp. We could also extract this test into its own function
so that its name gives a better hint. For now it just copies the
vhost_dev_start check.
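As a sketch of that extraction (the helper name is made up, and treating
a non-NULL vhost_force_iommu as "the backend does its own mapping" is the
suggestion above, not established code):

    /* Hypothetical helper: does this backend rely on QEMU-driven IOTLB
     * updates (vhost-kernel), as opposed to doing its own mapping (vdpa)? */
    static bool vhost_dev_needs_iotlb_miss(struct vhost_dev *dev)
    {
        return vhost_dev_has_iommu(dev) &&
               !dev->vhost_ops->vhost_force_iommu;
    }

Both vhost_sw_live_migration_start_vq and vhost_sw_live_migration_stop
could then call it, so a later policy change only touches one place.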
Signed-off-by: Eugenio Pérez <eperezma@xxxxxxxxxx>
---
hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++--
hw/virtio/vhost.c | 134 ++++++++++++++++++-
2 files changed, 325 insertions(+), 14 deletions(-)
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index ff50f12410..6d767fe248 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -9,6 +9,7 @@
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
#include "standard-headers/linux/vhost_types.h"
@@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue {
/* Virtio device */
VirtIODevice *vdev;
+
+ /* Map for returning guest's descriptors */
+ VirtQueueElement **ring_id_maps;
+
+ /* Next head to expose to device */
+ uint16_t avail_idx_shadow;
+
+ /* Next free descriptor */
+ uint16_t free_head;
+
+ /* Last seen used idx */
+ uint16_t shadow_used_idx;
+
+ /* Next head to consume from device */
+ uint16_t used_idx;
} VhostShadowVirtqueue;
-/* Forward guest notifications */
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+ const struct iovec *iovec,
+ size_t num, bool more_descs, bool write)
+{
+ uint16_t i = svq->free_head, last = svq->free_head;
+ unsigned n;
+ uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
+ vring_desc_t *descs = svq->vring.desc;
+
+ if (num == 0) {
+ return;
+ }
+
+ for (n = 0; n < num; n++) {
+ if (more_descs || (n + 1 < num)) {
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+ } else {
+ descs[i].flags = flags;
+ }
+ descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
+
+ last = i;
+ i = cpu_to_le16(descs[i].next);
+ }
+
+ svq->free_head = le16_to_cpu(descs[last].next);
+}
+
+static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq,
+ VirtQueueElement *elem)
+{
+ int head;
+ unsigned avail_idx;
+ vring_avail_t *avail = svq->vring.avail;
+
+ head = svq->free_head;
+
+ /* We need some descriptors here */
+ assert(elem->out_num || elem->in_num);
+
+ vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
+ elem->in_num > 0, false);
+ vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+
+ /*
+ * Put entry in available array (but don't update avail->idx until they
+ * do sync).
+ */
+ avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
+ avail->ring[avail_idx] = cpu_to_le16(head);
+ svq->avail_idx_shadow++;
+
+ /* Expose descriptors to device */
It's better to describe the ordering in detail, e.g. "Update the avail
index only after the descriptors have been written".
+ smp_wmb();
+ avail->idx = cpu_to_le16(svq->avail_idx_shadow);
+
+ return head;
+
+}
+
+static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq,
+ VirtQueueElement *elem)
+{
+ unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem);
+
+ svq->ring_id_maps[qemu_head] = elem;
+}
+
+/* Handle guest->device notifications */
static void vhost_handle_guest_kick(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
@@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n)
return;
}
- event_notifier_set(&svq->kick_notifier);
+ /* Make available as many buffers as possible */
+ do {
+ if (virtio_queue_get_notification(svq->vq)) {
+ /* No more notifications until process all available */
+ virtio_queue_set_notification(svq->vq, false);
+ }
+
+ while (true) {
+ VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
+ if (!elem) {
+ break;
+ }
+
+ vhost_shadow_vq_add(svq, elem);
+ event_notifier_set(&svq->kick_notifier);
+ }
+
+ virtio_queue_set_notification(svq->vq, true);
+ } while (!virtio_queue_empty(svq->vq));
+}
+
+static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq)
+{
+ if (svq->used_idx != svq->shadow_used_idx) {
+ return true;
+ }
+
+ /* Get used idx must not be reordered */
+ smp_rmb();
+ svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
+
+ return svq->used_idx != svq->shadow_used_idx;
+}
+
+static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq)
+{
+ vring_desc_t *descs = svq->vring.desc;
+ const vring_used_t *used = svq->vring.used;
+ vring_used_elem_t used_elem;
+ uint16_t last_used;
+
+ if (!vhost_shadow_vq_more_used(svq)) {
+ return NULL;
+ }
+
+ last_used = svq->used_idx & (svq->vring.num - 1);
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
+
+ if (unlikely(used_elem.id >= svq->vring.num)) {
+ error_report("Device %s says index %u is available", svq->vdev->name,
+ used_elem.id);
+ return NULL;
+ }
+
+ descs[used_elem.id].next = svq->free_head;
+ svq->free_head = used_elem.id;
+
+ svq->used_idx++;
+ svq->ring_id_maps[used_elem.id]->len = used_elem.len;
+ return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}
/* Forward vhost notifications */
@@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
call_notifier);
EventNotifier *masked_notifier;
+ VirtQueue *vq = svq->vq;
masked_notifier = svq->masked_notifier.n;
- if (!masked_notifier) {
- unsigned n = virtio_get_queue_index(svq->vq);
- virtio_queue_invalidate_signalled_used(svq->vdev, n);
- virtio_notify_irqfd(svq->vdev, svq->vq);
- } else if (!svq->masked_notifier.signaled) {
- svq->masked_notifier.signaled = true;
- event_notifier_set(svq->masked_notifier.n);
- }
+ /* Make as many buffers as possible used. */
+ do {
+ unsigned i = 0;
+
+ /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */
+ while (true) {
+ g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq);
+ if (!elem) {
+ break;
+ }
+
+ assert(i < svq->vring.num);
+ virtqueue_fill(vq, elem, elem->len, i++);
+ }
+
+ virtqueue_flush(vq, i);
+ if (!masked_notifier) {
+ virtio_notify_irqfd(svq->vdev, svq->vq);
Any reason you don't use virtio_notify() here?
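i.e. something like this (a sketch of the alternative being asked about;
virtio_notify() checks virtio_should_notify() internally before raising
the interrupt):

    virtio_notify(svq->vdev, svq->vq);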
+ } else if (!svq->masked_notifier.signaled) {
+ svq->masked_notifier.signaled = true;
+ event_notifier_set(svq->masked_notifier.n);
+ }
This is an example of the extra complexity you take on by doing the
shadow virtqueue at the virtio level.
If you do everything at, e.g., the vhost-vdpa level, you don't need to
care about the masked_notifier at all.
Thanks