Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ

Jason Wang <jasowang@xxxxxxxxxx> · Thu, 3 Mar 2022 15:33:09 +0800

在 2022/3/1 下午4:50, Eugenio Perez Martin 写道:
On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang@xxxxxxxxxx> wrote:

在 2022/2/27 下午9:41, Eugenio Pérez 写道:
Use translations added in VhostIOVATree in SVQ.

Only introduce usage here, not allocation and deallocation. As with
previous patches, we use the dead code paths of shadow_vqs_enabled to
avoid commiting too many changes at once. These are impossible to take
at the moment.

Signed-off-by: Eugenio Pérez <eperezma@xxxxxxxxxx>
---
   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
   include/hw/virtio/vhost-vdpa.h     |   3 +
   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
   4 files changed, 187 insertions(+), 26 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index 04c67685fd..b2f722d101 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -13,6 +13,7 @@
   #include "qemu/event_notifier.h"
   #include "hw/virtio/virtio.h"
   #include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-iova-tree.h"

   /* Shadow virtqueue to relay notifications */
   typedef struct VhostShadowVirtqueue {
@@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
       /* Virtio device */
       VirtIODevice *vdev;

+    /* IOVA mapping */
+    VhostIOVATree *iova_tree;
+
       /* Map for use the guest's descriptors */
       VirtQueueElement **ring_id_maps;

@@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                        VirtQueue *vq);
   void vhost_svq_stop(VhostShadowVirtqueue *svq);

-VhostShadowVirtqueue *vhost_svq_new(void);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);

   void vhost_svq_free(gpointer vq);
   G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index 009a9f3b6b..ee8e939ad0 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -14,6 +14,7 @@

   #include <gmodule.h>

+#include "hw/virtio/vhost-iova-tree.h"
   #include "hw/virtio/virtio.h"
   #include "standard-headers/linux/vhost_types.h"

@@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
       MemoryListener listener;
       struct vhost_vdpa_iova_range iova_range;
       bool shadow_vqs_enabled;
+    /* IOVA mapping used by the Shadow Virtqueue */
+    VhostIOVATree *iova_tree;
       GPtrArray *shadow_vqs;
       struct vhost_dev *dev;
       VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index a38d313755..7e073773d1 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -11,6 +11,7 @@
   #include "hw/virtio/vhost-shadow-virtqueue.h"

   #include "qemu/error-report.h"
+#include "qemu/log.h"
   #include "qemu/main-loop.h"
   #include "qemu/log.h"
   #include "linux-headers/linux/vhost.h"
@@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
       }
   }

+/**
+ * Translate addresses between the qemu's virtual address and the SVQ IOVA
+ *
+ * @svq    Shadow VirtQueue
+ * @vaddr  Translated IOVA addresses
+ * @iovec  Source qemu's VA addresses
+ * @num    Length of iovec and minimum length of vaddr
+ */
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+                                     void **addrs, const struct iovec *iovec,
+                                     size_t num)
+{
+    if (num == 0) {
+        return true;
+    }
+
+    for (size_t i = 0; i < num; ++i) {
+        DMAMap needle = {
+            .translated_addr = (hwaddr)iovec[i].iov_base,
+            .size = iovec[i].iov_len,
+        };
+        size_t off;
+
+        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
+        /*
+         * Map cannot be NULL since iova map contains all guest space and
+         * qemu already has a physical address mapped
+         */
+        if (unlikely(!map)) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
+                          needle.translated_addr);
+            return false;
+        }
+
+        off = needle.translated_addr - map->translated_addr;
+        addrs[i] = (void *)(map->iova + off);
+
+        if (unlikely(int128_gt(int128_add(needle.translated_addr,
+                                          iovec[i].iov_len),
+                               map->translated_addr + map->size))) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Guest buffer expands over iova range");
+            return false;
+        }
+    }
+
+    return true;
+}
+
   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+                                    void * const *vaddr_sg,

Nit: it looks to me we are not passing vaddr but iova here, so it might
be better to use "sg"?

Sure, this is a leftover of pre-IOVA translations. I see better to
write as you say.

                                       const struct iovec *iovec,
                                       size_t num, bool more_descs, bool write)
   {
@@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
           } else {
               descs[i].flags = flags;
           }
-        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
+        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
           descs[i].len = cpu_to_le32(iovec[n].iov_len);

           last = i;
@@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
   {
       unsigned avail_idx;
       vring_avail_t *avail = svq->vring.avail;
+    bool ok;
+    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));

       *head = svq->free_head;

@@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
           return false;
       }

-    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
+    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
                               elem->in_num > 0, false);
-    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+
+
+    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);

       /*
        * Put the entry in the available array (but don't update avail->idx until
@@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
    * Creates vhost shadow virtqueue, and instructs the vhost device to use the
    * shadow methods and file descriptors.
    *
+ * @iova_tree Tree to perform descriptors translations
+ *
    * Returns the new virtqueue or NULL.
    *
    * In case of error, reason is reported through error_report.
    */
-VhostShadowVirtqueue *vhost_svq_new(void)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
   {
       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
       int r;
@@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)

       event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
+    svq->iova_tree = iova_tree;
       return g_steal_pointer(&svq);

   err_init_hdev_call:
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 435b9c2e9e..56f9f125cd 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                            vaddr, section->readonly);

       llsize = int128_sub(llend, int128_make64(iova));
+    if (v->shadow_vqs_enabled) {
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)vaddr,
+            .size = int128_get64(llsize) - 1,
+            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
+        };
+
+        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
+        if (unlikely(r != IOVA_OK)) {
+            error_report("Can't allocate a mapping (%d)", r);
+            goto fail;
+        }
+
+        iova = mem_region.iova;
+    }

       vhost_vdpa_iotlb_batch_begin_once(v);
       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
@@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,

       llsize = int128_sub(llend, int128_make64(iova));

+    if (v->shadow_vqs_enabled) {
+        const DMAMap *result;
+        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)vaddr,
+            .size = int128_get64(llsize) - 1,
+        };
+
+        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
+        iova = result->iova;
+        vhost_iova_tree_remove(v->iova_tree, &mem_region);
+    }
       vhost_vdpa_iotlb_batch_begin_once(v);
       ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
       if (ret) {
@@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,

       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
       for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

           if (unlikely(!svq)) {
               error_setg(errp, "Cannot create svq %u", n);
@@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
   /**
    * Unmap a SVQ area in the device
    */
-static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
-                                      hwaddr size)
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
+                                      const DMAMap *needle)
   {
+    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
+    hwaddr size;
       int r;

-    size = ROUND_UP(size, qemu_real_host_page_size);
-    r = vhost_vdpa_dma_unmap(v, iova, size);
+    if (unlikely(!result)) {
+        error_report("Unable to find SVQ address to unmap");
+        return false;
+    }
+
+    size = ROUND_UP(result->size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, result->iova, size);
       return r == 0;
   }

   static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                          const VhostShadowVirtqueue *svq)
   {
+    DMAMap needle;
       struct vhost_vdpa *v = dev->opaque;
       struct vhost_vring_addr svq_addr;
-    size_t device_size = vhost_svq_device_area_size(svq);
-    size_t driver_size = vhost_svq_driver_area_size(svq);
       bool ok;

       vhost_svq_get_vring_addr(svq, &svq_addr);

-    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    needle = (DMAMap) {
+        .translated_addr = svq_addr.desc_user_addr,
+    };

Let's simply initialize the member to zero during start of this function
then we can use needle->transalted_addr = XXX here.

Sure

+    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
       if (unlikely(!ok)) {
           return false;
       }

-    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+    needle = (DMAMap) {
+        .translated_addr = svq_addr.used_user_addr,
+    };
+    return vhost_vdpa_svq_unmap_ring(v, &needle);
+}
+
+/**
+ * Map the SVQ area in the device
+ *
+ * @v          Vhost-vdpa device
+ * @needle     The area to search iova
+ * @errorp     Error pointer
+ */
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
+                                    Error **errp)
+{
+    int r;
+
+    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
+    if (unlikely(r != IOVA_OK)) {
+        error_setg(errp, "Cannot allocate iova (%d)", r);
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
+                           (void *)needle->translated_addr,
+                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));

Let's simply use needle->perm == IOMMU_RO here?

The motivation to use this way is to be more resilient to the future.
For example, if a new flag is added.

But I'm totally ok with comparing with IOMMU_RO, I see that scenario
unlikely at the moment.

+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot map region to device");
+        vhost_iova_tree_remove(v->iova_tree, needle);
+    }
+
+    return r == 0;
   }

   /**
- * Map shadow virtqueue rings in device
+ * Map the shadow virtqueue rings in the device
    *
    * @dev   The vhost device
    * @svq   The shadow virtqueue
@@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                        struct vhost_vring_addr *addr,
                                        Error **errp)
   {
+    DMAMap device_region, driver_region;
+    struct vhost_vring_addr svq_addr;
       struct vhost_vdpa *v = dev->opaque;
       size_t device_size = vhost_svq_device_area_size(svq);
       size_t driver_size = vhost_svq_driver_area_size(svq);
-    int r;
+    size_t avail_offset;
+    bool ok;

       ERRP_GUARD();
-    vhost_svq_get_vring_addr(svq, addr);
+    vhost_svq_get_vring_addr(svq, &svq_addr);

-    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
-                           (void *)addr->desc_user_addr, true);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+    driver_region = (DMAMap) {
+        .translated_addr = svq_addr.desc_user_addr,
+        .size = driver_size - 1,

Any reason for the "-1" here? I see several places do things like that,
it's probably hint of wrong API somehwere.

The "problem" is the api mismatch between _end and _last, to include
the last member in the size or not.

IOVA tree needs to use _end so we can allocate the last page in case
of available range ending in (uint64_t)-1 [1]. But If we change
vhost_svq_{device,driver}_area_size to make it inclusive,


These functions looks sane since it doesn't return a range. It's up to 
the caller to decide how to use the size.


  we need to
use "+1" in calls like qemu_memalign and memset at vhost_svq_start.
Probably in more places too


I'm not sure I get here. Maybe you can show which code may suffers if we 
don't decrease it by one here.

But current code may endup to passing qemu_real_host_page_size - 1 to 
vhost-VDPA which seems wrong?

E.g vhost_svq_device_area_size() return qemu_real_host_page_size, but it 
was decreased by 1 here for size, then we pass size to vhost_vdpa_dma_map().

Thanks



QEMU's emulated Intel iommu code solves it using the address mask as
the size, something that does not fit 100% with vhost devices since
they can allocate an arbitrary address of arbitrary size when using
vIOMMU. It's not a problem for vhost-vdpa at this moment since we make
sure we expose unaligned and whole pages with vrings, but I feel it
would only be to move the problem somewhere else.

Thanks!

[1] There are alternatives: to use Int128_t, etc. But I think it's
better not to change that in this patch series.

Thanks


+        .perm = IOMMU_RO,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq driver region: ");
           return false;
       }
+    addr->desc_user_addr = driver_region.iova;
+    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
+    addr->avail_user_addr = driver_region.iova + avail_offset;

-    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
-                           (void *)addr->used_user_addr, false);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq device region: ");
+    device_region = (DMAMap) {
+        .translated_addr = svq_addr.used_user_addr,
+        .size = device_size - 1,
+        .perm = IOMMU_RW,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq device region: ");
+        vhost_vdpa_svq_unmap_ring(v, &driver_region);
       }
+    addr->used_user_addr = device_region.iova;

-    return r == 0;
+    return ok;
   }

   static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization