On 12/31/2015 07:17 PM, Michael S. Tsirkin wrote: > On Thu, Dec 31, 2015 at 03:13:45PM +0800, Jason Wang wrote: >> This patch tries to implement an device IOTLB for vhost. This could be >> used with for co-operation with userspace(qemu) implementation of >> iommu for a secure DMA environment in guest. >> >> The idea is simple. When vhost meets an IOTLB miss, it will request >> the assistance of userspace to do the translation, this is done >> through: >> >> - Fill the translation request in a preset userspace address (This >> address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY). >> - Notify userspace through eventfd (This eventfd was set through ioctl >> VHOST_SET_IOTLB_FD). >> >> When userspace finishes the translation, it will update the vhost >> IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of >> snooping the IOTLB invalidation of IOMMU IOTLB and use >> VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost. >> >> For simplicity, IOTLB was implemented with a simple hash array. The >> index were calculated from IOVA page frame number which can only works >> at PAGE_SIZE level. >> >> An qemu implementation (for reference) is available at: >> git@xxxxxxxxxx:jasowang/qemu.git iommu >> >> TODO & Known issues: >> >> - read/write permission validation was not implemented. >> - no feature negotiation. >> - VHOST_SET_MEM_TABLE is not reused (maybe there's a chance). >> - working at PAGE_SIZE level, don't support large mappings. >> - better data structure for IOTLB instead of simple hash array. >> - better API, e.g using mmap() instead of preset userspace address. >> >> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx> > Interesting. I'm working on a slightly different approach > which is direct vt-d support in vhost. I've considered this approach. May have advantages but the issues here is vt-d emulation is even in-complete in qemu and I believe we don't want to duplicate the code in both vhost-kernel and vhost-user? > This one has the advantage of being more portable. Right, the patch tries to be architecture independent. > >> --- >> drivers/vhost/net.c | 2 +- >> drivers/vhost/vhost.c | 190 ++++++++++++++++++++++++++++++++++++++++++++- >> drivers/vhost/vhost.h | 13 ++++ >> include/uapi/linux/vhost.h | 26 +++++++ >> 4 files changed, 229 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c >> index 9eda69e..a172be9 100644 >> --- a/drivers/vhost/net.c >> +++ b/drivers/vhost/net.c >> @@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, >> r = vhost_dev_ioctl(&n->dev, ioctl, argp); >> if (r == -ENOIOCTLCMD) >> r = vhost_vring_ioctl(&n->dev, ioctl, argp); >> - else >> + else if (ioctl != VHOST_UPDATE_IOTLB) >> vhost_net_flush(n); >> mutex_unlock(&n->dev.mutex); >> return r; >> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >> index eec2f11..729fe05 100644 >> --- a/drivers/vhost/vhost.c >> +++ b/drivers/vhost/vhost.c >> @@ -113,6 +113,11 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq) >> } >> #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */ >> >> +static inline int vhost_iotlb_hash(u64 iova) >> +{ >> + return (iova >> PAGE_SHIFT) & (VHOST_IOTLB_SIZE - 1); >> +} >> + >> static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, >> poll_table *pt) >> { >> @@ -384,8 +389,14 @@ void vhost_dev_init(struct vhost_dev *dev, >> dev->memory = NULL; >> dev->mm = NULL; >> spin_lock_init(&dev->work_lock); >> + spin_lock_init(&dev->iotlb_lock); >> + mutex_init(&dev->iotlb_req_mutex); >> INIT_LIST_HEAD(&dev->work_list); >> dev->worker = NULL; >> + dev->iotlb_request = NULL; >> + dev->iotlb_ctx = NULL; >> + dev->iotlb_file = NULL; >> + dev->pending_request.flags.type = VHOST_IOTLB_INVALIDATE; >> >> for (i = 0; i < dev->nvqs; ++i) { >> vq = dev->vqs[i]; >> @@ -393,12 +404,17 @@ void vhost_dev_init(struct vhost_dev *dev, >> vq->indirect = NULL; >> vq->heads = NULL; >> vq->dev = dev; >> + vq->iotlb_request = NULL; >> mutex_init(&vq->mutex); >> vhost_vq_reset(dev, vq); >> if (vq->handle_kick) >> vhost_poll_init(&vq->poll, vq->handle_kick, >> POLLIN, dev); >> } >> + >> + init_completion(&dev->iotlb_completion); >> + for (i = 0; i < VHOST_IOTLB_SIZE; i++) >> + dev->iotlb[i].flags.valid = VHOST_IOTLB_INVALID; >> } >> EXPORT_SYMBOL_GPL(vhost_dev_init); >> >> @@ -940,9 +956,10 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) >> { >> struct file *eventfp, *filep = NULL; >> struct eventfd_ctx *ctx = NULL; >> + struct vhost_iotlb_entry entry; >> u64 p; >> long r; >> - int i, fd; >> + int index, i, fd; >> >> /* If you are not the owner, you can become one */ >> if (ioctl == VHOST_SET_OWNER) { >> @@ -1008,6 +1025,80 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) >> if (filep) >> fput(filep); >> break; >> + case VHOST_SET_IOTLB_FD: >> + r = get_user(fd, (int __user *)argp); >> + if (r < 0) >> + break; >> + eventfp = fd == -1 ? NULL : eventfd_fget(fd); >> + if (IS_ERR(eventfp)) { >> + r = PTR_ERR(eventfp); >> + break; >> + } >> + if (eventfp != d->iotlb_file) { >> + filep = d->iotlb_file; >> + d->iotlb_file = eventfp; >> + ctx = d->iotlb_ctx; >> + d->iotlb_ctx = eventfp ? >> + eventfd_ctx_fileget(eventfp) : NULL; >> + } else >> + filep = eventfp; >> + for (i = 0; i < d->nvqs; ++i) { >> + mutex_lock(&d->vqs[i]->mutex); >> + d->vqs[i]->iotlb_ctx = d->iotlb_ctx; >> + mutex_unlock(&d->vqs[i]->mutex); >> + } >> + if (ctx) >> + eventfd_ctx_put(ctx); >> + if (filep) >> + fput(filep); >> + break; >> + case VHOST_SET_IOTLB_REQUEST_ENTRY: >> + if (!access_ok(VERIFY_READ, argp, sizeof(*d->iotlb_request))) >> + return -EFAULT; >> + if (!access_ok(VERIFY_WRITE, argp, sizeof(*d->iotlb_request))) >> + return -EFAULT; >> + d->iotlb_request = argp; >> + for (i = 0; i < d->nvqs; ++i) { >> + mutex_lock(&d->vqs[i]->mutex); >> + d->vqs[i]->iotlb_request = argp; >> + mutex_unlock(&d->vqs[i]->mutex); >> + } >> + break; >> + case VHOST_UPDATE_IOTLB: >> + r = copy_from_user(&entry, argp, sizeof(entry)); >> + if (r < 0) { >> + r = -EFAULT; >> + goto done; >> + } >> + >> + index = vhost_iotlb_hash(entry.iova); >> + >> + spin_lock(&d->iotlb_lock); >> + switch (entry.flags.type) { >> + case VHOST_IOTLB_UPDATE: >> + d->iotlb[index] = entry; >> + break; >> + case VHOST_IOTLB_INVALIDATE: >> + if (d->iotlb[index].iova == entry.iova) >> + d->iotlb[index] = entry; >> + break; >> + default: >> + r = -EINVAL; >> + } >> + spin_unlock(&d->iotlb_lock); >> + >> + if (!r && entry.flags.type != VHOST_IOTLB_INVALIDATE) { >> + mutex_lock(&d->iotlb_req_mutex); >> + if (entry.iova == d->pending_request.iova && >> + d->pending_request.flags.type == >> + VHOST_IOTLB_MISS) { >> + d->pending_request = entry; >> + complete(&d->iotlb_completion); >> + } >> + mutex_unlock(&d->iotlb_req_mutex); >> + } >> + >> + break; >> default: >> r = -ENOIOCTLCMD; >> break; >> @@ -1177,9 +1268,104 @@ int vhost_init_used(struct vhost_virtqueue *vq) >> } >> EXPORT_SYMBOL_GPL(vhost_init_used); >> >> +static struct vhost_iotlb_entry vhost_iotlb_miss(struct vhost_virtqueue *vq, >> + u64 iova) >> +{ >> + struct completion *c = &vq->dev->iotlb_completion; >> + struct vhost_iotlb_entry *pending = &vq->dev->pending_request; >> + struct vhost_iotlb_entry entry = { >> + .flags.valid = VHOST_IOTLB_INVALID, >> + }; >> + >> + mutex_lock(&vq->dev->iotlb_req_mutex); >> + >> + if (!vq->iotlb_ctx) >> + goto err; >> + >> + if (!vq->dev->iotlb_request) >> + goto err; >> + >> + if (pending->flags.type == VHOST_IOTLB_MISS) >> + goto err; >> + >> + pending->iova = iova & PAGE_MASK; >> + pending->flags.type = VHOST_IOTLB_MISS; >> + >> + if (copy_to_user(vq->dev->iotlb_request, pending, >> + sizeof(struct vhost_iotlb_entry))) { >> + goto err; >> + } >> + >> + mutex_unlock(&vq->dev->iotlb_req_mutex); >> + >> + eventfd_signal(vq->iotlb_ctx, 1); >> + wait_for_completion_interruptible(c); > This can still be under vq lock, can it not? > Looks like this can cause deadlocks. Yes, looks like a solution here is completing the pending translation request and make it fail if there's an ioctl other than IOTLB updating. > >> + >> + mutex_lock(&vq->dev->iotlb_req_mutex); >> + entry = vq->dev->pending_request; >> + mutex_unlock(&vq->dev->iotlb_req_mutex); >> + >> + return entry; >> +err: >> + mutex_unlock(&vq->dev->iotlb_req_mutex); >> + return entry; >> +} >> + >> +static int translate_iotlb(struct vhost_virtqueue *vq, u64 iova, u32 len, >> + struct iovec iov[], int iov_size) >> +{ >> + struct vhost_iotlb_entry *entry; >> + struct vhost_iotlb_entry miss; >> + struct vhost_dev *dev = vq->dev; >> + int ret = 0; >> + u64 s = 0, size; >> + >> + spin_lock(&dev->iotlb_lock); >> + >> + while ((u64) len > s) { >> + if (unlikely(ret >= iov_size)) { >> + ret = -ENOBUFS; >> + break; >> + } >> + entry = &vq->dev->iotlb[vhost_iotlb_hash(iova)]; >> + if ((entry->iova != (iova & PAGE_MASK)) || >> + (entry->flags.valid != VHOST_IOTLB_VALID)) { >> + >> + spin_unlock(&dev->iotlb_lock); >> + miss = vhost_iotlb_miss(vq, iova); >> + spin_lock(&dev->iotlb_lock); >> + >> + if (miss.flags.valid != VHOST_IOTLB_VALID || >> + miss.iova != (iova & PAGE_MASK)) { >> + ret = -EFAULT; >> + goto err; >> + } >> + entry = &miss; >> + } >> + >> + if (entry->iova == (iova & PAGE_MASK)) { >> + size = entry->userspace_addr + entry->size - iova; >> + iov[ret].iov_base = >> + (void __user *)(entry->userspace_addr + >> + (iova & (PAGE_SIZE - 1))); >> + iov[ret].iov_len = min((u64)len - s, size); >> + s += size; >> + iova += size; >> + ret++; >> + } else { >> + BUG(); >> + } >> + } >> + >> +err: >> + spin_unlock(&dev->iotlb_lock); >> + return ret; >> +} >> + >> static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, >> struct iovec iov[], int iov_size) >> { >> +#if 0 >> const struct vhost_memory_region *reg; >> struct vhost_memory *mem; >> struct iovec *_iov; >> @@ -1209,6 +1395,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, >> } >> >> return ret; >> +#endif >> + return translate_iotlb(vq, addr, len, iov, iov_size); >> } >> >> /* Each buffer in the virtqueues is actually a chain of descriptors. This >> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h >> index d3f7674..d254efc 100644 >> --- a/drivers/vhost/vhost.h >> +++ b/drivers/vhost/vhost.h >> @@ -68,6 +68,8 @@ struct vhost_virtqueue { >> struct eventfd_ctx *call_ctx; >> struct eventfd_ctx *error_ctx; >> struct eventfd_ctx *log_ctx; >> + struct eventfd_ctx *iotlb_ctx; >> + struct vhost_iotlb __user *iotlb_request; >> >> struct vhost_poll poll; >> >> @@ -116,6 +118,8 @@ struct vhost_virtqueue { >> #endif >> }; >> >> +#define VHOST_IOTLB_SIZE 1024 >> + >> struct vhost_dev { >> struct vhost_memory *memory; >> struct mm_struct *mm; > > >> @@ -124,9 +128,18 @@ struct vhost_dev { >> int nvqs; >> struct file *log_file; >> struct eventfd_ctx *log_ctx; >> + struct file *iotlb_file; >> + struct eventfd_ctx *iotlb_ctx; >> + struct mutex iotlb_req_mutex; >> + struct vhost_iotlb_entry __user *iotlb_request; >> + struct vhost_iotlb_entry pending_request; >> + struct completion iotlb_completion; >> + struct vhost_iotlb_entry request; >> spinlock_t work_lock; >> struct list_head work_list; >> struct task_struct *worker; >> + spinlock_t iotlb_lock; >> + struct vhost_iotlb_entry iotlb[VHOST_IOTLB_SIZE]; >> }; >> >> void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); >> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h >> index ab373191..400e513 100644 >> --- a/include/uapi/linux/vhost.h >> +++ b/include/uapi/linux/vhost.h >> @@ -63,6 +63,26 @@ struct vhost_memory { >> struct vhost_memory_region regions[0]; >> }; >> >> +struct vhost_iotlb_entry { >> + __u64 iova; >> + __u64 size; >> + __u64 userspace_addr; >> + struct { >> +#define VHOST_IOTLB_PERM_READ 0x1 >> +#define VHOST_IOTLB_PERM_WRITE 0x10 >> + __u8 perm; >> +#define VHOST_IOTLB_MISS 1 >> +#define VHOST_IOTLB_UPDATE 2 >> +#define VHOST_IOTLB_INVALIDATE 3 >> + __u8 type; >> +#define VHOST_IOTLB_INVALID 0x1 >> +#define VHOST_IOTLB_VALID 0x2 >> + __u8 valid; >> + __u8 u8_padding; >> + __u32 padding; >> + } flags; >> +}; >> + >> /* ioctls */ >> >> #define VHOST_VIRTIO 0xAF >> @@ -127,6 +147,12 @@ struct vhost_memory { >> /* Set eventfd to signal an error */ >> #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) >> >> +/* IOTLB */ >> +/* Specify an eventfd file descriptor to signle on IOTLB miss */ >> +#define VHOST_SET_IOTLB_FD _IOW(VHOST_VIRTIO, 0x23, int) >> +#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct vhost_iotlb_entry) >> +#define VHOST_SET_IOTLB_REQUEST_ENTRY _IOW(VHOST_VIRTIO, 0x25, struct vhost_iotlb_entry) >> + >> /* VHOST_NET specific defines */ >> >> /* Attach virtio net ring to a raw socket, or tap device. >> -- >> 2.5.0 > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html