Re: [PATCH RFC] vhost: basic device IOTLB support

Jason Wang <jasowang@xxxxxxxxxx> · Mon, 4 Jan 2016 14:08:15 +0800

On 12/31/2015 07:17 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 31, 2015 at 03:13:45PM +0800, Jason Wang wrote:
>> This patch tries to implement an device IOTLB for vhost. This could be
>> used with for co-operation with userspace(qemu) implementation of
>> iommu for a secure DMA environment in guest.
>>
>> The idea is simple. When vhost meets an IOTLB miss, it will request
>> the assistance of userspace to do the translation, this is done
>> through:
>>
>> - Fill the translation request in a preset userspace address (This
>>   address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY).
>> - Notify userspace through eventfd (This eventfd was set through ioctl
>>   VHOST_SET_IOTLB_FD).
>>
>> When userspace finishes the translation, it will update the vhost
>> IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of
>> snooping the IOTLB invalidation of IOMMU IOTLB and use
>> VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost.
>>
>> For simplicity, IOTLB was implemented with a simple hash array. The
>> index were calculated from IOVA page frame number which can only works
>> at PAGE_SIZE level.
>>
>> An qemu implementation (for reference) is available at:
>> git@xxxxxxxxxx:jasowang/qemu.git iommu
>>
>> TODO & Known issues:
>>
>> - read/write permission validation was not implemented.
>> - no feature negotiation.
>> - VHOST_SET_MEM_TABLE is not reused (maybe there's a chance).
>> - working at PAGE_SIZE level, don't support large mappings.
>> - better data structure for IOTLB instead of simple hash array.
>> - better API, e.g using mmap() instead of preset userspace address.
>>
>> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
> Interesting. I'm working on a slightly different approach
> which is direct vt-d support in vhost.

I've considered this approach. May have advantages but the issues here
is vt-d emulation is even in-complete in qemu and I believe we don't
want to duplicate the code in both vhost-kernel and vhost-user?

> This one has the advantage of being more portable.

Right, the patch tries to be architecture independent.

>
>> ---
>>  drivers/vhost/net.c        |   2 +-
>>  drivers/vhost/vhost.c      | 190 ++++++++++++++++++++++++++++++++++++++++++++-
>>  drivers/vhost/vhost.h      |  13 ++++
>>  include/uapi/linux/vhost.h |  26 +++++++
>>  4 files changed, 229 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 9eda69e..a172be9 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
>>  		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
>>  		if (r == -ENOIOCTLCMD)
>>  			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
>> -		else
>> +		else if (ioctl != VHOST_UPDATE_IOTLB)
>>  			vhost_net_flush(n);
>>  		mutex_unlock(&n->dev.mutex);
>>  		return r;
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index eec2f11..729fe05 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -113,6 +113,11 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
>>  }
>>  #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
>>  
>> +static inline int vhost_iotlb_hash(u64 iova)
>> +{
>> +	return (iova >> PAGE_SHIFT) & (VHOST_IOTLB_SIZE - 1);
>> +}
>> +
>>  static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
>>  			    poll_table *pt)
>>  {
>> @@ -384,8 +389,14 @@ void vhost_dev_init(struct vhost_dev *dev,
>>  	dev->memory = NULL;
>>  	dev->mm = NULL;
>>  	spin_lock_init(&dev->work_lock);
>> +	spin_lock_init(&dev->iotlb_lock);
>> +	mutex_init(&dev->iotlb_req_mutex);
>>  	INIT_LIST_HEAD(&dev->work_list);
>>  	dev->worker = NULL;
>> +	dev->iotlb_request = NULL;
>> +	dev->iotlb_ctx = NULL;
>> +	dev->iotlb_file = NULL;
>> +	dev->pending_request.flags.type = VHOST_IOTLB_INVALIDATE;
>>  
>>  	for (i = 0; i < dev->nvqs; ++i) {
>>  		vq = dev->vqs[i];
>> @@ -393,12 +404,17 @@ void vhost_dev_init(struct vhost_dev *dev,
>>  		vq->indirect = NULL;
>>  		vq->heads = NULL;
>>  		vq->dev = dev;
>> +		vq->iotlb_request = NULL;
>>  		mutex_init(&vq->mutex);
>>  		vhost_vq_reset(dev, vq);
>>  		if (vq->handle_kick)
>>  			vhost_poll_init(&vq->poll, vq->handle_kick,
>>  					POLLIN, dev);
>>  	}
>> +
>> +	init_completion(&dev->iotlb_completion);
>> +	for (i = 0; i < VHOST_IOTLB_SIZE; i++)
>> +		dev->iotlb[i].flags.valid = VHOST_IOTLB_INVALID;
>>  }
>>  EXPORT_SYMBOL_GPL(vhost_dev_init);
>>  
>> @@ -940,9 +956,10 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>>  {
>>  	struct file *eventfp, *filep = NULL;
>>  	struct eventfd_ctx *ctx = NULL;
>> +	struct vhost_iotlb_entry entry;
>>  	u64 p;
>>  	long r;
>> -	int i, fd;
>> +	int index, i, fd;
>>  
>>  	/* If you are not the owner, you can become one */
>>  	if (ioctl == VHOST_SET_OWNER) {
>> @@ -1008,6 +1025,80 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>>  		if (filep)
>>  			fput(filep);
>>  		break;
>> +	case VHOST_SET_IOTLB_FD:
>> +		r = get_user(fd, (int __user *)argp);
>> +		if (r < 0)
>> +			break;
>> +		eventfp = fd == -1 ? NULL : eventfd_fget(fd);
>> +		if (IS_ERR(eventfp)) {
>> +			r = PTR_ERR(eventfp);
>> +			break;
>> +		}
>> +		if (eventfp != d->iotlb_file) {
>> +			filep = d->iotlb_file;
>> +			d->iotlb_file = eventfp;
>> +			ctx = d->iotlb_ctx;
>> +			d->iotlb_ctx = eventfp ?
>> +				eventfd_ctx_fileget(eventfp) : NULL;
>> +		} else
>> +			filep = eventfp;
>> +		for (i = 0; i < d->nvqs; ++i) {
>> +			mutex_lock(&d->vqs[i]->mutex);
>> +			d->vqs[i]->iotlb_ctx = d->iotlb_ctx;
>> +			mutex_unlock(&d->vqs[i]->mutex);
>> +		}
>> +		if (ctx)
>> +			eventfd_ctx_put(ctx);
>> +		if (filep)
>> +			fput(filep);
>> +		break;
>> +	case VHOST_SET_IOTLB_REQUEST_ENTRY:
>> +		if (!access_ok(VERIFY_READ, argp, sizeof(*d->iotlb_request)))
>> +			return -EFAULT;
>> +		if (!access_ok(VERIFY_WRITE, argp, sizeof(*d->iotlb_request)))
>> +			return -EFAULT;
>> +		d->iotlb_request = argp;
>> +		for (i = 0; i < d->nvqs; ++i) {
>> +			mutex_lock(&d->vqs[i]->mutex);
>> +			d->vqs[i]->iotlb_request = argp;
>> +			mutex_unlock(&d->vqs[i]->mutex);
>> +		}
>> +		break;
>> +	case VHOST_UPDATE_IOTLB:
>> +		r = copy_from_user(&entry, argp, sizeof(entry));
>> +		if (r < 0) {
>> +			r = -EFAULT;
>> +			goto done;
>> +		}
>> +
>> +		index = vhost_iotlb_hash(entry.iova);
>> +
>> +		spin_lock(&d->iotlb_lock);
>> +		switch (entry.flags.type) {
>> +		case VHOST_IOTLB_UPDATE:
>> +			d->iotlb[index] = entry;
>> +			break;
>> +		case VHOST_IOTLB_INVALIDATE:
>> +			if (d->iotlb[index].iova == entry.iova)
>> +				d->iotlb[index] = entry;
>> +			break;
>> +		default:
>> +			r = -EINVAL;
>> +		}
>> +		spin_unlock(&d->iotlb_lock);
>> +
>> +		if (!r && entry.flags.type != VHOST_IOTLB_INVALIDATE) {
>> +			mutex_lock(&d->iotlb_req_mutex);
>> +			if (entry.iova == d->pending_request.iova &&
>> +			    d->pending_request.flags.type ==
>> +				VHOST_IOTLB_MISS) {
>> +				d->pending_request = entry;
>> +				complete(&d->iotlb_completion);
>> +			}
>> +			mutex_unlock(&d->iotlb_req_mutex);
>> +		}
>> +
>> +		break;
>>  	default:
>>  		r = -ENOIOCTLCMD;
>>  		break;
>> @@ -1177,9 +1268,104 @@ int vhost_init_used(struct vhost_virtqueue *vq)
>>  }
>>  EXPORT_SYMBOL_GPL(vhost_init_used);
>>  
>> +static struct vhost_iotlb_entry vhost_iotlb_miss(struct vhost_virtqueue *vq,
>> +						 u64 iova)
>> +{
>> +	struct completion *c = &vq->dev->iotlb_completion;
>> +	struct vhost_iotlb_entry *pending = &vq->dev->pending_request;
>> +	struct vhost_iotlb_entry entry = {
>> +		.flags.valid = VHOST_IOTLB_INVALID,
>> +	};
>> +
>> +	mutex_lock(&vq->dev->iotlb_req_mutex);
>> +
>> +	if (!vq->iotlb_ctx)
>> +		goto err;
>> +
>> +	if (!vq->dev->iotlb_request)
>> +		goto err;
>> +
>> +	if (pending->flags.type == VHOST_IOTLB_MISS)
>> +		goto err;
>> +
>> +	pending->iova = iova & PAGE_MASK;
>> +	pending->flags.type = VHOST_IOTLB_MISS;
>> +
>> +	if (copy_to_user(vq->dev->iotlb_request, pending,
>> +			 sizeof(struct vhost_iotlb_entry))) {
>> +		goto err;
>> +	}
>> +
>> +	mutex_unlock(&vq->dev->iotlb_req_mutex);
>> +
>> +	eventfd_signal(vq->iotlb_ctx, 1);
>> +	wait_for_completion_interruptible(c);
> This can still be under vq lock, can it not?
> Looks like this can cause deadlocks.

Yes, looks like a solution here is completing the pending translation
request and make it fail if there's an ioctl other than IOTLB updating.

>
>> +
>> +	mutex_lock(&vq->dev->iotlb_req_mutex);
>> +	entry = vq->dev->pending_request;
>> +	mutex_unlock(&vq->dev->iotlb_req_mutex);
>> +
>> +	return entry;
>> +err:
>> +	mutex_unlock(&vq->dev->iotlb_req_mutex);
>> +	return entry;
>> +}
>> +
>> +static int translate_iotlb(struct vhost_virtqueue *vq, u64 iova, u32 len,
>> +			   struct iovec iov[], int iov_size)
>> +{
>> +	struct vhost_iotlb_entry *entry;
>> +	struct vhost_iotlb_entry miss;
>> +	struct vhost_dev *dev = vq->dev;
>> +	int ret = 0;
>> +	u64 s = 0, size;
>> +
>> +	spin_lock(&dev->iotlb_lock);
>> +
>> +	while ((u64) len > s) {
>> +		if (unlikely(ret >= iov_size)) {
>> +			ret = -ENOBUFS;
>> +			break;
>> +		}
>> +		entry = &vq->dev->iotlb[vhost_iotlb_hash(iova)];
>> +		if ((entry->iova != (iova & PAGE_MASK)) ||
>> +		    (entry->flags.valid != VHOST_IOTLB_VALID)) {
>> +
>> +			spin_unlock(&dev->iotlb_lock);
>> +			miss = vhost_iotlb_miss(vq, iova);
>> +			spin_lock(&dev->iotlb_lock);
>> +
>> +			if (miss.flags.valid != VHOST_IOTLB_VALID ||
>> +			    miss.iova != (iova & PAGE_MASK)) {
>> +				ret = -EFAULT;
>> +				goto err;
>> +			}
>> +			entry = &miss;
>> +		}
>> +
>> +		if (entry->iova == (iova & PAGE_MASK)) {
>> +			size = entry->userspace_addr + entry->size - iova;
>> +			iov[ret].iov_base =
>> +				(void __user *)(entry->userspace_addr +
>> +						(iova & (PAGE_SIZE - 1)));
>> +			iov[ret].iov_len = min((u64)len - s, size);
>> +			s += size;
>> +			iova += size;
>> +			ret++;
>> +		} else {
>> +			BUG();
>> +		}
>> +	}
>> +
>> +err:
>> +	spin_unlock(&dev->iotlb_lock);
>> +	return ret;
>> +}
>> +
>>  static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>>  			  struct iovec iov[], int iov_size)
>>  {
>> +#if 0
>>  	const struct vhost_memory_region *reg;
>>  	struct vhost_memory *mem;
>>  	struct iovec *_iov;
>> @@ -1209,6 +1395,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>>  	}
>>  
>>  	return ret;
>> +#endif
>> +	return translate_iotlb(vq, addr, len, iov, iov_size);
>>  }
>>  
>>  /* Each buffer in the virtqueues is actually a chain of descriptors.  This
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index d3f7674..d254efc 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -68,6 +68,8 @@ struct vhost_virtqueue {
>>  	struct eventfd_ctx *call_ctx;
>>  	struct eventfd_ctx *error_ctx;
>>  	struct eventfd_ctx *log_ctx;
>> +	struct eventfd_ctx *iotlb_ctx;
>> +	struct vhost_iotlb __user *iotlb_request;
>>  
>>  	struct vhost_poll poll;
>>  
>> @@ -116,6 +118,8 @@ struct vhost_virtqueue {
>>  #endif
>>  };
>>  
>> +#define VHOST_IOTLB_SIZE 1024
>> +
>>  struct vhost_dev {
>>  	struct vhost_memory *memory;
>>  	struct mm_struct *mm;
>
>
>> @@ -124,9 +128,18 @@ struct vhost_dev {
>>  	int nvqs;
>>  	struct file *log_file;
>>  	struct eventfd_ctx *log_ctx;
>> +	struct file *iotlb_file;
>> +	struct eventfd_ctx *iotlb_ctx;
>> +	struct mutex iotlb_req_mutex;
>> +	struct vhost_iotlb_entry __user *iotlb_request;
>> +	struct vhost_iotlb_entry pending_request;
>> +	struct completion iotlb_completion;
>> +	struct vhost_iotlb_entry request;
>>  	spinlock_t work_lock;
>>  	struct list_head work_list;
>>  	struct task_struct *worker;
>> +	spinlock_t iotlb_lock;
>> +	struct vhost_iotlb_entry iotlb[VHOST_IOTLB_SIZE];
>>  };
>>  
>>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
>> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
>> index ab373191..400e513 100644
>> --- a/include/uapi/linux/vhost.h
>> +++ b/include/uapi/linux/vhost.h
>> @@ -63,6 +63,26 @@ struct vhost_memory {
>>  	struct vhost_memory_region regions[0];
>>  };
>>  
>> +struct vhost_iotlb_entry {
>> +	__u64 iova;
>> +	__u64 size;
>> +	__u64 userspace_addr;
>> +	struct {
>> +#define VHOST_IOTLB_PERM_READ      0x1
>> +#define VHOST_IOTLB_PERM_WRITE     0x10
>> +		__u8  perm;
>> +#define VHOST_IOTLB_MISS           1
>> +#define VHOST_IOTLB_UPDATE         2
>> +#define VHOST_IOTLB_INVALIDATE     3
>> +		__u8  type;
>> +#define VHOST_IOTLB_INVALID        0x1
>> +#define VHOST_IOTLB_VALID          0x2
>> +		__u8  valid;
>> +		__u8  u8_padding;
>> +		__u32 padding;
>> +	} flags;
>> +};
>> +
>>  /* ioctls */
>>  
>>  #define VHOST_VIRTIO 0xAF
>> @@ -127,6 +147,12 @@ struct vhost_memory {
>>  /* Set eventfd to signal an error */
>>  #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
>>  
>> +/* IOTLB */
>> +/* Specify an eventfd file descriptor to signle on IOTLB miss */
>> +#define VHOST_SET_IOTLB_FD _IOW(VHOST_VIRTIO, 0x23, int)
>> +#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct vhost_iotlb_entry)
>> +#define VHOST_SET_IOTLB_REQUEST_ENTRY _IOW(VHOST_VIRTIO, 0x25, struct vhost_iotlb_entry)
>> +
>>  /* VHOST_NET specific defines */
>>  
>>  /* Attach virtio net ring to a raw socket, or tap device.
>> -- 
>> 2.5.0
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization