Re: [RFC PATCH v2 3/3] VFIO: Type1 IOMMU mapping support for vGPU

Jike Song <jike.song@xxxxxxxxx> · Wed, 02 Mar 2016 16:38:34 +0800

On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
> Aim of this module is to pin and unpin guest memory.
> This module provides interface to GPU driver that can be used to map guest
> physical memory into its kernel space driver.
> Currently this module has duplicate code from vfio_iommu_type1.c
> Working on refining functions to reuse existing code in vfio_iommu_type1.c and
> with that will add API to unpin pages.
> This is for the reference to review the overall design of vGPU.
> 
> Thanks,
> Kirti.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
> ---
>  drivers/vgpu/Makefile                |    1 +
>  drivers/vgpu/vfio_iommu_type1_vgpu.c |  423 ++++++++++++++++++++++++++++++++++
>  2 files changed, 424 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vgpu/vfio_iommu_type1_vgpu.c
> 
> diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
> index a0a2655..8ace18d 100644
> --- a/drivers/vgpu/Makefile
> +++ b/drivers/vgpu/Makefile
> @@ -3,3 +3,4 @@ vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
>  
>  obj-$(CONFIG_VGPU)			+= vgpu.o
>  obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o
> +obj-$(CONFIG_VFIO_IOMMU_TYPE1_VGPU)     += vfio_iommu_type1_vgpu.o
> diff --git a/drivers/vgpu/vfio_iommu_type1_vgpu.c b/drivers/vgpu/vfio_iommu_type1_vgpu.c
> new file mode 100644
> index 0000000..0b36ae5
> --- /dev/null
> +++ b/drivers/vgpu/vfio_iommu_type1_vgpu.c
> @@ -0,0 +1,423 @@
> +/*
> + * VGPU : IOMMU DMA mapping support for VGPU
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@xxxxxxxxxx>
> + *	       Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/miscdevice.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +#define DRIVER_VERSION	"0.1"
> +#define DRIVER_AUTHOR	"NVIDIA Corporation"
> +#define DRIVER_DESC     "VGPU Type1 IOMMU driver for VFIO"
> +
> +// VFIO structures
> +
> +struct vfio_iommu_vgpu {
> +	struct mutex lock;
> +	struct iommu_group *group;
> +	struct vgpu_device *vgpu_dev;
> +	struct rb_root dma_list;
> +	struct mm_struct * vm_mm;
> +};
> +
> +struct vgpu_vfio_dma {
> +	struct rb_node node;
> +	dma_addr_t iova;
> +	unsigned long vaddr;
> +	size_t size;
> +	int prot;
> +};
> +
> +/*
> + * VGPU VFIO FOPs definition
> + *
> + */
> +
> +/*
> + * Duplicated from vfio_link_dma, just quick hack ... should
> + * reuse code later
> + */
> +
> +static void vgpu_link_dma(struct vfio_iommu_vgpu *iommu,
> +			  struct vgpu_vfio_dma *new)
> +{
> +	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> +	struct vgpu_vfio_dma *dma;
> +
> +	while (*link) {
> +		parent = *link;
> +		dma = rb_entry(parent, struct vgpu_vfio_dma, node);
> +
> +		if (new->iova + new->size <= dma->iova)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &iommu->dma_list);
> +}
> +
> +static struct vgpu_vfio_dma *vgpu_find_dma(struct vfio_iommu_vgpu *iommu,
> +					   dma_addr_t start, size_t size)
> +{
> +	struct rb_node *node = iommu->dma_list.rb_node;
> +
> +	while (node) {
> +		struct vgpu_vfio_dma *dma = rb_entry(node, struct vgpu_vfio_dma, node);
> +
> +		if (start + size <= dma->iova)
> +			node = node->rb_left;
> +		else if (start >= dma->iova + dma->size)
> +			node = node->rb_right;
> +		else
> +			return dma;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vgpu_unlink_dma(struct vfio_iommu_vgpu *iommu, struct vgpu_vfio_dma *old)
> +{
> +	rb_erase(&old->node, &iommu->dma_list);
> +}
> +
> +static void vgpu_dump_dma(struct vfio_iommu_vgpu *iommu)
> +{
> +	struct vgpu_vfio_dma *c, *n;
> +	uint32_t i = 0;
> +
> +	rbtree_postorder_for_each_entry_safe(c, n, &iommu->dma_list, node)
> +		printk(KERN_INFO "%s: dma[%d] iova:0x%llx, vaddr:0x%lx, size:0x%lx\n",
> +		       __FUNCTION__, i++, c->iova, c->vaddr, c->size);
> +}
> +
> +static int vgpu_dma_do_track(struct vfio_iommu_vgpu * vgpu_iommu,
> +	struct vfio_iommu_type1_dma_map *map)
> +{
> +	dma_addr_t iova = map->iova;
> +	unsigned long vaddr = map->vaddr;
> +	int ret = 0, prot = 0;
> +	struct vgpu_vfio_dma *vgpu_dma;
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	if (vgpu_find_dma(vgpu_iommu, map->iova, map->size)) {
> +		mutex_unlock(&vgpu_iommu->lock);
> +		return -EEXIST;
> +	}
> +
> +	vgpu_dma = kzalloc(sizeof(*vgpu_dma), GFP_KERNEL);
> +
> +	if (!vgpu_dma) {
> +		mutex_unlock(&vgpu_iommu->lock);
> +		return -ENOMEM;
> +	}
> +
> +	vgpu_dma->iova = iova;
> +	vgpu_dma->vaddr = vaddr;
> +	vgpu_dma->prot = prot;
> +	vgpu_dma->size = map->size;
> +
> +	vgpu_link_dma(vgpu_iommu, vgpu_dma);

Hi Kirti & Neo,

seems that no one actually setup mappings for IOMMU here?

> +
> +	mutex_unlock(&vgpu_iommu->lock);
> +	return ret;
> +}
> +
> +static int vgpu_dma_do_untrack(struct vfio_iommu_vgpu * vgpu_iommu,
> +	struct vfio_iommu_type1_dma_unmap *unmap)
> +{
> +	struct vgpu_vfio_dma *vgpu_dma;
> +	size_t unmapped = 0;
> +	int ret = 0;
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, 0);
> +	if (vgpu_dma && vgpu_dma->iova != unmap->iova) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova + unmap->size - 1, 0);
> +	if (vgpu_dma && vgpu_dma->iova + vgpu_dma->size != unmap->iova + unmap->size) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	while (( vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, unmap->size))) {
> +		unmapped += vgpu_dma->size;
> +		vgpu_unlink_dma(vgpu_iommu, vgpu_dma);
> +	}
> +
> +unlock:
> +	mutex_unlock(&vgpu_iommu->lock);
> +	unmap->size = unmapped;
> +
> +	return ret;
> +}
> +
> +/* Ugly hack to quickly test single deivce ... */
> +
> +static struct vfio_iommu_vgpu *_local_iommu = NULL;
> +
> +int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count)
> +{
> +	int i = 0, ret = 0, prot = 0;
> +	unsigned long remote_vaddr = 0, pfn = 0;
> +	struct vfio_iommu_vgpu *vgpu_iommu = _local_iommu;
> +	struct vgpu_vfio_dma *vgpu_dma;
> +	struct page *page[1];
> +	// unsigned long * addr = NULL;
> +	struct mm_struct *mm = vgpu_iommu->vm_mm;
> +
> +	prot = IOMMU_READ | IOMMU_WRITE;
> +
> +	printk(KERN_INFO "%s: >>>>\n", __FUNCTION__);
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	vgpu_dump_dma(vgpu_iommu);
> +
> +	for (i = 0; i < count; i++) {
> +		dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
> +		vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);
> +
> +		if (!vgpu_dma) {
> +			printk(KERN_INFO "%s: fail locate iova[%d]:0x%llx\n", __FUNCTION__, i, iova);
> +			ret = -EINVAL;
> +			goto unlock;
> +		}
> +
> +		remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
> +		printk(KERN_INFO "%s: find dma iova[%d]:0x%llx, vaddr:0x%lx, size:0x%lx, remote_vaddr:0x%lx\n",
> +			__FUNCTION__, i, vgpu_dma->iova,
> +			vgpu_dma->vaddr, vgpu_dma->size, remote_vaddr);
> +
> +		if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
> +			pfn = page_to_pfn(page[0]);
> +			printk(KERN_INFO "%s: pfn[%d]:0x%lx\n", __FUNCTION__, i, pfn);
> +			// addr = vmap(page, 1, VM_MAP, PAGE_KERNEL);
> +		}
> +		else {
> +			printk(KERN_INFO "%s: fail to pin pfn[%d]\n", __FUNCTION__, i);
> +			ret = -ENOMEM;
> +			goto unlock;
> +		}
> +
> +		gfn_buffer[i] = pfn;
> +		// vunmap(addr);
> +
> +	}
> +
> +unlock:
> +	mutex_unlock(&vgpu_iommu->lock);
> +	printk(KERN_INFO "%s: <<<<\n", __FUNCTION__);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vgpu_dma_do_translate);
> +
> +static void *vfio_iommu_vgpu_open(unsigned long arg)
> +{
> +	struct vfio_iommu_vgpu *iommu;
> +
> +	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> +
> +	if (!iommu)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&iommu->lock);
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +
> +	/* TODO: Keep track the v2 vs. v1, for now only assume
> +	 * we are v2 due to QEMU code */
> +	_local_iommu = iommu;
> +	return iommu;
> +}
> +
> +static void vfio_iommu_vgpu_release(void *iommu_data)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +	kfree(iommu);
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +}
> +
> +static long vfio_iommu_vgpu_ioctl(void *iommu_data,
> +		unsigned int cmd, unsigned long arg)
> +{
> +	int ret = 0;
> +	unsigned long minsz;
> +	struct vfio_iommu_vgpu *vgpu_iommu = iommu_data;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION:
> +	{
> +		if ((arg == VFIO_TYPE1_IOMMU) || (arg == VFIO_TYPE1v2_IOMMU))
> +			return 1;
> +		else
> +			return 0;
> +	}
> +
> +	case VFIO_IOMMU_GET_INFO:
> +	{
> +		struct vfio_iommu_type1_info info;
> +		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.flags = 0;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_IOMMU_MAP_DMA:
> +	{
> +		// TODO
> +		struct vfio_iommu_type1_dma_map map;
> +		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
> +
> +		if (copy_from_user(&map, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (map.argsz < minsz)
> +			return -EINVAL;
> +
> +		printk(KERN_INFO "VGPU-IOMMU:MAP_DMA flags:%d, vaddr:0x%llx, iova:0x%llx, size:0x%llx\n",
> +			map.flags, map.vaddr, map.iova, map.size);
> +
> +		/*
> +		 * TODO: Tracking code is mostly duplicated from TYPE1 IOMMU, ideally,
> +		 * this should be merged into one single file and reuse data
> +		 * structure
> +		 *
> +		 */
> +		ret = vgpu_dma_do_track(vgpu_iommu, &map);
> +		break;
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA:
> +	{
> +		// TODO
> +		struct vfio_iommu_type1_dma_unmap unmap;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
> +
> +		if (copy_from_user(&unmap, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (unmap.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = vgpu_dma_do_untrack(vgpu_iommu, &unmap);
> +		break;
> +	}
> +	default:
> +	{
> +		printk(KERN_INFO "%s cmd default ", __FUNCTION__);
> +		ret = -ENOTTY;
> +		break;
> +	}
> +	}
> +
> +	return ret;
> +}
> +
> +
> +static int vfio_iommu_vgpu_attach_group(void *iommu_data,
> +		                        struct iommu_group *iommu_group)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +	struct vgpu_device *vgpu_dev = NULL;
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +
> +	vgpu_dev = get_vgpu_device_from_group(iommu_group);
> +	if (vgpu_dev) {
> +		iommu->vgpu_dev = vgpu_dev;
> +		iommu->group = iommu_group;
> +
> +		/* IOMMU shares the same life cylce as VM MM */
> +		iommu->vm_mm = current->mm;
> +
> +		return 0;
> +	}
> +	iommu->group = iommu_group;
> +	return 1;
> +}
> +
> +static void vfio_iommu_vgpu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +	iommu->vm_mm = NULL;
> +	iommu->group = NULL;
> +
> +	return;
> +}
> +
> +
> +static const struct vfio_iommu_driver_ops vfio_iommu_vgpu_driver_ops = {
> +	.name           = "vgpu_vfio",
> +	.owner          = THIS_MODULE,
> +	.open           = vfio_iommu_vgpu_open,
> +	.release        = vfio_iommu_vgpu_release,
> +	.ioctl          = vfio_iommu_vgpu_ioctl,
> +	.attach_group   = vfio_iommu_vgpu_attach_group,
> +	.detach_group   = vfio_iommu_vgpu_detach_group,
> +};
> +
> +
> +int vgpu_vfio_iommu_init(void)
> +{
> +	int rc = vfio_register_iommu_driver(&vfio_iommu_vgpu_driver_ops);
> +
> +	printk(KERN_INFO "%s\n", __FUNCTION__);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Error: failed to register vfio iommu, err:%d\n", rc);
> +	}
> +
> +	return rc;
> +}
> +
> +void vgpu_vfio_iommu_exit(void)
> +{
> +	// unregister vgpu_vfio driver
> +	vfio_unregister_iommu_driver(&vfio_iommu_vgpu_driver_ops);
> +	printk(KERN_INFO "%s\n", __FUNCTION__);
> +}
> +
> +
> +module_init(vgpu_vfio_iommu_init);
> +module_exit(vgpu_vfio_iommu_exit);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> 

--
Thanks,
Jike

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html