This provides the base group management with conduits to the IOMMU driver and VFIO bus drivers. Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> --- drivers/vfio/vfio_main.c | 1248 +++++++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio_private.h | 36 + 2 files changed, 1284 insertions(+), 0 deletions(-) create mode 100644 drivers/vfio/vfio_main.c create mode 100644 drivers/vfio/vfio_private.h diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c new file mode 100644 index 0000000..fcd6476 --- /dev/null +++ b/drivers/vfio/vfio_main.c @@ -0,0 +1,1248 @@ +/* + * VFIO framework + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@xxxxxxxxx + */ + +#include <linux/cdev.h> +#include <linux/compat.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/fs.h> +#include <linux/idr.h> +#include <linux/iommu.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/wait.h> + +#include "vfio_private.h" + +#define DRIVER_VERSION "0.2" +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@xxxxxxxxxx>" +#define DRIVER_DESC "VFIO - User Level meta-driver" + +static struct vfio { + dev_t devt; + struct cdev cdev; + struct list_head group_list; + struct mutex lock; + struct kref kref; + struct class *class; + struct idr idr; + wait_queue_head_t release_q; +} vfio; + +static const struct file_operations vfio_group_fops; + +struct vfio_group { + dev_t devt; + unsigned int groupid; + struct bus_type *bus; + struct vfio_iommu *iommu; + struct list_head device_list; + struct list_head iommu_next; + struct list_head group_next; + struct device *dev; + struct kobject *devices_kobj; + int refcnt; + bool tainted; +}; + +struct vfio_device { + struct device *dev; + const struct vfio_device_ops *ops; + struct vfio_group *group; + struct list_head device_next; + bool attached; + bool deleteme; + int refcnt; + void *device_data; +}; + +/* + * Helper functions called under vfio.lock + */ + +/* Return true if any devices within a group are opened */ +static bool __vfio_group_devs_inuse(struct vfio_group *group) +{ + struct list_head *pos; + + list_for_each(pos, &group->device_list) { + struct vfio_device *device; + + device = list_entry(pos, struct vfio_device, device_next); + if (device->refcnt) + return true; + } + return false; +} + +/* + * Return true if any of the groups attached to an iommu are opened. + * We can only tear apart merged groups when nothing is left open. + */ +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu) +{ + struct list_head *pos; + + list_for_each(pos, &iommu->group_list) { + struct vfio_group *group; + + group = list_entry(pos, struct vfio_group, iommu_next); + if (group->refcnt) + return true; + } + return false; +} + +/* + * An iommu is "in use" if it has a file descriptor open or if any of + * the groups assigned to the iommu have devices open. 
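+ * An in-use iommu cannot be torn down, __vfio_try_dissolve_iommu()
+ * returns -EBUSY for it.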
+ */ +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu) +{ + struct list_head *pos; + + if (iommu->refcnt) + return true; + + list_for_each(pos, &iommu->group_list) { + struct vfio_group *group; + + group = list_entry(pos, struct vfio_group, iommu_next); + + if (__vfio_group_devs_inuse(group)) + return true; + } + return false; +} + +static void __vfio_group_set_iommu(struct vfio_group *group, + struct vfio_iommu *iommu) +{ + if (group->iommu) + list_del(&group->iommu_next); + if (iommu) + list_add(&group->iommu_next, &iommu->group_list); + + group->iommu = iommu; +} + +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu, + struct vfio_device *device) +{ + if (WARN_ON(!iommu->domain && device->attached)) + return; + + if (!device->attached) + return; + + iommu_detach_device(iommu->domain, device->dev); + device->attached = false; +} + +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu, + struct vfio_group *group) +{ + struct list_head *pos; + + list_for_each(pos, &group->device_list) { + struct vfio_device *device; + + device = list_entry(pos, struct vfio_device, device_next); + __vfio_iommu_detach_dev(iommu, device); + } +} + +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu, + struct vfio_device *device) +{ + int ret; + + if (WARN_ON(device->attached || !iommu || !iommu->domain)) + return -EINVAL; + + ret = iommu_attach_device(iommu->domain, device->dev); + if (!ret) + device->attached = true; + + return ret; +} + +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu, + struct vfio_group *group) +{ + struct list_head *pos; + + list_for_each(pos, &group->device_list) { + struct vfio_device *device; + int ret; + + device = list_entry(pos, struct vfio_device, device_next); + ret = __vfio_iommu_attach_dev(iommu, device); + if (ret) { + __vfio_iommu_detach_group(iommu, group); + return ret; + } + } + return 0; +} + +/* + * The iommu is viable, ie. ready to be configured, when all the devices + * for all the groups attached to the iommu are bound to their vfio device + * drivers (ex. vfio-pci). This sets the device_data private data pointer. + */ +static bool __vfio_iommu_viable(struct vfio_iommu *iommu) +{ + struct list_head *gpos, *dpos; + + list_for_each(gpos, &iommu->group_list) { + struct vfio_group *group; + group = list_entry(gpos, struct vfio_group, iommu_next); + + if (group->tainted) + return false; + + list_for_each(dpos, &group->device_list) { + struct vfio_device *device; + device = list_entry(dpos, + struct vfio_device, device_next); + + if (!device->device_data) + return false; + } + } + return true; +} + +static void __vfio_iommu_close(struct vfio_iommu *iommu) +{ + struct list_head *pos; + + if (!iommu->domain) + return; + + list_for_each(pos, &iommu->group_list) { + struct vfio_group *group; + group = list_entry(pos, struct vfio_group, iommu_next); + + __vfio_iommu_detach_group(iommu, group); + } + + vfio_iommu_unmapall(iommu); + + iommu_domain_free(iommu->domain); + iommu->domain = NULL; + iommu->mm = NULL; +} + +/* + * Open the IOMMU. This gates all access to the iommu or device file + * descriptors and sets current->mm as the exclusive user. 
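+ * Called with vfio.lock held.  Fails with -EBUSY unless every device
+ * in every attached group is bound to a vfio driver, ie. unless the
+ * iommu is viable.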
+ */
+static int __vfio_iommu_open(struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+	int ret;
+
+	if (!__vfio_iommu_viable(iommu))
+		return -EBUSY;
+
+	if (iommu->domain)
+		return -EINVAL;
+
+	iommu->domain = iommu_domain_alloc(iommu->bus);
+	if (!iommu->domain)
+		return -ENOMEM;
+
+	list_for_each(pos, &iommu->group_list) {
+		struct vfio_group *group;
+		group = list_entry(pos, struct vfio_group, iommu_next);
+
+		ret = __vfio_iommu_attach_group(iommu, group);
+		if (ret) {
+			__vfio_iommu_close(iommu);
+			return ret;
+		}
+	}
+
+	iommu->cache = (iommu_domain_has_cap(iommu->domain,
+					     IOMMU_CAP_CACHE_COHERENCY) != 0);
+	iommu->mm = current->mm;
+
+	return 0;
+}
+
+/*
+ * Actively try to tear down the iommu and merged groups.  If there are no
+ * open iommu or device fds, we close the iommu.  If we close the iommu and
+ * there are also no open group fds, we can further dissolve the group to
+ * iommu association and free the iommu data structure.
+ */
+static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
+{
+	if (__vfio_iommu_inuse(iommu))
+		return -EBUSY;
+
+	__vfio_iommu_close(iommu);
+
+	if (!__vfio_iommu_groups_inuse(iommu)) {
+		struct list_head *pos, *ppos;
+
+		list_for_each_safe(pos, ppos, &iommu->group_list) {
+			struct vfio_group *group;
+
+			group = list_entry(pos, struct vfio_group, iommu_next);
+			__vfio_group_set_iommu(group, NULL);
+		}
+
+		kfree(iommu);
+	}
+
+	return 0;
+}
+
+static struct vfio_device *__vfio_lookup_dev(struct device *dev)
+{
+	struct list_head *gpos;
+	unsigned int groupid;
+
+	if (iommu_device_group(dev, &groupid))
+		return NULL;
+
+	list_for_each(gpos, &vfio.group_list) {
+		struct vfio_group *group;
+		struct list_head *dpos;
+
+		group = list_entry(gpos, struct vfio_group, group_next);
+
+		if (group->groupid != groupid || group->bus != dev->bus)
+			continue;
+
+		list_for_each(dpos, &group->device_list) {
+			struct vfio_device *device;
+
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+
+			if (device->dev == dev)
+				return device;
+		}
+	}
+	return NULL;
+}
+
+static struct vfio_group *__vfio_dev_to_group(struct device *dev,
+					      unsigned int groupid)
+{
+	struct list_head *pos;
+	struct vfio_group *group;
+
+	list_for_each(pos, &vfio.group_list) {
+		group = list_entry(pos, struct vfio_group, group_next);
+		if (group->groupid == groupid && group->bus == dev->bus)
+			return group;
+	}
+
+	return NULL;
+}
+
+static struct vfio_device *__vfio_group_find_device(struct vfio_group *group,
+						    struct device *dev)
+{
+	struct list_head *pos;
+	struct vfio_device *device;
+
+	list_for_each(pos, &group->device_list) {
+		device = list_entry(pos, struct vfio_device, device_next);
+		if (device->dev == dev)
+			return device;
+	}
+
+	return NULL;
+}
+
+static struct vfio_group *__vfio_create_group(struct device *dev,
+					      unsigned int groupid)
+{
+	struct vfio_group *group;
+	int ret, minor;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+	/*
+	 * We can't recover from this.  If we can't even get memory for
+	 * the group, we can't track the device and we don't have a place
+	 * to mark the groupid tainted.  Failures below should at least
+	 * return a tainted group.
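+	 * A tainted group is never reported as viable, which keeps the
+	 * iommu behind it from being opened.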
+ */ + BUG_ON(!group); + + group->groupid = groupid; + group->bus = dev->bus; + INIT_LIST_HEAD(&group->device_list); + + group->tainted = true; + list_add(&group->group_next, &vfio.group_list); + +again: + if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) + goto out; + + ret = idr_get_new(&vfio.idr, group, &minor); + if (ret == -EAGAIN) + goto again; + if (ret || minor > MINORMASK) { + if (minor > MINORMASK) + idr_remove(&vfio.idr, minor); + goto out; + } + + group->devt = MKDEV(MAJOR(vfio.devt), minor); + group->dev = device_create(vfio.class, NULL, group->devt, group, + "%s:%u", dev->bus->name, groupid); + if (IS_ERR(group->dev)) + goto out_device; + + /* Create a place to link individual devices in sysfs */ + group->devices_kobj = kobject_create_and_add("devices", + &group->dev->kobj); + if (!group->devices_kobj) + goto out_kobj; + + group->tainted = false; + + return group; + +out_kobj: + device_destroy(vfio.class, group->devt); +out_device: + group->dev = NULL; + group->devt = 0; + idr_remove(&vfio.idr, minor); +out: + printk(KERN_WARNING "vfio: Failed to complete setup on group %u, " + "marking as unusable\n", groupid); + + return group; +} + +static struct vfio_iommu *vfio_create_iommu(struct vfio_group *group) +{ + struct vfio_iommu *iommu; + + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&iommu->group_list); + INIT_LIST_HEAD(&iommu->dma_list); + mutex_init(&iommu->lock); + iommu->bus = group->bus; + + return iommu; +} + +/* + * All release paths simply decrement the refcnt, attempt to teardown + * the iommu and merged groups, and wakeup anything that might be + * waiting if we successfully dissolve anything. + */ +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu) +{ + bool wake; + + mutex_lock(&vfio.lock); + + (*refcnt)--; + wake = (__vfio_try_dissolve_iommu(iommu) == 0); + + mutex_unlock(&vfio.lock); + + if (wake) + wake_up(&vfio.release_q); + + return 0; +} + +/* + * Device fops - passthrough to vfio device driver w/ device_data + */ +static int vfio_device_release(struct inode *inode, struct file *filep) +{ + struct vfio_device *device = filep->private_data; + + vfio_do_release(&device->refcnt, device->group->iommu); + + device->ops->release(device->device_data); + + return 0; +} + +static long vfio_device_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_device *device = filep->private_data; + + return device->ops->ioctl(device->device_data, cmd, arg); +} + +static ssize_t vfio_device_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_device *device = filep->private_data; + + return device->ops->read(device->device_data, buf, count, ppos); +} + +static ssize_t vfio_device_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_device *device = filep->private_data; + + return device->ops->write(device->device_data, buf, count, ppos); +} + +static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma) +{ + struct vfio_device *device = filep->private_data; + + return device->ops->mmap(device->device_data, vma); +} + +#ifdef CONFIG_COMPAT +static long vfio_device_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_device_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +const struct file_operations vfio_device_fops = { + .owner = THIS_MODULE, + .release = vfio_device_release, + .read = 
vfio_device_read, + .write = vfio_device_write, + .unlocked_ioctl = vfio_device_unl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_device_compat_ioctl, +#endif + .mmap = vfio_device_mmap, +}; + +/* + * Group fops + */ +static int vfio_group_open(struct inode *inode, struct file *filep) +{ + struct vfio_group *group; + int ret = 0; + + mutex_lock(&vfio.lock); + + group = idr_find(&vfio.idr, iminor(inode)); + + if (!group) { + ret = -ENODEV; + goto out; + } + + filep->private_data = group; + + if (!group->iommu) { + struct vfio_iommu *iommu; + + iommu = vfio_create_iommu(group); + if (IS_ERR(iommu)) { + ret = PTR_ERR(iommu); + goto out; + } + __vfio_group_set_iommu(group, iommu); + } + group->refcnt++; + +out: + mutex_unlock(&vfio.lock); + + return ret; +} + +static int vfio_group_release(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + return vfio_do_release(&group->refcnt, group->iommu); +} + +/* + * Attempt to merge the group pointed to by fd into group. The merge-ee + * group must not have an iommu or any devices open because we cannot + * maintain that context across the merge. The merge-er group can be + * in use. + */ +static int vfio_group_merge(struct vfio_group *group, int fd) +{ + struct vfio_group *new; + struct vfio_iommu *old_iommu; + struct file *file; + int ret = 0; + bool opened = false; + + mutex_lock(&vfio.lock); + + file = fget(fd); + if (!file) { + ret = -EBADF; + goto out_noput; + } + + /* Sanity check, is this really our fd? */ + if (file->f_op != &vfio_group_fops) { + ret = -EINVAL; + goto out; + } + + new = file->private_data; + + if (!new || new == group || !new->iommu || + new->iommu->domain || new->bus != group->bus) { + ret = -EINVAL; + goto out; + } + + /* + * We need to attach all the devices to each domain separately + * in order to validate that the capabilities match for both. + */ + ret = __vfio_iommu_open(new->iommu); + if (ret) + goto out; + + if (!group->iommu->domain) { + ret = __vfio_iommu_open(group->iommu); + if (ret) + goto out; + opened = true; + } + + /* + * If cache coherency doesn't match we'd potentially need to + * remap existing iommu mappings in the merge-er domain. + * Poor return to bother trying to allow this currently. + */ + if (iommu_domain_has_cap(group->iommu->domain, + IOMMU_CAP_CACHE_COHERENCY) != + iommu_domain_has_cap(new->iommu->domain, + IOMMU_CAP_CACHE_COHERENCY)) { + __vfio_iommu_close(new->iommu); + if (opened) + __vfio_iommu_close(group->iommu); + ret = -EINVAL; + goto out; + } + + /* + * Close the iommu for the merge-ee and attach all its devices + * to the merge-er iommu. + */ + __vfio_iommu_close(new->iommu); + + ret = __vfio_iommu_attach_group(group->iommu, new); + if (ret) + goto out; + + /* set_iommu unlinks new from the iommu, so save a pointer to it */ + old_iommu = new->iommu; + __vfio_group_set_iommu(new, group->iommu); + kfree(old_iommu); + +out: + fput(file); +out_noput: + mutex_unlock(&vfio.lock); + return ret; +} + +/* Unmerge a group */ +static int vfio_group_unmerge(struct vfio_group *group) +{ + struct vfio_iommu *iommu; + int ret = 0; + + /* + * Since the merge-out group is already opened, it needs to + * have an iommu struct associated with it. + */ + iommu = vfio_create_iommu(group); + if (IS_ERR(iommu)) + return PTR_ERR(iommu); + + mutex_lock(&vfio.lock); + + if (list_is_singular(&group->iommu->group_list)) { + ret = -EINVAL; /* Not merged group */ + goto out; + } + + /* We can't merge-out a group with devices still in use. 
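+	 * Their open fds reference the iommu context this group would
+	 * be pulled out of.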
*/ + if (__vfio_group_devs_inuse(group)) { + ret = -EBUSY; + goto out; + } + + __vfio_iommu_detach_group(group->iommu, group); + __vfio_group_set_iommu(group, iommu); + +out: + if (ret) + kfree(iommu); + mutex_unlock(&vfio.lock); + return ret; +} + +/* + * Get a new iommu file descriptor. This will open the iommu, setting + * the current->mm ownership if it's not already set. + */ +static int vfio_group_get_iommu_fd(struct vfio_group *group) +{ + int ret = 0; + + mutex_lock(&vfio.lock); + + if (!group->iommu->domain) { + ret = __vfio_iommu_open(group->iommu); + if (ret) + goto out; + } + + ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops, + group->iommu, O_RDWR); + if (ret < 0) + goto out; + + group->iommu->refcnt++; +out: + mutex_unlock(&vfio.lock); + return ret; +} + +/* + * Get a new device file descriptor. This will open the iommu, setting + * the current->mm ownership if it's not already set. It's difficult to + * specify the requirements for matching a user supplied buffer to a + * device, so we use a vfio driver callback to test for a match. For + * PCI, dev_name(dev) is unique, but other drivers may require including + * a parent device string. + */ +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +{ + struct vfio_iommu *iommu = group->iommu; + struct list_head *gpos; + int ret = -ENODEV; + + mutex_lock(&vfio.lock); + + if (!iommu->domain) { + ret = __vfio_iommu_open(iommu); + if (ret) + goto out; + } + + list_for_each(gpos, &iommu->group_list) { + struct list_head *dpos; + + group = list_entry(gpos, struct vfio_group, iommu_next); + + list_for_each(dpos, &group->device_list) { + struct vfio_device *device; + struct file *file; + + device = list_entry(dpos, + struct vfio_device, device_next); + + if (!device->ops->match(device->dev, buf)) + continue; + + ret = device->ops->open(device->device_data); + if (ret) + goto out; + + /* + * We can't use anon_inode_getfd(), like above + * because we need to modify the f_mode flags + * directly to allow more than just ioctls + */ + ret = get_unused_fd(); + if (ret < 0) { + device->ops->release(device->device_data); + goto out; + } + + file = anon_inode_getfile("[vfio-device]", + &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + device->ops->release(device->device_data); + goto out; + } + + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. 
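+			 * Without FMODE_LSEEK/FMODE_PREAD/FMODE_PWRITE,
+			 * lseek and pread/pwrite on the new fd would fail,
+			 * and vfio drivers rely on offset-based access to
+			 * device regions.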
+			 */
+			file->f_mode |= (FMODE_LSEEK |
+					 FMODE_PREAD |
+					 FMODE_PWRITE);
+
+			fd_install(ret, file);
+
+			device->refcnt++;
+			goto out;
+		}
+	}
+out:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+static long vfio_group_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct vfio_group *group = filep->private_data;
+
+	if (cmd == VFIO_GROUP_GET_INFO) {
+		struct vfio_group_info info;
+		unsigned long minsz;
+
+		minsz = offsetofend(struct vfio_group_info, flags);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		/* Flags are informational, don't echo the user's back */
+		info.flags = 0;
+
+		mutex_lock(&vfio.lock);
+		if (__vfio_iommu_viable(group->iommu))
+			info.flags |= VFIO_GROUP_FLAGS_VIABLE;
+
+		if (group->iommu->mm)
+			info.flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
+		mutex_unlock(&vfio.lock);
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* Below commands are restricted once the mm is set */
+	if (group->iommu->mm && group->iommu->mm != current->mm)
+		return -EPERM;
+
+	if (cmd == VFIO_GROUP_MERGE) {
+		int fd;
+
+		if (get_user(fd, (int __user *)arg))
+			return -EFAULT;
+		if (fd < 0)
+			return -EINVAL;
+
+		return vfio_group_merge(group, fd);
+
+	} else if (cmd == VFIO_GROUP_UNMERGE) {
+
+		return vfio_group_unmerge(group);
+
+	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
+
+		return vfio_group_get_iommu_fd(group);
+
+	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
+		char *buf;
+		int ret;
+
+		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
+		if (IS_ERR(buf))
+			return PTR_ERR(buf);
+
+		ret = vfio_group_get_device_fd(group, buf);
+		kfree(buf);
+		return ret;
+	}
+
+	return -ENOTTY;
+}
+
+#ifdef CONFIG_COMPAT
+static long vfio_group_compat_ioctl(struct file *filep,
+				    unsigned int cmd, unsigned long arg)
+{
+	arg = (unsigned long)compat_ptr(arg);
+	return vfio_group_unl_ioctl(filep, cmd, arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+static const struct file_operations vfio_group_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vfio_group_open,
+	.release	= vfio_group_release,
+	.unlocked_ioctl	= vfio_group_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_group_compat_ioctl,
+#endif
+};
+
+/* iommu fd release hook */
+int vfio_release_iommu(struct vfio_iommu *iommu)
+{
+	return vfio_do_release(&iommu->refcnt, iommu);
+}
+
+/*
+ * VFIO driver API
+ */
+
+/*
+ * Add a new device to the vfio framework with associated vfio driver
+ * callbacks.  This is the entry point for vfio drivers to register devices.
+ */
+int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
+{
+	struct vfio_group *group;
+	struct vfio_device *device;
+	unsigned int groupid;
+	int ret = 0;
+
+	if (iommu_device_group(dev, &groupid))
+		return -ENODEV;
+
+	if (WARN_ON(!ops))
+		return -EINVAL;
+
+	mutex_lock(&vfio.lock);
+
+	group = __vfio_dev_to_group(dev, groupid);
+	if (!group)
+		group = __vfio_create_group(dev, groupid); /* No fail */
+
+	device = __vfio_group_find_device(group, dev);
+	if (!device) {
+		device = kzalloc(sizeof(*device), GFP_KERNEL);
+		if (WARN_ON(!device)) {
+			/*
+			 * We created the group, but can't add this device,
+			 * taint the group to prevent it being used.  If
+			 * it's already in use, we have to BUG_ON.
+			 * XXX - Kill the user process?
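+			 * An untracked device in a group that's already in
+			 * use would undermine the isolation the group is
+			 * meant to provide.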
+			 */
+			group->tainted = true;
+			BUG_ON(group->iommu && group->iommu->domain);
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		list_add(&device->device_next, &group->device_list);
+		device->dev = dev;
+		device->ops = ops;
+		device->group = group;
+
+		if (!group->devices_kobj ||
+		    sysfs_create_link(group->devices_kobj,
+				      &dev->kobj, dev_name(dev)))
+			printk(KERN_WARNING
+			       "vfio: Unable to create sysfs link to %s\n",
+			       dev_name(dev));
+
+		if (group->iommu && group->iommu->domain) {
+			printk(KERN_WARNING
+			       "vfio: Adding device %s to group %s:%u "
+			       "while group is already in use!!\n",
+			       dev_name(dev), group->bus->name, group->groupid);
+
+			mutex_unlock(&vfio.lock);
+
+			ret = ops->claim(dev);
+
+			BUG_ON(ret);
+
+			goto out_unlocked;
+		}
+
+	}
+out:
+	mutex_unlock(&vfio.lock);
+out_unlocked:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_group_add_dev);
+
+/* Remove a device from the vfio framework */
+void vfio_group_del_dev(struct device *dev)
+{
+	struct vfio_group *group;
+	struct vfio_device *device;
+	unsigned int groupid;
+
+	if (iommu_device_group(dev, &groupid))
+		return;
+
+	mutex_lock(&vfio.lock);
+
+	group = __vfio_dev_to_group(dev, groupid);
+
+	if (WARN_ON(!group))
+		goto out;
+
+	device = __vfio_group_find_device(group, dev);
+
+	if (WARN_ON(!device))
+		goto out;
+
+	/*
+	 * If device is bound to a bus driver, we'll get a chance to
+	 * unbind it first.  Just mark it to be removed after unbind.
+	 */
+	if (device->device_data) {
+		device->deleteme = true;
+		goto out;
+	}
+
+	if (device->attached)
+		__vfio_iommu_detach_dev(group->iommu, device);
+
+	list_del(&device->device_next);
+
+	if (group->devices_kobj)
+		sysfs_remove_link(group->devices_kobj, dev_name(dev));
+
+	kfree(device);
+
+	/*
+	 * If this was the only device in the group, remove the group.
+	 * Note that we intentionally unmerge empty groups here if the
+	 * group fd isn't opened.
+	 */
+	if (list_empty(&group->device_list) && group->refcnt == 0) {
+		struct vfio_iommu *iommu = group->iommu;
+
+		if (iommu) {
+			__vfio_group_set_iommu(group, NULL);
+			__vfio_try_dissolve_iommu(iommu);
+		}
+
+		/*
+		 * Groups can be mostly placeholders if setup isn't
+		 * completed, remove them carefully.
+		 */
+		if (group->devices_kobj)
+			kobject_put(group->devices_kobj);
+		if (group->dev) {
+			device_destroy(vfio.class, group->devt);
+			idr_remove(&vfio.idr, MINOR(group->devt));
+		}
+		list_del(&group->group_next);
+		kfree(group);
+	}
+
+out:
+	mutex_unlock(&vfio.lock);
+}
+EXPORT_SYMBOL_GPL(vfio_group_del_dev);
+
+/*
+ * When a device is bound to a vfio device driver (ex. vfio-pci), this
+ * entry point is used to mark the device usable (viable).  The vfio
+ * device driver associates a private device_data struct with the device
+ * here, which will later be returned for vfio_device_fops callbacks.
+ */
+int vfio_bind_dev(struct device *dev, void *device_data)
+{
+	struct vfio_device *device;
+	int ret;
+
+	if (WARN_ON(!device_data))
+		return -EINVAL;
+
+	mutex_lock(&vfio.lock);
+
+	device = __vfio_lookup_dev(dev);
+
+	if (WARN_ON(!device)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = dev_set_drvdata(dev, device);
+	if (!ret)
+		device->device_data = device_data;
+
+out:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_bind_dev);
+
+/* A device is only removable if the iommu for the group is not in use.
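+ * ie. there are no open iommu or device fds for the (possibly merged)
+ * set of groups sharing it.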
*/ +static bool vfio_device_removeable(struct vfio_device *device) +{ + bool ret = true; + + mutex_lock(&vfio.lock); + + if (device->group->iommu && __vfio_iommu_inuse(device->group->iommu)) + ret = false; + + mutex_unlock(&vfio.lock); + return ret; +} + +/* + * Notify vfio that a device is being unbound from the vfio device driver + * and return the device private device_data pointer. If the group is + * in use, we need to block or take other measures to make it safe for + * the device to be removed from the iommu. + */ +void *vfio_unbind_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + void *device_data; + + if (WARN_ON(!device)) + return NULL; +again: + if (!vfio_device_removeable(device)) { + /* + * XXX signal for all devices in group to be removed or + * resort to killing the process holding the device fds. + * For now just block waiting for releases to wake us. + */ + wait_event(vfio.release_q, vfio_device_removeable(device)); + } + + mutex_lock(&vfio.lock); + + /* Need to re-check that the device is still removable under lock. */ + if (device->group->iommu && __vfio_iommu_inuse(device->group->iommu)) { + mutex_unlock(&vfio.lock); + goto again; + } + + device_data = device->device_data; + + device->device_data = NULL; + dev_set_drvdata(dev, NULL); + + mutex_unlock(&vfio.lock); + + if (device->deleteme) + vfio_group_del_dev(dev); + + return device_data; +} +EXPORT_SYMBOL_GPL(vfio_unbind_dev); + +/* + * Module/class support + */ +static void vfio_class_release(struct kref *kref) +{ + class_destroy(vfio.class); + vfio.class = NULL; +} + +static char *vfio_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); +} + +static int __init vfio_init(void) +{ + int ret; + + idr_init(&vfio.idr); + mutex_init(&vfio.lock); + INIT_LIST_HEAD(&vfio.group_list); + init_waitqueue_head(&vfio.release_q); + + kref_init(&vfio.kref); + vfio.class = class_create(THIS_MODULE, "vfio"); + if (IS_ERR(vfio.class)) { + ret = PTR_ERR(vfio.class); + goto err_class; + } + + vfio.class->devnode = vfio_devnode; + + /* FIXME - how many minors to allocate... all of them! 
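+	 * Each group consumes a minor, so this effectively caps the
+	 * number of groups we can expose.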
+	 */
+	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
+	if (ret)
+		goto err_chrdev;
+
+	cdev_init(&vfio.cdev, &vfio_group_fops);
+	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
+	if (ret)
+		goto err_cdev;
+
+	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+
+	return 0;
+
+err_cdev:
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+err_chrdev:
+	kref_put(&vfio.kref, vfio_class_release);
+err_class:
+	return ret;
+}
+
+static void __exit vfio_cleanup(void)
+{
+	struct list_head *gpos, *gppos;
+
+	list_for_each_safe(gpos, gppos, &vfio.group_list) {
+		struct vfio_group *group;
+		struct list_head *dpos, *dppos;
+
+		group = list_entry(gpos, struct vfio_group, group_next);
+
+		list_for_each_safe(dpos, dppos, &group->device_list) {
+			struct vfio_device *device;
+
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+			vfio_group_del_dev(device->dev);
+		}
+	}
+
+	idr_destroy(&vfio.idr);
+	cdev_del(&vfio.cdev);
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+	kref_put(&vfio.kref, vfio_class_release);
+}
+
+module_init(vfio_init);
+module_exit(vfio_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
new file mode 100644
index 0000000..1921fb9
--- /dev/null
+++ b/drivers/vfio/vfio_private.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@xxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@xxxxxxxxx
+ */
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#ifndef VFIO_PRIVATE_H
+#define VFIO_PRIVATE_H
+
+struct vfio_iommu {
+	struct iommu_domain	*domain;
+	struct bus_type		*bus;
+	struct mutex		lock;
+	struct list_head	dma_list;
+	struct mm_struct	*mm;
+	struct list_head	group_list;
+	int			refcnt;
+	bool			cache;
+};
+
+extern const struct file_operations vfio_iommu_fops;
+
+extern int vfio_release_iommu(struct vfio_iommu *iommu);
+extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
+
+#endif /* VFIO_PRIVATE_H */