Re: [PATCH 3/3] VFIO V4: VFIO driver: Non-privileged user level PCI drivers

Tom Lyon <pugs@xxxxxxxxx> · Thu, 30 Sep 2010 14:43:11 -0700

Comments inline...

On Sunday, September 26, 2010 07:54:19 am Michael S. Tsirkin wrote:
> I did a quick pass, mostly on memory locking/DMA code.
> Some comments inside.
> 
> > +/*
> > + * This code handles mapping and unmapping of user data buffers
> > + * into DMA'ble space using the IOMMU
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/pci.h>
> > +#include <linux/mm.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/iommu.h>
> > +#include <linux/uiommu.h>
> > +#include <linux/sched.h>
> > +#include <linux/vfio.h>
> > +
> > +/* Unmap DMA region */
> > +/* dgate must be held */
> > +static void vfio_dma_unmap(struct vfio_listener *listener,
> > +			struct dma_map_page *mlp)
> > +{
> > +	int i;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +
> > +	list_del(&mlp->list);
> > +	for (i = 0; i < mlp->npage; i++)
> > +		(void) uiommu_unmap(vdev->udomain,
> > +				mlp->daddr + i*PAGE_SIZE, 0);
> 
> Pls put spaces around *, + etc.
> I think recent checkpatch versions even warn around this ...
OK, cleaned up.

> 
> > +	for (i = 0; i < mlp->npage; i++) {
> > +		if (mlp->rdwr)
> > +			SetPageDirty(mlp->pages[i]);
> > +		put_page(mlp->pages[i]);
> > +	}
> > +	vdev->mapcount--;
> > +	listener->mm->locked_vm -= mlp->npage;
> 
> Is there a race against mlock call here?
Alas, yes. I took another look at the related infiniband code, and now
have adopted their way of doing it.

> 
> > +	vdev->locked_pages -= mlp->npage;
> > +	vfree(mlp->pages);
> > +	kfree(mlp);
> > +}
> > +
> > +/* Unmap ALL DMA regions */
> > +void vfio_dma_unmapall(struct vfio_listener *listener)
> > +{
> > +	struct list_head *pos, *pos2;
> > +	struct dma_map_page *mlp;
> > +
> > +	mutex_lock(&listener->vdev->dgate);
> > +	list_for_each_safe(pos, pos2, &listener->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		vfio_dma_unmap(listener, mlp);
> > +	}
> > +	mutex_unlock(&listener->vdev->dgate);
> > +}
> > +
> > +int vfio_dma_unmap_dm(struct vfio_listener *listener, struct vfio_dma_map *dmp)
> > +{
> > +	unsigned long start, npage;
> > +	struct dma_map_page *mlp;
> > +	struct list_head *pos, *pos2;
> > +	int ret;
> > +
> > +	start = dmp->vaddr & ~PAGE_SIZE;
> 
> Can address become unaligned? Most logic seems to assume
> an aligned address ...
Just extra paranoia.

> 
> > +	npage = dmp->size >> PAGE_SHIFT;
> > +
> > +	ret = -ENXIO;
> > +	mutex_lock(&listener->vdev->dgate);
> > +	list_for_each_safe(pos, pos2, &listener->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (dmp->vaddr != mlp->vaddr || mlp->npage != npage)
> > +			continue;
> > +		ret = 0;
> > +		vfio_dma_unmap(listener, mlp);
> > +		break;
> > +	}
> > +	mutex_unlock(&listener->vdev->dgate);
> > +	return ret;
> > +}
> > +
> > +#ifdef CONFIG_MMU_NOTIFIER
> > +/* Handle MMU notifications - user process freed or realloced memory
> > + * which may be in use in a DMA region. Clean up region if so.
> > + */
> > +static void vfio_dma_handle_mmu_notify(struct mmu_notifier *mn,
> > +		unsigned long start, unsigned long end)
> > +{
> > +	struct vfio_listener *listener;
> > +	unsigned long myend;
> > +	struct list_head *pos, *pos2;
> > +	struct dma_map_page *mlp;
> > +
> > +	listener = container_of(mn, struct vfio_listener, mmu_notifier);
> > +	mutex_lock(&listener->vdev->dgate);
> > +	list_for_each_safe(pos, pos2, &listener->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (mlp->vaddr >= end)
> > +			continue;
> > +		/*
> > +		 * Ranges overlap if they're not disjoint; and they're
> > +		 * disjoint if the end of one is before the start of
> > +		 * the other one.
> > +		 */
> > +		myend = mlp->vaddr + (mlp->npage << PAGE_SHIFT) - 1;
> > +		if (!(myend <= start || end <= mlp->vaddr)) {
> 
> I suggest open the () and ivert the condition.
I can understand the code better this way.

> 
> > +			printk(KERN_WARNING
> > +				"%s: demap start %lx end %lx va %lx pa %lx\n",
> > +				__func__, start, end,
> > +				mlp->vaddr, (long)mlp->daddr);
> > +			vfio_dma_unmap(listener, mlp);
> 
> And then what would happen? How does user interpret this warning?
> How can driver/device recover?
It's just a warning that the buffer was demapped by the mmu notifier rather
than by an explicit unmap request.  This can happen if the user code
accidentally frees or reuses its buffers.

> 
> > +		}
> > +	}
> > +	mutex_unlock(&listener->vdev->dgate);
> > +}
> > +
> > +static void vfio_dma_inval_page(struct mmu_notifier *mn,
> > +		struct mm_struct *mm, unsigned long addr)
> > +{
> > +	vfio_dma_handle_mmu_notify(mn, addr, addr + PAGE_SIZE);
> > +}
> > +
> > +static void vfio_dma_inval_range_start(struct mmu_notifier *mn,
> > +		struct mm_struct *mm, unsigned long start, unsigned long end)
> > +{
> > +	vfio_dma_handle_mmu_notify(mn, start, end);
> > +}
> > +
> > +static const struct mmu_notifier_ops vfio_dma_mmu_notifier_ops = {
> > +	.invalidate_page = vfio_dma_inval_page,
> > +	.invalidate_range_start = vfio_dma_inval_range_start,
> > +};
> > +#endif	/* CONFIG_MMU_NOTIFIER */
> > +
> > +/*
> > + * Map usr buffer at specific IO virtual address
> > + */
> > +static struct dma_map_page *vfio_dma_map_iova(
> > +		struct vfio_listener *listener,
> > +		unsigned long start_iova,
> > +		struct page **pages,
> > +		int npage,
> > +		int rdwr)
> > +{
> > +	struct vfio_dev *vdev = listener->vdev;
> > +	int ret;
> > +	int i;
> > +	phys_addr_t hpa;
> > +	struct dma_map_page *mlp;
> > +	unsigned long iova = start_iova;
> > +
> > +	if (vdev->udomain == NULL)
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	for (i = 0; i < npage; i++) {
> > +		if (uiommu_iova_to_phys(vdev->udomain, iova + i*PAGE_SIZE))
> > +			return ERR_PTR(-EBUSY);
> > +	}
> > +
> > +	mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > +	if (mlp == NULL)
> > +		return ERR_PTR(-ENOMEM);
> > +	rdwr = rdwr ? IOMMU_READ|IOMMU_WRITE : IOMMU_READ;
> > +	if (vdev->cachec)
> > +		rdwr |= IOMMU_CACHE;
> > +	for (i = 0; i < npage; i++) {
> > +		hpa = page_to_phys(pages[i]);
> > +		ret = uiommu_map(vdev->udomain, iova, hpa, 0, rdwr);
> > +		if (ret) {
> > +			while (--i > 0) {
> > +				iova -= PAGE_SIZE;
> > +				(void) uiommu_unmap(vdev->udomain,
> > +						iova, 0);
> > +			}
> > +			kfree(mlp);
> > +			return ERR_PTR(ret);
> > +		}
> > +		iova += PAGE_SIZE;
> > +	}
> > +	vdev->mapcount++;
> > +
> > +	mlp->pages = pages;
> > +	mlp->daddr = start_iova;
> > +	mlp->npage = npage;
> > +	return mlp;
> > +}
> > +
> > +int vfio_dma_map_common(struct vfio_listener *listener,
> > +		unsigned int cmd, struct vfio_dma_map *dmp)
> > +{
> > +	int locked, lock_limit;
> > +	struct page **pages;
> > +	int npage;
> > +	struct dma_map_page *mlp;
> > +	int rdwr = (dmp->flags & VFIO_FLAG_WRITE) ? 1 : 0;
> > +	int ret = 0;
> > +
> > +	if (dmp->vaddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (dmp->size & (PAGE_SIZE-1))
> > +		return -EINVAL;
> 
> size must be full pages? Maybe document this?
It's in the header file and the Documentation file.

> 
> > +	if (dmp->size <= 0)
> 
> It's u64. Can it be < 0?
More paranoia.

> 
> > +		return -EINVAL;
> > +	npage = dmp->size >> PAGE_SHIFT;
Added a check for max size - 4G for now.
> 
> This assignment can overflow the integer.
> 
> > +	if (npage <= 0)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&listener->vdev->dgate);
> > +
> > +	/* account for locked pages */
> > +	locked = npage + current->mm->locked_vm;
> 
> Again this can race against mlock I think.
Yes.

> 
> > +	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur
> > +			>> PAGE_SHIFT;
> > +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> 
> rlimit/capability access might also be racy: don't we need
> task lock for that?
No one else seems to take a task lock for this sort of thing. Can you point me
at code that takes the task lock for this?

> 
> > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK exceeded\n",
> > +			__func__);
> > +		ret = -ENOMEM;
> > +		goto out_lock;
> > +	}
> > +	/* only 1 address space per fd */
> > +	if (current->mm != listener->mm) {
> > +		if (listener->mm != NULL) {
> > +			ret = -EINVAL;
> > +			goto out_lock;
> > +		}
> > +		listener->mm = current->mm;
> > +#ifdef CONFIG_MMU_NOTIFIER
> > +		listener->mmu_notifier.ops = &vfio_dma_mmu_notifier_ops;
> > +		ret = mmu_notifier_register(&listener->mmu_notifier,
> > +						listener->mm);
> > +		if (ret)
> > +			printk(KERN_ERR "%s: mmu_notifier_register failed %d\n",
> > +				__func__, ret);
> > +		ret = 0;
> 
> What exactly are you doing with the notifiers?
> This driver seems to lock all DMA memory, how can
> it get moved?
> And why is an error ignored?
The physical pages get locked, but the mmu notifier detects when the virtual 
pages get re-used without an intervening de-map.

> 
> > +#endif
> > +	}
> > +
> > +	pages = vmalloc(npage * sizeof(struct page *));
> 
> npage comes from userspace? What if it's a huge value?
> Also, on a 32 bit system, we will run out of vmalloc space
> quickly if we let userspace tie it up indefinitely ...
> This is slow path - maybe just lock pages one by one?
Still have to lock and remember all the locked pages.  Max lock size of 4G 
will help this.

> 
> > +	if (pages == NULL) {
> > +		ret = ENOMEM;
> > +		goto out_lock;
> > +	}
> > +	ret = get_user_pages_fast(dmp->vaddr, npage, rdwr, pages);
> > +	if (ret != npage) {
> > +		printk(KERN_ERR "%s: get_user_pages_fast returns %d, not %d\n",
> > +			__func__, ret, npage);
> > +		kfree(pages);
> > +		ret = -EFAULT;
> > +		goto out_lock;
> > +	}
> > +	ret = 0;
> > +
> > +	mlp = vfio_dma_map_iova(listener, dmp->dmaaddr,
> > +				pages, npage, rdwr);
> > +	if (IS_ERR(mlp)) {
> > +		ret = PTR_ERR(mlp);
> > +		vfree(pages);
> > +		goto out_lock;
> > +	}
> > +	mlp->vaddr = dmp->vaddr;
> > +	mlp->rdwr = rdwr;
> > +	dmp->dmaaddr = mlp->daddr;
> > +	list_add(&mlp->list, &listener->dm_list);
> > +
> > +	current->mm->locked_vm += npage;
> > +	listener->vdev->locked_pages += npage;
> 
> This looks too aggressive.
> So if you want to use 2 devices, you will
> have to double the mlock rlimit for the process?
If you know 2 devices are in the same domain, you don't have to repeat the 
call. If you don't know, then you might double lock pages.
> 
> I think this ioctl would be better done
> on the iommu device than on vfio: all it does
> is pass calls to iommu anyway.
> The you can share locking between devices.
Yes, but you have to carry around another fd

> 
> > +out_lock:
> > +	mutex_unlock(&listener->vdev->dgate);
> > +	return ret;
> > +}
> > +
> > +int vfio_domain_unset(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +
> > +	if (vdev->udomain == NULL)
> 
> !vdev->udomain
Got rid of all NULL comparisons.

> 
> > +		return 0;
> > +	if (vdev->mapcount)
> > +		return -EBUSY;
> > +	uiommu_detach_device(vdev->udomain, &pdev->dev);
> > +	uiommu_put(vdev->udomain);
> > +	vdev->udomain = NULL;
> > +	return 0;
> > +}
> > +
> > +int vfio_domain_set(struct vfio_dev *vdev, int fd, int unsafe_ok)
> > +{
> > +	struct uiommu_domain *udomain;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int ret;
> > +	int safe;
> > +
> > +	if (vdev->udomain)
> > +		return -EBUSY;
> > +	udomain = uiommu_fdget(fd);
> > +	if (IS_ERR(udomain))
> > +		return PTR_ERR(udomain);
> > +
> > +	safe = 0;
> > +#ifdef IOMMU_CAP_INTR_REMAP	/* >= 2.6.36 */
> > +	/* iommu domain must also isolate dev interrupts */
> > +	if (uiommu_domain_has_cap(udomain, IOMMU_CAP_INTR_REMAP))
> > +		safe = 1;
> > +#endif
> > +	if (!safe && !unsafe_ok) {
> > +		printk(KERN_WARNING "%s: no interrupt remapping!\n", __func__);
> > +		return -EINVAL;
> > +	}
> > +
> > +	vfio_domain_unset(vdev);
> > +	ret = uiommu_attach_device(udomain, &pdev->dev);
> > +	if (ret) {
> > +		printk(KERN_ERR "%s: attach_device failed %d\n",
> > +				__func__, ret);
> > +		uiommu_put(udomain);
> > +		return ret;
> > +	}
> > +	vdev->cachec = iommu_domain_has_cap(udomain->domain,
> > +				IOMMU_CAP_CACHE_COHERENCY);
> > +	vdev->udomain = udomain;
> > +	return 0;
> > +}
> > diff --git a/drivers/vfio/vfio_intrs.c b/drivers/vfio/vfio_intrs.c
> > new file mode 100644
> > index 0000000..4ced09c
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_intrs.c
> > @@ -0,0 +1,257 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * This code handles catching interrupts and translating
> > + * them to events on eventfds
> > + */
> > +
> > +#include <linux/device.h>
> > +#include <linux/interrupt.h>
> > +#include <linux/eventfd.h>
> > +#include <linux/pci.h>
> > +#include <linux/mmu_notifier.h>
> > +
> > +#include <linux/vfio.h>
> > +
> > +
> > +/*
> > + * vfio_interrupt - IRQ hardware interrupt handler
> > + */
> > +irqreturn_t vfio_interrupt(int irq, void *dev_id)
> > +{
> > +	struct vfio_dev *vdev = (struct vfio_dev *)dev_id;
> 
> don't cast void pointers
OK. 
> 
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	irqreturn_t ret = IRQ_NONE;
> > +	u32 cmd_status_dword;
> > +	u16 origcmd, newcmd, status;
> > +
> > +	spin_lock_irq(&vdev->irqlock);
> > +	pci_block_user_cfg_access(pdev);
> > +
> > +	/* Read both command and status registers in a single 32-bit operation.
> > +	 * Note: we could cache the value for command and move the status read
> > +	 * out of the lock if there was a way to get notified of user changes
> > +	 * to command register through sysfs. Should be good for shared irqs. */
> > +	pci_read_config_dword(pdev, PCI_COMMAND, &cmd_status_dword);
> > +	origcmd = cmd_status_dword;
> > +	status = cmd_status_dword >> 16;
> > +
> > +	/* Check interrupt status register to see whether our device
> > +	 * triggered the interrupt. */
> > +	if (!(status & PCI_STATUS_INTERRUPT))
> > +		goto done;
> > +
> > +	/* We triggered the interrupt, disable it. */
> > +	newcmd = origcmd | PCI_COMMAND_INTX_DISABLE;
> > +	if (newcmd != origcmd)
> > +		pci_write_config_word(pdev, PCI_COMMAND, newcmd);
> > +
> > +	ret = IRQ_HANDLED;
> > +done:
> > +	pci_unblock_user_cfg_access(pdev);
> > +	spin_unlock_irq(&vdev->irqlock);
> > +	if (ret != IRQ_HANDLED)
> > +		return ret;
> > +	if (vdev->ev_irq)
> > +		eventfd_signal(vdev->ev_irq, 1);
> > +	return ret;
> > +}
> > +
> > +/*
> > + * MSI and MSI-X Interrupt handler.
> > + * Just signal an event
> > + */
> > +static irqreturn_t msihandler(int irq, void *arg)
> > +{
> > +	struct eventfd_ctx *ctx = arg;
> > +
> > +	eventfd_signal(ctx, 1);
> > +	return IRQ_HANDLED;
> > +}
> > +
> > +void vfio_drop_msi(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int i;
> > +
> > +	if (vdev->ev_msi) {
> > +		for (i = 0; i < vdev->msi_nvec; i++) {
> > +			free_irq(pdev->irq + i, vdev->ev_msi[i]);
> > +			if (vdev->ev_msi[i])
> > +				eventfd_ctx_put(vdev->ev_msi[i]);
> > +		}
> > +	}
> > +	kfree(vdev->ev_msi);
> > +	vdev->ev_msi = NULL;
> > +	vdev->msi_nvec = 0;
> > +	pci_disable_msi(pdev);
> > +}
> > +
> > +int vfio_setup_msi(struct vfio_dev *vdev, int nvec, void __user *uarg)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	struct eventfd_ctx *ctx;
> > +	int i, n, l2;
> > +	int ret = 0;
> > +	int fd;
> > +
> > +	if (nvec < 1 || nvec > 32)
> > +		return -EINVAL;
> > +	vdev->ev_msi = kzalloc(nvec * sizeof(struct eventfd_ctx *),
> > +				GFP_KERNEL);
> > +	if (vdev->ev_msi == NULL)
> > +		return -ENOMEM;
> > +
> > +	for (i = 0; i < nvec; i++) {
> > +		if (copy_from_user(&fd, uarg, sizeof fd)) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		uarg += sizeof fd;
> > +		ctx = eventfd_ctx_fdget(fd);
> > +		if (IS_ERR(ctx)) {
> > +			ret = PTR_ERR(ctx);
> > +			break;
> 
> so goto out here?
Why?

> 
> > +		}
> > +		vdev->ev_msi[i] = ctx;
> > +	}
> > +	if (ret)
> > +		goto out;
> > +	ret = pci_enable_msi_block(pdev, nvec);
> > +	if (ret) {
> > +		if (ret > 0)
> > +			ret = -EINVAL;
> > +		goto out;
> > +	}
> > +	for (i = 0; i < nvec; i++) {
> > +		ret = request_irq(pdev->irq + i, msihandler, 0,
> > +			vdev->name, vdev->ev_msi[i]);
> > +		if (ret)
> > +			break;
> > +		vdev->msi_nvec = i+1;
> > +	}
> > +
> > +	/*
> > +	 * compute the virtual hardware field for max msi vectors -
> > +	 * it is the log base 2 of the number of vectors
> > +	 */
> > +	l2 = 0;
> > +	n = vdev->msi_nvec;
> > +	if (n >= (1 << 4)) {
> > +		n >>= 4;
> > +		l2 += 4;
> > +	}
> > +	if (n >= (1 << 2)) {
> > +		n >>= 2;
> > +		l2 += 2;
> > +	}
> > +	if (n >= (1 << 1))
> > +		l2 += 1;
> 
> what is this doing? Will using fls() help?
It is computing log2(n) for n <= 32. I added a comment.

> 
> > +	vdev->msi_qmax = l2;
> > +out:
> > +	if (ret)
> > +		vfio_drop_msi(vdev);
> > +	return ret;
> > +}
> > +
> > +void vfio_drop_msix(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int i;
> > +
> > +	if (vdev->ev_msix && vdev->msix) {
> > +		for (i = 0; i < vdev->msix_nvec; i++) {
> > +			free_irq(vdev->msix[i].vector, vdev->ev_msix[i]);
> > +			if (vdev->ev_msix[i])
> > +				eventfd_ctx_put(vdev->ev_msix[i]);
> > +		}
> > +	}
> 
> No need for external {}
OK.

> 
> > +	kfree(vdev->ev_msix);
> > +	vdev->ev_msix = NULL;
> > +	kfree(vdev->msix);
> > +	vdev->msix = NULL;
> > +	vdev->msix_nvec = 0;
> > +	pci_disable_msix(pdev);
> > +}
> > +
> > +int vfio_setup_msix(struct vfio_dev *vdev, int nvec, void __user *uarg)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	struct eventfd_ctx *ctx;
> > +	int ret = 0;
> > +	int i;
> > +	int fd;
> > +	int pos;
> > +	u16 flags = 0;
> > +
> > +	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
> > +	if (!pos)
> > +		return -EINVAL;
> > +	pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &flags);
> > +	if (nvec < 1 || nvec > (flags & PCI_MSIX_FLAGS_QSIZE) + 1)
> > +		return -EINVAL;
> > +
> > +	vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
> > +				GFP_KERNEL);
> > +	if (vdev->msix == NULL)
> > +		return -ENOMEM;
> > +	vdev->ev_msix = kzalloc(nvec * sizeof(struct eventfd_ctx *),
> > +				GFP_KERNEL);
> > +	if (vdev->ev_msix == NULL) {
> > +		kfree(vdev->msix);
> > +		return -ENOMEM;
> > +	}
> > +	for (i = 0; i < nvec; i++) {
> > +		if (copy_from_user(&fd, uarg, sizeof fd)) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		uarg += sizeof fd;
> > +		ctx = eventfd_ctx_fdget(fd);
> > +		if (IS_ERR(ctx)) {
> > +			ret = PTR_ERR(ctx);
> > +			break;
> > +		}
> > +		vdev->msix[i].entry = i;
> > +		vdev->ev_msix[i] = ctx;
> > +	}
> > +	if (!ret)
> > +		ret = pci_enable_msix(pdev, vdev->msix, nvec);
> > +	vdev->msix_nvec = 0;
> > +	for (i = 0; i < nvec && !ret; i++) {
> > +		ret = request_irq(vdev->msix[i].vector, msihandler, 0,
> > +			vdev->name, vdev->ev_msix[i]);
> > +		if (ret)
> > +			break;
> > +		vdev->msix_nvec = i+1;
> > +	}
> > +	if (ret)
> > +		vfio_drop_msix(vdev);
> > +	return ret;
> > +}
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > new file mode 100644
> > index 0000000..a18e39a
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -0,0 +1,768 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * VFIO main module: driver to allow non-privileged user programs
> > + * to imlpement direct mapped device drivers for PCI* devices
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/mm.h>
> > +#include <linux/idr.h>
> > +#include <linux/string.h>
> > +#include <linux/interrupt.h>
> > +#include <linux/fs.h>
> > +#include <linux/eventfd.h>
> > +#include <linux/pci.h>
> > +#include <linux/iommu.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/suspend.h>
> > +
> > +#include <linux/vfio.h>
> > +
> > +
> > +#define DRIVER_VERSION	"0.1"
> > +#define DRIVER_AUTHOR	"Tom Lyon <pugs@xxxxxxxxx>"
> > +#define DRIVER_DESC	"VFIO - User Level PCI meta-driver"
> > +
> > +/*
> > + * Only a very few platforms today (Intel X7500) fully support
> > + * both DMA remapping and interrupt remapping in the IOMMU.
> > + * Everyone has DMA remapping but interrupt remapping is missing
> > + * in some Intel hardware and software, and its missing in the AMD
> > + * IOMMU software. Interrupt remapping is needed to really protect the
> > + * system from user level driver mischief.  Until it is in more platforms
> > + * we allow the admin to load the module with allow_unsafe_intrs=1
> > + * which will make this driver useful (but not safe)
> > + * on those platforms.
> > + */
> > +static int allow_unsafe_intrs;
> > +module_param(allow_unsafe_intrs, int, 0);
> > +
> > +static int vfio_major = -1;
> > +static DEFINE_IDR(vfio_idr);
> > +static int vfio_max_minor;
> > +/* Protect idr accesses */
> > +static DEFINE_MUTEX(vfio_minor_lock);
> > +
> > +/*
> > + * Does [a1,b1) overlap [a2,b2) ?
> > + */
> > +static inline int overlap(int a1, int b1, int a2, int b2)
> > +{
> > +	/*
> > +	 * Ranges overlap if they're not disjoint; and they're
> > +	 * disjoint if the end of one is before the start of
> > +	 * the other one.
> > +	 */
> > +	return !(b2 <= a1 || b1 <= a2);
> > +}
> > +
> > +static int vfio_open(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_dev *vdev;
> > +	struct vfio_listener *listener;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio_minor_lock);
> > +	vdev = idr_find(&vfio_idr, iminor(inode));
> > +	mutex_unlock(&vfio_minor_lock);
> > +	if (!vdev) {
> > +		ret = -ENODEV;
> > +		goto out;
> > +	}
> > +
> > +	listener = kzalloc(sizeof(*listener), GFP_KERNEL);
> > +	if (!listener) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	mutex_lock(&vdev->lgate);
> > +	listener->vdev = vdev;
> > +	INIT_LIST_HEAD(&listener->dm_list);
> > +	filep->private_data = listener;
> > +	if (vdev->listeners == 0)
> > +		ret = pci_enable_device(vdev->pdev);
> 
> Why would you want to enable device on open?
> Doing this later when domain is set would add an extra level of
> protection as device would reject reads/writes when not enabled.
Unfortunately, pci_enable_device does some black magic with pci_bios_enable 
which is platform dependent and which I don't really understand.  I'm pretty 
sure this has to be there before an assignment to an iommu.
> 
> 
> Also, don't you want to do pci_set_master at some point?
No, the user code can do it, and the rest of the kernel doesn't care once it's
under the iommu.
> 
> > +	if (ret == 0)
> 
> !ret or better if (ret)
> 		 goto err;
OK.

> 
> > +		vdev->listeners++;
> > +	mutex_unlock(&vdev->lgate);
> > +	if (ret)
> > +		kfree(listener);
> 
> this error handling is
> 
> > +out:
> > +	return ret;
> > +}
> > +
> > +static int vfio_release(struct inode *inode, struct file *filep)
> > +{
> > +	int ret = 0;
> > +	struct vfio_listener *listener = filep->private_data;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +
> > +	vfio_dma_unmapall(listener);
> > +	if (listener->mm) {
> > +#ifdef CONFIG_MMU_NOTIFIER
> > +		mmu_notifier_unregister(&listener->mmu_notifier, listener->mm);
> > +#endif
> > +		listener->mm = NULL;
> > +	}
> > +
> > +	mutex_lock(&vdev->lgate);
> > +	if (--vdev->listeners <= 0) {
> > +		/* we don't need to hold igate here since there are
> > +		 * no more listeners doing ioctls
> > +		 */
> > +		if (vdev->ev_msix)
> > +			vfio_drop_msix(vdev);
> > +		if (vdev->ev_msi)
> > +			vfio_drop_msi(vdev);
> > +		if (vdev->ev_irq) {
> > +			eventfd_ctx_put(vdev->ev_irq);
> > +			vdev->ev_irq = NULL;
> > +		}
> > +		kfree(vdev->vconfig);
> > +		vdev->vconfig = NULL;
> > +		kfree(vdev->pci_config_map);
> > +		vdev->pci_config_map = NULL;
> > +		pci_disable_device(vdev->pdev);
> > +		vfio_domain_unset(vdev);
> 
> This does not seem to remove bus master before close.
> If the userspace driver dies, and device is doing DMA
> into userspace, what will prevent DMA after
> you unset the domain?
Actually, pci_disable_device does little else than disable bus master.

> 
> > +		wake_up(&vdev->dev_idle_q);
> > +	}
> > +	mutex_unlock(&vdev->lgate);
> > +
> > +	kfree(listener);
> > +	return ret;
> > +}
> > +
> > +static ssize_t vfio_read(struct file *filep, char __user *buf,
> > +			size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_listener *listener = filep->private_data;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int pci_space;
> > +
> > +	pci_space = vfio_offset_to_pci_space(*ppos);
> > +
> > +	/* config reads are OK before iommu domain set */
> > +	if (pci_space == VFIO_PCI_CONFIG_RESOURCE)
> > +		return vfio_config_readwrite(0, vdev, buf, count, ppos);
> > +
> > +	/* no other reads until IOMMU domain set */
> > +	if (vdev->udomain == NULL)
> > +		return -EINVAL;
> > +	if (pci_space > PCI_ROM_RESOURCE)
> > +		return -EINVAL;
> > +	if (pci_resource_flags(pdev, pci_space) & IORESOURCE_IO)
> > +		return vfio_io_readwrite(0, vdev, buf, count, ppos);
> > +	if (pci_resource_flags(pdev, pci_space) & IORESOURCE_MEM)
> > +		return vfio_mem_readwrite(0, vdev, buf, count, ppos);
> > +	if (pci_space == PCI_ROM_RESOURCE)
> > +		return vfio_mem_readwrite(0, vdev, buf, count, ppos);
> > +	return -EINVAL;
> > +}
> > +
> > +static int vfio_msix_check(struct vfio_dev *vdev, u64 start, u32 len)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	u16 pos;
> > +	u32 table_offset;
> > +	u16 table_size;
> > +	u8 bir;
> > +	u32 lo, hi, startp, endp;
> > +
> > +	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
> > +	if (!pos)
> > +		return 0;
> > +
> > +	pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &table_size);
> > +	table_size = (table_size & PCI_MSIX_FLAGS_QSIZE) + 1;
> > +	pci_read_config_dword(pdev, pos + 4, &table_offset);
> > +	bir = table_offset & PCI_MSIX_FLAGS_BIRMASK;
> > +	lo = table_offset >> PAGE_SHIFT;
> > +	hi = (table_offset + PCI_MSIX_ENTRY_SIZE * table_size + PAGE_SIZE - 1)
> > +		>> PAGE_SHIFT;
> > +	startp = start >> PAGE_SHIFT;
> > +	endp = (start + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
> > +	if (bir == vfio_offset_to_pci_space(start) &&
> > +	    overlap(lo, hi, startp, endp)) {
> > +		printk(KERN_WARNING "%s: cannot write msi-x vectors\n",
> > +			__func__);
> > +		return -EINVAL;
> > +	}
> > +	return 0;
> > +}
> > +
> > +static ssize_t vfio_write(struct file *filep, const char __user *buf,
> > +			size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_listener *listener = filep->private_data;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int pci_space;
> > +	int ret;
> > +
> > +	/* no writes until IOMMU domain set */
> > +	if (vdev->udomain == NULL)
> > +		return -EINVAL;
> > +	pci_space = vfio_offset_to_pci_space(*ppos);
> > +	if (pci_space == VFIO_PCI_CONFIG_RESOURCE)
> > +		return vfio_config_readwrite(1, vdev,
> > +					(char __user *)buf, count, ppos);
> > +	if (pci_space > PCI_ROM_RESOURCE)
> > +		return -EINVAL;
> > +	if (pci_resource_flags(pdev, pci_space) & IORESOURCE_IO)
> > +		return vfio_io_readwrite(1, vdev,
> > +					(char __user *)buf, count, ppos);
> > +	if (pci_resource_flags(pdev, pci_space) & IORESOURCE_MEM) {
> > +		if (allow_unsafe_intrs) {
> > +			/* don't allow writes to msi-x vectors */
> > +			ret = vfio_msix_check(vdev, *ppos, count);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +		return vfio_mem_readwrite(1, vdev,
> > +				(char __user *)buf, count, ppos);
> > +	}
> > +	return -EINVAL;
> > +}
> > +
> > +static int vfio_mmap(struct file *filep, struct vm_area_struct *vma)
> > +{
> > +	struct vfio_listener *listener = filep->private_data;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	unsigned long requested, actual;
> > +	int pci_space;
> > +	u64 start;
> > +	u32 len;
> > +	unsigned long phys;
> > +	int ret;
> > +
> > +	/* no reads or writes until IOMMU domain set */
> > +	if (vdev->udomain == NULL)
> > +		return -EINVAL;
> 
> What happens if user creates a mapping when domain is
> set, and then removes it with DOMAIN_UNSET ioctl?
> Can't userdpace access an unprotected device now?
>  we should just drop DOMAIN_UNSET, and document
> that iommu can not be changed once set.
Unset returns EBUSY if mappings are still in place.
But I don't expect anyone to bother with unsets.

> 
> > +
> > +	if (vma->vm_end < vma->vm_start)
> > +		return -EINVAL;
> > +	if ((vma->vm_flags & VM_SHARED) == 0)
> > +		return -EINVAL;
> > +
> > +
> > +	pci_space = vfio_offset_to_pci_space((u64)vma->vm_pgoff << PAGE_SHIFT);
> > +	if (pci_space > PCI_ROM_RESOURCE)
> > +		return -EINVAL;
> > +	switch (pci_space) {
> > +	case PCI_ROM_RESOURCE:
> > +		if (vma->vm_flags & VM_WRITE)
> > +			return -EINVAL;
> > +		if (pci_resource_flags(pdev, PCI_ROM_RESOURCE) == 0)
> > +			return -EINVAL;
> > +		actual = pci_resource_len(pdev, PCI_ROM_RESOURCE) >> PAGE_SHIFT;
> > +		break;
> > +	default:
> > +		if ((pci_resource_flags(pdev, pci_space) & IORESOURCE_MEM) == 0)
> > +			return -EINVAL;
> > +		actual = pci_resource_len(pdev, pci_space) >> PAGE_SHIFT;
> > +		break;
> > +	}
> > +
> > +	requested = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> > +	if (requested > actual || actual == 0)
> > +		return -EINVAL;
> > +
> > +	start = vma->vm_pgoff << PAGE_SHIFT;
> > +	len = vma->vm_end - vma->vm_start;
> > +	if (allow_unsafe_intrs && (vma->vm_flags & VM_WRITE)) {
> > +		/*
> > +		 * Deter users from screwing up MSI-X intrs
> > +		 */
> > +		ret = vfio_msix_check(vdev, start, len);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	vma->vm_private_data = vdev;
> > +	vma->vm_flags |= VM_IO | VM_RESERVED;
> > +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > +	phys = pci_resource_start(pdev, pci_space) >> PAGE_SHIFT;
> > +
> > +	return remap_pfn_range(vma, vma->vm_start, phys,
> > +			       vma->vm_end - vma->vm_start,
> > +			       vma->vm_page_prot);
> > +}
> > +
> > +static long vfio_unl_ioctl(struct file *filep,
> > +			unsigned int cmd,
> > +			unsigned long arg)
> > +{
> > +	struct vfio_listener *listener = filep->private_data;
> > +	struct vfio_dev *vdev = listener->vdev;
> > +	void __user *uarg = (void __user *)arg;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	struct vfio_dma_map *dm;
> > +	int ret = 0;
> > +	int fd, nfd;
> > +	int bar;
> > +
> > +	if (vdev == NULL)
> > +		return -EINVAL;
> > +
> > +	switch (cmd) {
> > +
> > +	case VFIO_DMA_MAP_IOVA:
> > +		dm = kmalloc(sizeof *dm, GFP_KERNEL);
> 
> Why bother allocating on heap? It's a small structure ...
Vestigial nonsense; removed.

> 
> > +		if (dm == NULL)
> > +			return -ENOMEM;
> > +		if (copy_from_user(dm, uarg, sizeof *dm)) {
> > +			kfree(dm);
> > +			return -EFAULT;
> > +		}
> > +		ret = vfio_dma_map_common(listener, cmd, dm);
> > +		if (!ret && copy_to_user(uarg, dm, sizeof *dm))
> > +			ret = -EFAULT;
> > +		kfree(dm);
> > +		break;
> > +
> > +	case VFIO_DMA_UNMAP:
> > +		dm = kmalloc(sizeof *dm, GFP_KERNEL);
> 
> same here
> 
> > +		if (dm == NULL)
> > +			return -ENOMEM;
> > +		if (copy_from_user(dm, uarg, sizeof *dm)) {
> > +			kfree(dm);
> > +			return -EFAULT;
> > +		}
> > +		ret = vfio_dma_unmap_dm(listener, dm);
> > +		kfree(dm);
> > +		break;
> > +
> > +	case VFIO_EVENTFD_IRQ:
> > +		if (copy_from_user(&fd, uarg, sizeof fd))
> > +			return -EFAULT;
> > +		mutex_lock(&vdev->igate);
> > +		if (vdev->ev_irq)
> > +			eventfd_ctx_put(vdev->ev_irq);
> > +		if (fd >= 0) {
> > +			vdev->ev_irq = eventfd_ctx_fdget(fd);
> > +			if (vdev->ev_irq == NULL)
> > +				ret = -EINVAL;
> > +		}
> > +		mutex_unlock(&vdev->igate);
> > +		break;
> > +
> > +	case VFIO_EVENTFDS_MSI:
> > +		if (copy_from_user(&nfd, uarg, sizeof nfd))
> > +			return -EFAULT;
> > +		uarg += sizeof nfd;
> > +		mutex_lock(&vdev->igate);
> > +		if (nfd > 0 && vdev->ev_msi == NULL)
> 
> == NULL -> ! here and elsewhere
> 
> > +			ret = vfio_setup_msi(vdev, nfd, uarg);
> > +		else if (nfd == 0 && vdev->ev_msi)
> > +			vfio_drop_msi(vdev);
> > +		else
> > +			ret = -EINVAL;
> > +		mutex_unlock(&vdev->igate);
> > +		break;
> > +
> > +	case VFIO_EVENTFDS_MSIX:
> > +		if (copy_from_user(&nfd, uarg, sizeof nfd))
> > +			return -EFAULT;
> > +		uarg += sizeof nfd;
> 
> Maybe cast to int __user *.
> Then use simple + 1 for access instead of sizeof,
> and get_user instead of copy_from_user.
Done.

> 
> > +		mutex_lock(&vdev->igate);
> > +		if (nfd > 0 && vdev->ev_msix == NULL)
> > +			ret = vfio_setup_msix(vdev, nfd, uarg);
> > +		else if (nfd == 0 && vdev->ev_msix)
> > +			vfio_drop_msix(vdev);
> > +		else
> > +			ret = -EINVAL;
> > +		mutex_unlock(&vdev->igate);
> > +		break;
> > +
> > +	case VFIO_BAR_LEN:
> > +		if (copy_from_user(&bar, uarg, sizeof bar))
> > +			return -EFAULT;
> > +		if (bar < 0 || bar > PCI_ROM_RESOURCE)
> > +			return -EINVAL;
> > +		if (pci_resource_start(pdev, bar))
> > +			bar = pci_resource_len(pdev, bar);
> > +		else
> > +			bar = 0;
> > +		if (copy_to_user(uarg, &bar, sizeof bar))
> > +			return -EFAULT;
> > +		break;
> > +
> > +	case VFIO_DOMAIN_SET:
> > +		if (copy_from_user(&fd, uarg, sizeof fd))
> > +			return -EFAULT;
> > +		ret = vfio_domain_set(vdev, fd, allow_unsafe_intrs);
> > +		break;
> > +
> > +	case VFIO_DOMAIN_UNSET:
> > +		ret = vfio_domain_unset(vdev);
> > +		break;
> > +
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +	return ret;
> > +}
> > +
> > +static const struct file_operations vfio_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.open		= vfio_open,
> > +	.release	= vfio_release,
> > +	.read		= vfio_read,
> > +	.write		= vfio_write,
> > +	.unlocked_ioctl	= vfio_unl_ioctl,
> > +	.mmap		= vfio_mmap,
> > +};
> > +
> > +static int vfio_get_devnum(struct vfio_dev *vdev)
> > +{
> > +	int retval = -ENOMEM;
> > +	int id;
> > +
> > +	mutex_lock(&vfio_minor_lock);
> > +	if (idr_pre_get(&vfio_idr, GFP_KERNEL) == 0)
> > +		goto exit;
> > +
> > +	retval = idr_get_new(&vfio_idr, vdev, &id);
> > +	if (retval < 0) {
> > +		if (retval == -EAGAIN)
> > +			retval = -ENOMEM;
> > +		goto exit;
> > +	}
> > +	if (id > MINORMASK) {
> > +		idr_remove(&vfio_idr, id);
> > +		retval = -ENOMEM;
> > +	}
> > +	if (id > vfio_max_minor)
> > +		vfio_max_minor = id;
> > +	if (vfio_major < 0) {
> > +		retval = register_chrdev(0, "vfio", &vfio_fops);
> > +		if (retval < 0)
> > +			goto exit;
> > +		vfio_major = retval;
> > +	}
> > +
> > +	retval = MKDEV(vfio_major, id);
> > +exit:
> > +	mutex_unlock(&vfio_minor_lock);
> > +	return retval;
> > +}
> > +
> > +int vfio_validate(struct vfio_dev *vdev)
> > +{
> > +	int rc = 0;
> > +	int id;
> > +
> > +	mutex_lock(&vfio_minor_lock);
> > +	for (id = 0; id <= vfio_max_minor; id++)
> > +		if (vdev == idr_find(&vfio_idr, id))
> > +			goto out;
> > +	rc = 1;
> > +out:
> > +	mutex_unlock(&vfio_minor_lock);
> > +	return rc;
> > +}
> > +
> > +static void vfio_free_minor(struct vfio_dev *vdev)
> > +{
> > +	mutex_lock(&vfio_minor_lock);
> > +	idr_remove(&vfio_idr, MINOR(vdev->devnum));
> > +	mutex_unlock(&vfio_minor_lock);
> > +}
> > +
> > +/*
> > + * Verify that the device supports Interrupt Disable bit in command register,
> > + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
> > + * in PCI 2.2.  (from uio_pci_generic)
> > + */
> > +static int verify_pci_2_3(struct pci_dev *pdev)
> > +{
> > +	u16 orig, new;
> > +	u8 pin;
> > +
> > +	pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > +	if (pin == 0)		/* irqs not needed */
> > +		return 0;
> > +
> > +	pci_read_config_word(pdev, PCI_COMMAND, &orig);
> > +	pci_write_config_word(pdev, PCI_COMMAND,
> > +			      orig ^ PCI_COMMAND_INTX_DISABLE);
> > +	pci_read_config_word(pdev, PCI_COMMAND, &new);
> > +	/* There's no way to protect against
> > +	 * hardware bugs or detect them reliably, but as long as we know
> > +	 * what the value should be, let's go ahead and check it. */
> > +	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
> > +		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
> > +			"driver or HW bug?\n", orig, new);
> > +		return -EBUSY;
> > +	}
> > +	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
> > +		dev_warn(&pdev->dev, "Device does not support "
> > +			 "disabling interrupts: unable to bind.\n");
> > +		return -ENODEV;
> > +	}
> > +	/* Now restore the original value. */
> > +	pci_write_config_word(pdev, PCI_COMMAND, orig);
> > +	return 0;
> > +}
> > +
> > +static int vfio_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > +{
> > +	struct vfio_dev *vdev;
> > +	int err;
> > +	u8 type;
> > +
> > +	if (!iommu_found())
> > +		return -EINVAL;
> > +
> > +	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
> > +	if ((type & 0x7F) != PCI_HEADER_TYPE_NORMAL)
> > +		return -EINVAL;
> > +
> > +	err = verify_pci_2_3(pdev);
> > +	if (err)
> > +		return err;
> > +
> > +	vdev = kzalloc(sizeof(struct vfio_dev), GFP_KERNEL);
> > +	if (!vdev)
> > +		return -ENOMEM;
> > +	vdev->pdev = pdev;
> > +
> > +	mutex_init(&vdev->lgate);
> > +	mutex_init(&vdev->dgate);
> > +	mutex_init(&vdev->igate);
> > +	mutex_init(&vdev->ngate);
> > +	INIT_LIST_HEAD(&vdev->nlc_list);
> > +	init_waitqueue_head(&vdev->dev_idle_q);
> > +	init_waitqueue_head(&vdev->nl_wait_q);
> > +
> > +	err = vfio_get_devnum(vdev);
> > +	if (err < 0)
> > +		goto err_get_devnum;
> > +	vdev->devnum = err;
> > +	err = 0;
> > +
> > +	sprintf(vdev->name, "vfio%d", MINOR(vdev->devnum));
> > +	pci_set_drvdata(pdev, vdev);
> > +	vdev->dev = device_create(vfio_class->class, &pdev->dev,
> > +			  vdev->devnum, vdev, vdev->name);
> > +	if (IS_ERR(vdev->dev)) {
> > +		printk(KERN_ERR "VFIO: device register failed\n");
> > +		err = PTR_ERR(vdev->dev);
> > +		goto err_device_create;
> > +	}
> > +
> > +	err = vfio_dev_add_attributes(vdev);
> > +	if (err)
> > +		goto err_vfio_dev_add_attributes;
> > +
> > +
> > +	if (pdev->irq > 0) {
> > +		err = request_irq(pdev->irq, vfio_interrupt,
> > +				  IRQF_SHARED, vdev->name, vdev);
> > +		if (err)
> > +			goto err_request_irq;
> 
> Since this is a shared interrupt, you will get called
> even if MSI in the device is enabled, which will confuse
> users. How about requesting the irq upon an ioctl?
OK, now requested at ioctl and freed on release.

> 
> > +	}
> > +
> > +	return 0;
> > +
> > +err_request_irq:
> > +err_vfio_dev_add_attributes:
> > +	device_destroy(vfio_class->class, vdev->devnum);
> > +err_device_create:
> > +	vfio_free_minor(vdev);
> > +err_get_devnum:
> > +	kfree(vdev);
> > +	return err;
> > +}
> > +
> > +static void vfio_remove(struct pci_dev *pdev)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +	int ret;
> > +
> > +	/* prevent further opens */
> > +	vfio_free_minor(vdev);
> > +
> > +	/* notify users */
> > +	ret = vfio_nl_remove(vdev);
> > +
> > +	/* wait for all closed */
> > +	wait_event(vdev->dev_idle_q, vdev->listeners == 0);
> > +
> > +	pci_disable_device(pdev);
> > +	if (pdev->irq > 0)
> > +		free_irq(pdev->irq, vdev);
> > +
> > +	vfio_nl_freeclients(vdev);
> > +	device_destroy(vfio_class->class, vdev->devnum);
> > +	pci_set_drvdata(pdev, NULL);
> > +	kfree(vdev);
> > +}
> > +
> > +static struct pci_error_handlers vfio_error_handlers = {
> > +	.error_detected	= vfio_error_detected,
> > +	.mmio_enabled	= vfio_mmio_enabled,
> > +	.link_reset	= vfio_link_reset,
> > +	.slot_reset	= vfio_slot_reset,
> > +	.resume		= vfio_error_resume,
> > +};
> > +
> > +static struct pci_driver driver = {
> > +	.name		= "vfio",
> > +	.id_table	= NULL, /* only dynamic id's */
> > +	.probe		 = vfio_probe,
> > +	.remove		 = vfio_remove,
> > +	.err_handler	 = &vfio_error_handlers,
> > +};
> > +
> > +static atomic_t vfio_pm_suspend_count;
> > +static int vfio_pm_suspend_result;
> > +static DECLARE_WAIT_QUEUE_HEAD(vfio_pm_wait_q);
> > +
> > +/*
> > + * Notify user level drivers of hibernation/suspend request
> > + * Send all the notifies in parallel, collect all the replies
> > + * If one ULD can't suspend, none can
> > + */
> > +static int vfio_pm_suspend(void)
> > +{
> > +	struct vfio_dev *vdev;
> > +	int id, alive = 0;
> > +	int ret;
> > +
> > +	mutex_lock(&vfio_minor_lock);
> > +	atomic_set(&vfio_pm_suspend_count, 0);
> > +	vfio_pm_suspend_result = NOTIFY_DONE;
> > +	for (id = 0; id <= vfio_max_minor; id++) {
> > +		vdev = idr_find(&vfio_idr, id);
> > +		if (vdev == NULL)
> > +			continue;
> > +		if (vdev->listeners == 0)
> > +			continue;
> > +		alive++;
> > +		ret = vfio_nl_upcall(vdev, VFIO_MSG_PM_SUSPEND, 0, 0);
> > +		if (ret == 0)
> > +			atomic_inc(&vfio_pm_suspend_count);
> > +	}
> > +	mutex_unlock(&vfio_minor_lock);
> > +	if (alive > atomic_read(&vfio_pm_suspend_count))
> > +		return NOTIFY_BAD;
> > +
> > +	/* sleep for reply */
> > +	if (wait_event_interruptible_timeout(vfio_pm_wait_q,
> > +	    (atomic_read(&vfio_pm_suspend_count) == 0),
> > +	    VFIO_SUSPEND_REPLY_TIMEOUT) <= 0) {
> > +		printk(KERN_ERR "vfio upcall suspend reply timeout\n");
> > +		return NOTIFY_BAD;
> > +	}
> > +	return vfio_pm_suspend_result;
> > +}
> > +
> > +static int vfio_pm_resume(void)
> > +{
> > +	struct vfio_dev *vdev;
> > +	int id;
> > +
> > +	mutex_lock(&vfio_minor_lock);
> > +	for (id = 0; id <= vfio_max_minor; id++) {
> > +		vdev = idr_find(&vfio_idr, id);
> > +		if (vdev == NULL)
> > +			continue;
> > +		if (vdev->listeners == 0)
> > +			continue;
> > +		(void) vfio_nl_upcall(vdev, VFIO_MSG_PM_RESUME, 0, 0);
> > +	}
> > +	mutex_unlock(&vfio_minor_lock);
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +
> > +void vfio_pm_process_reply(int reply)
> > +{
> > +	if (vfio_pm_suspend_result == NOTIFY_DONE) {
> > +		if (reply != NOTIFY_DONE)
> > +			vfio_pm_suspend_result = NOTIFY_BAD;
> > +	}
> > +	if (atomic_dec_and_test(&vfio_pm_suspend_count))
> > +		wake_up(&vfio_pm_wait_q);
> > +}
> > +
> > +static int vfio_pm_notify(struct notifier_block *this, unsigned long event,
> > +	void *notused)
> > +{
> > +	switch (event) {
> > +	case PM_HIBERNATION_PREPARE:
> > +	case PM_SUSPEND_PREPARE:
> > +		return vfio_pm_suspend();
> > +		break;
> > +	case PM_POST_HIBERNATION:
> > +	case PM_POST_SUSPEND:
> > +		return vfio_pm_resume();
> > +		break;
> > +	default:
> > +		return NOTIFY_DONE;
> > +	}
> > +}
> > +
> > +struct notifier_block vfio_pm_nb = {
> > +	.notifier_call = vfio_pm_notify,
> > +};
> > +
> > +static int __init init(void)
> > +{
> > +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> > +	vfio_class_init();
> > +	vfio_nl_init();
> > +	register_pm_notifier(&vfio_pm_nb);
> > +	return pci_register_driver(&driver);
> > +}
> > +
> > +static void __exit cleanup(void)
> > +{
> > +	if (vfio_major >= 0)
> > +		unregister_chrdev(vfio_major, "vfio");
> > +	pci_unregister_driver(&driver);
> > +	unregister_pm_notifier(&vfio_pm_nb);
> > +	unregister_pm_notifier(&vfio_pm_nb);
> > +	vfio_nl_exit();
> > +	vfio_class_destroy();
> > +}
> > +
> > +module_init(init);
> > +module_exit(cleanup);
> > +
> > +MODULE_VERSION(DRIVER_VERSION);
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_AUTHOR(DRIVER_AUTHOR);
> > +MODULE_DESCRIPTION(DRIVER_DESC);
> > diff --git a/drivers/vfio/vfio_netlink.c b/drivers/vfio/vfio_netlink.c
> > new file mode 100644
> > index 0000000..bc9a7d3
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_netlink.c
> > @@ -0,0 +1,459 @@
> > +/*
> > + * Netlink interface for VFIO
> > + * Author: Tom Lyon (pugs@xxxxxxxxx)
> > + *
> > + * Copyright 2010, Cisco Systems, Inc.
> > + * Copyright 2007, 2008 Siemens AG
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2
> > + * as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, write to the Free Software Foundation, Inc.,
> > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> > + *
> > + * Derived from net/ieee802154/netlink.c Written by:
> > + * Sergey Lapin <slapin@xxxxxxxxxxx>
> > + * Dmitry Eremin-Solenikov <dbaryshkov@xxxxxxxxx>
> > + * Maxim Osipov <maxim.osipov@xxxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * This code handles the signaling of various system events
> > + * to the user level driver, using the generic netlink facilities.
> > + * In many cases, we wait for replies from the user driver as well.
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/gfp.h>
> > +#include <linux/pci.h>
> > +#include <linux/sched.h>
> > +#include <net/genetlink.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/vfio.h>
> > +
> > +static u32 vfio_seq_num;
> > +static DEFINE_SPINLOCK(vfio_seq_lock);
> > +
> > +struct genl_family vfio_nl_family = {
> > +	.id		= GENL_ID_GENERATE,
> > +	.hdrsize	= 0,
> > +	.name		= VFIO_GENL_NAME,
> > +	.version	= 1,
> > +	.maxattr	= VFIO_NL_ATTR_MAX,
> > +};
> > +
> > +/* Requests to userspace */
> > +struct sk_buff *vfio_nl_create(u8 req)
> > +{
> > +	void *hdr;
> > +	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
> > +	unsigned long f;
> > +
> > +	if (!msg)
> > +		return NULL;
> > +
> > +	spin_lock_irqsave(&vfio_seq_lock, f);
> > +	hdr = genlmsg_put(msg, 0, ++vfio_seq_num,
> > +			&vfio_nl_family, 0, req);
> > +	spin_unlock_irqrestore(&vfio_seq_lock, f);
> > +	if (!hdr) {
> > +		nlmsg_free(msg);
> > +		return NULL;
> > +	}
> > +
> > +	return msg;
> > +}
> > +
> > +/*
> > + * We would have liked to use NL multicast, but
> > + * (a) multicast sockets are only for root
> > + * (b) there's no multicast user level api in libnl
> > + * (c) we need to know what net namespaces are involved
> > + * Sigh.
> > + */
> > +int vfio_nl_mcast(struct vfio_dev *vdev, struct sk_buff *msg, u8 type)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_nl_client *nlc;
> > +	struct sk_buff *skb;
> > +	/* XXX: nlh is right at the start of msg */
> > +	void *hdr = genlmsg_data(NLMSG_DATA(msg->data));
> > +	int good = 0;
> > +	int rc;
> > +
> > +	if (genlmsg_end(msg, hdr) < 0) {
> > +		nlmsg_free(msg);
> > +		return -ENOBUFS;
> > +	}
> > +
> > +	mutex_lock(&vdev->ngate);
> > +	list_for_each(pos, &vdev->nlc_list) {
> > +		nlc = list_entry(pos, struct vfio_nl_client, list);
> > +		if (nlc->msgcap & (1LL << type)) {
> > +			skb = skb_copy(msg, GFP_KERNEL);
> > +			if (skb == NULL)  {
> > +				rc = -ENOBUFS;
> > +				goto out;
> > +			}
> > +			rc = genlmsg_unicast(nlc->net, skb, nlc->pid);
> > +			if (rc == 0)
> > +				good++;
> > +		}
> > +	}
> > +	rc = 0;
> > +out:
> > +	mutex_unlock(&vdev->ngate);
> > +	nlmsg_free(msg);
> > +	if (good)
> > +		return good;
> > +	return rc;
> > +}
> > +
> > +#ifdef notdef
> > +struct sk_buff *vfio_nl_new_reply(struct genl_info *info,
> > +		int flags, u8 req)
> > +{
> > +	void *hdr;
> > +	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
> > +
> > +	if (!msg)
> > +		return NULL;
> > +
> > +	hdr = genlmsg_put_reply(msg, info,
> > +			&vfio_nl_family, flags, req);
> > +	if (!hdr) {
> > +		nlmsg_free(msg);
> > +		return NULL;
> > +	}
> > +
> > +	return msg;
> > +}
> > +
> > +int vfio_nl_reply(struct sk_buff *msg, struct genl_info *info)
> > +{
> > +	/* XXX: nlh is right at the start of msg */
> > +	void *hdr = genlmsg_data(NLMSG_DATA(msg->data));
> > +
> > +	if (genlmsg_end(msg, hdr) < 0)
> > +		goto out;
> > +
> > +	return genlmsg_reply(msg, info);
> > +out:
> > +	nlmsg_free(msg);
> > +	return -ENOBUFS;
> > +}
> > +#endif
> > +
> > +
> > +static const struct nla_policy vfio_nl_reg_policy[VFIO_NL_ATTR_MAX+1] = {
> > +	[VFIO_ATTR_MSGCAP]	= { .type = NLA_U64 },
> > +	[VFIO_ATTR_PCI_DOMAIN]	= { .type = NLA_U32 },
> > +	[VFIO_ATTR_PCI_BUS]	= { .type = NLA_U16 },
> > +	[VFIO_ATTR_PCI_SLOT]	= { .type = NLA_U8 },
> > +	[VFIO_ATTR_PCI_FUNC]	= { .type = NLA_U8 },
> > +};
> > +
> > +struct vfio_dev *vfio_nl_get_vdev(struct genl_info *info)
> > +{
> > +	u32 domain;
> > +	u16 bus;
> > +	u8 slot, func;
> > +	u16 devfn;
> > +	struct pci_dev *pdev;
> > +	struct vfio_dev *vdev;
> > +
> > +	domain = nla_get_u32(info->attrs[VFIO_ATTR_PCI_DOMAIN]);
> > +	bus = nla_get_u16(info->attrs[VFIO_ATTR_PCI_BUS]);
> > +	slot = nla_get_u8(info->attrs[VFIO_ATTR_PCI_SLOT]);
> > +	func = nla_get_u8(info->attrs[VFIO_ATTR_PCI_FUNC]);
> > +	devfn = PCI_DEVFN(slot, func);
> > +	pdev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> > +	if (pdev == NULL)
> > +		return NULL;
> > +	vdev = pci_get_drvdata(pdev);
> > +	if (vdev == NULL)
> > +		return NULL;
> > +	if (vfio_validate(vdev))
> > +		return NULL;
> > +	if (vdev->pdev != pdev || strncmp(vdev->name, "vfio", 4))
> > +		return NULL;
> > +	return vdev;
> > +}
> > +
> > +/*
> > + * The user driver must register here with a bitmask of which
> > + * events it is interested in receiving
> > + */
> > +static int vfio_nl_user_register(struct sk_buff *skb, struct genl_info *info)
> > +{
> > +	u64 msgcap;
> > +	struct list_head *pos;
> > +	struct vfio_nl_client *nlc;
> > +	int rc = 0;
> > +	struct vfio_dev *vdev;
> > +
> > +	msgcap = nla_get_u64(info->attrs[VFIO_ATTR_MSGCAP]);
> > +	if (msgcap == 0)
> > +		return -EINVAL;
> > +	vdev = vfio_nl_get_vdev(info);
> > +	if (vdev == NULL)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&vdev->ngate);
> > +	list_for_each(pos, &vdev->nlc_list) {
> > +		nlc = list_entry(pos, struct vfio_nl_client, list);
> > +		if (nlc->pid == info->snd_pid &&
> > +		    nlc->net == info->_net)	/* already here */
> > +			goto update;
> > +	}
> > +	nlc = kzalloc(sizeof(struct vfio_nl_client), GFP_KERNEL);
> > +	if (nlc == NULL) {
> > +		rc = -ENOMEM;
> > +		goto out;
> > +	}
> > +	nlc->pid = info->snd_pid;
> > +	nlc->net = info->_net;
> > +	list_add(&nlc->list, &vdev->nlc_list);
> > +update:
> > +	nlc->msgcap = msgcap;
> > +out:
> > +	mutex_unlock(&vdev->ngate);
> > +	return rc;
> > +}
> > +
> > +static const struct nla_policy vfio_nl_err_policy[VFIO_NL_ATTR_MAX+1] = {
> > +	[VFIO_ATTR_ERROR_HANDLING_REPLY] = { .type = NLA_U32 },
> > +	[VFIO_ATTR_PCI_DOMAIN]	= { .type = NLA_U32 },
> > +	[VFIO_ATTR_PCI_BUS]	= { .type = NLA_U16 },
> > +	[VFIO_ATTR_PCI_SLOT]	= { .type = NLA_U8 },
> > +	[VFIO_ATTR_PCI_FUNC]	= { .type = NLA_U8 },
> > +};
> > +
> > +static int vfio_nl_error_handling_reply(struct sk_buff *skb,
> > +					struct genl_info *info)
> > +{
> > +	u32 value, seq;
> > +	struct vfio_dev *vdev;
> > +
> > +	value = nla_get_u32(info->attrs[VFIO_ATTR_ERROR_HANDLING_REPLY]);
> > +	vdev = vfio_nl_get_vdev(info);
> > +	if (vdev == NULL)
> > +		return -EINVAL;
> > +	seq = nlmsg_hdr(skb)->nlmsg_seq;
> > +	if (seq > vdev->nl_reply_seq) {
> > +		vdev->nl_reply_value = value;
> > +		vdev->nl_reply_seq = seq;
> > +		wake_up(&vdev->nl_wait_q);
> > +	}
> > +	return 0;
> > +}
> > +
> > +static const struct nla_policy vfio_nl_pm_policy[VFIO_NL_ATTR_MAX+1] = {
> > +	[VFIO_ATTR_PM_SUSPEND_REPLY] = { .type = NLA_U32 },
> > +	[VFIO_ATTR_PCI_DOMAIN]	= { .type = NLA_U32 },
> > +	[VFIO_ATTR_PCI_BUS]	= { .type = NLA_U16 },
> > +	[VFIO_ATTR_PCI_SLOT]	= { .type = NLA_U8 },
> > +	[VFIO_ATTR_PCI_FUNC]	= { .type = NLA_U8 },
> > +};
> > +
> > +static int vfio_nl_pm_suspend_reply(struct sk_buff *skb, struct genl_info *info)
> > +{
> > +	u32 value;
> > +	struct vfio_dev *vdev;
> > +
> > +	value = nla_get_u32(info->attrs[VFIO_ATTR_PM_SUSPEND_REPLY]);
> > +	vdev = vfio_nl_get_vdev(info);
> > +	if (vdev == NULL)
> > +		return -EINVAL;
> > +	if (vdev->listeners == 0)
> > +		return -EINVAL;
> > +	vfio_pm_process_reply(value);
> > +	return 0;
> > +}
> > +
> > +void vfio_nl_freeclients(struct vfio_dev *vdev)
> > +{
> > +	struct list_head *pos, *pos2;
> > +	struct vfio_nl_client *nlc;
> > +
> > +	mutex_lock(&vdev->ngate);
> > +	list_for_each_safe(pos, pos2, &vdev->nlc_list) {
> > +		nlc = list_entry(pos, struct vfio_nl_client, list);
> > +		list_del(&nlc->list);
> > +		kfree(nlc);
> > +	}
> > +	mutex_unlock(&vdev->ngate);
> > +}
> > +
> > +static struct genl_ops vfio_nl_reg_ops = {
> > +	.cmd	= VFIO_MSG_REGISTER,
> > +	.doit	= vfio_nl_user_register,
> > +	.policy	= vfio_nl_reg_policy,
> > +};
> > +
> > +static struct genl_ops vfio_nl_err_ops = {
> > +	.cmd	= VFIO_MSG_ERROR_HANDLING_REPLY,
> > +	.doit	= vfio_nl_error_handling_reply,
> > +	.policy	= vfio_nl_err_policy,
> > +};
> > +
> > +static struct genl_ops vfio_nl_pm_ops = {
> > +	.cmd	= VFIO_MSG_PM_SUSPEND_REPLY,
> > +	.doit	= vfio_nl_pm_suspend_reply,
> > +	.policy	= vfio_nl_pm_policy,
> > +};
> > +
> > +int vfio_nl_init(void)
> > +{
> > +	int rc;
> > +
> > +	rc = genl_register_family(&vfio_nl_family);
> > +	if (rc)
> > +		goto fail;
> > +
> > +	rc = genl_register_ops(&vfio_nl_family, &vfio_nl_reg_ops);
> > +	if (rc < 0)
> > +		goto fail;
> > +	rc = genl_register_ops(&vfio_nl_family, &vfio_nl_err_ops);
> > +	if (rc < 0)
> > +		goto fail;
> > +	rc = genl_register_ops(&vfio_nl_family, &vfio_nl_pm_ops);
> > +	if (rc < 0)
> > +		goto fail;
> > +	return 0;
> > +
> > +fail:
> > +	genl_unregister_family(&vfio_nl_family);
> > +	return rc;
> > +}
> > +
> > +void vfio_nl_exit(void)
> > +{
> > +	genl_unregister_family(&vfio_nl_family);
> > +}
> > +
> > +int vfio_nl_remove(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	struct sk_buff *msg;
> > +	int rc;
> > +
> > +	msg = vfio_nl_create(VFIO_MSG_REMOVE);
> > +	if (!msg)
> > +		return -ENOBUFS;
> > +
> > +	NLA_PUT_U32(msg, VFIO_ATTR_PCI_DOMAIN, pci_domain_nr(pdev->bus));
> > +	NLA_PUT_U16(msg, VFIO_ATTR_PCI_BUS, pdev->bus->number);
> > +	NLA_PUT_U8(msg, VFIO_ATTR_PCI_SLOT, PCI_SLOT(pdev->devfn));
> > +	NLA_PUT_U8(msg, VFIO_ATTR_PCI_FUNC, PCI_FUNC(pdev->devfn));
> > +
> > +	rc = vfio_nl_mcast(vdev, msg, VFIO_MSG_REMOVE);
> > +	if (rc > 0)
> > +		rc = 0;
> > +	return rc;
> > +
> > +nla_put_failure:
> > +	nlmsg_free(msg);
> > +	return -ENOBUFS;
> > +}
> > +
> > +int vfio_nl_upcall(struct vfio_dev *vdev, u8 type, int state, int waitret)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	struct sk_buff *msg;
> > +	u32 seq;
> > +
> > +	msg = vfio_nl_create(type);
> > +	if (!msg)
> > +		goto null_out;
> > +	seq = nlmsg_hdr(msg)->nlmsg_seq;
> > +
> > +	NLA_PUT_U32(msg, VFIO_ATTR_PCI_DOMAIN, pci_domain_nr(pdev->bus));
> > +	NLA_PUT_U16(msg, VFIO_ATTR_PCI_BUS, pdev->bus->number);
> > +	NLA_PUT_U8(msg, VFIO_ATTR_PCI_SLOT, PCI_SLOT(pdev->devfn));
> > +	NLA_PUT_U8(msg, VFIO_ATTR_PCI_FUNC, PCI_FUNC(pdev->devfn));
> > +
> > +	if (type == VFIO_MSG_ERROR_DETECTED)
> > +		NLA_PUT_U32(msg, VFIO_ATTR_CHANNEL_STATE, state);
> > +
> > +	if (vfio_nl_mcast(vdev, msg, type) <= 0)
> > +		goto null_out;
> > +	if (!waitret)
> > +		return 0;
> > +
> > +	/* sleep for reply */
> > +	if (wait_event_interruptible_timeout(vdev->nl_wait_q,
> > +	    (vdev->nl_reply_seq >= seq), VFIO_ERROR_REPLY_TIMEOUT) <= 0) {
> > +		printk(KERN_ERR "vfio upcall timeout\n");
> > +		goto null_out;
> > +	}
> > +	if (seq != vdev->nl_reply_seq)
> > +		goto null_out;
> > +	return vdev->nl_reply_value;
> > +
> > +nla_put_failure:
> > +	nlmsg_free(msg);
> > +null_out:
> > +	return -1;
> > +}
> > +
> > +/* the following routines invoked for pci error handling */
> > +
> > +pci_ers_result_t vfio_error_detected(struct pci_dev *pdev,
> > +					pci_channel_state_t state)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +	int ret;
> > +
> > +	ret = vfio_nl_upcall(vdev, VFIO_MSG_ERROR_DETECTED, (int)state, 1);
> > +	if (ret >= 0)
> > +		return ret;
> > +	return PCI_ERS_RESULT_NONE;
> > +}
> > +
> > +pci_ers_result_t vfio_mmio_enabled(struct pci_dev *pdev)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +	int ret;
> > +
> > +	ret = vfio_nl_upcall(vdev, VFIO_MSG_MMIO_ENABLED, 0, 1);
> > +	if (ret >= 0)
> > +		return ret;
> > +	return PCI_ERS_RESULT_NONE;
> > +}
> > +
> > +pci_ers_result_t vfio_link_reset(struct pci_dev *pdev)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +	int ret;
> > +
> > +	ret = vfio_nl_upcall(vdev, VFIO_MSG_LINK_RESET, 0, 1);
> > +	if (ret >= 0)
> > +		return ret;
> > +	return PCI_ERS_RESULT_NONE;
> > +}
> > +
> > +pci_ers_result_t vfio_slot_reset(struct pci_dev *pdev)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +	int ret;
> > +
> > +	ret = vfio_nl_upcall(vdev, VFIO_MSG_SLOT_RESET, 0, 1);
> > +	if (ret >= 0)
> > +		return ret;
> > +	return PCI_ERS_RESULT_NONE;
> > +}
> > +
> > +void vfio_error_resume(struct pci_dev *pdev)
> > +{
> > +	struct vfio_dev *vdev = pci_get_drvdata(pdev);
> > +
> > +	(void) vfio_nl_upcall(vdev, VFIO_MSG_ERROR_RESUME, 0, 0);
> > +}
> > diff --git a/drivers/vfio/vfio_pci_config.c b/drivers/vfio/vfio_pci_config.c
> > new file mode 100644
> > index 0000000..b7de0bf
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_pci_config.c
> > @@ -0,0 +1,698 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * This code handles reading and writing of PCI configuration registers.
> > + * This is hairy because we want to allow a lot of flexibility to the
> > + * user driver, but cannot trust it with all of the config fields.
> > + * Tables determine which fields can be read and written, as well as
> > + * which fields are 'virtualized' - special actions and translations to
> > + * make it appear to the user that he has control, when in fact things
> > + * must be negotiated with the underlying OS.
> > + */
> > +
> > +#include <linux/fs.h>
> > +#include <linux/pci.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/vfio.h>
> > +
> > +#define PCI_CAP_ID_BASIC	0
> > +#ifndef PCI_CAP_ID_MAX
> > +#define	PCI_CAP_ID_MAX		PCI_CAP_ID_AF
> > +#endif
> > +
> > +/*
> > + * Lengths of PCI Config Capabilities
> > + * 0 means unknown (but at least 4)
> > + * FF means special/variable
> > + */
> > +static u8 pci_capability_length[] = {
> > +	[PCI_CAP_ID_BASIC]	= 64,		/* pci config header */
> > +	[PCI_CAP_ID_PM]		= PCI_PM_SIZEOF,
> > +	[PCI_CAP_ID_AGP]	= PCI_AGP_SIZEOF,
> > +	[PCI_CAP_ID_VPD]	= 8,
> > +	[PCI_CAP_ID_SLOTID]	= 4,
> > +	[PCI_CAP_ID_MSI]	= 0xFF,		/* 10, 14, 20, or 24 */
> > +	[PCI_CAP_ID_CHSWP]	= 4,
> > +	[PCI_CAP_ID_PCIX]	= 0xFF,		/* 8 or 24 */
> > +	[PCI_CAP_ID_HT]		= 28,
> > +	[PCI_CAP_ID_VNDR]	= 0xFF,
> > +	[PCI_CAP_ID_DBG]	= 0,
> > +	[PCI_CAP_ID_CCRC]	= 0,
> > +	[PCI_CAP_ID_SHPC]	= 0,
> > +	[PCI_CAP_ID_SSVID]	= 0,		/* bridge only - not supp */
> > +	[PCI_CAP_ID_AGP3]	= 0,
> > +	[PCI_CAP_ID_EXP]	= 36,
> > +	[PCI_CAP_ID_MSIX]	= 12,
> > +	[PCI_CAP_ID_AF]		= 6,
> > +};
> > +
> > +/*
> > + * Read/Write Permission Bits - one bit for each bit in capability
> > + * Any field can be read if it exists,
> > + * but what is read depends on whether the field
> > + * is 'virtualized', or just pass thru to the hardware.
> > + * Any virtualized field is also virtualized for writes.
> > + * Writes are only permitted if they have a 1 bit here.
> > + */
> > +struct perm_bits {
> > +	u32	rvirt;		/* read bits which must be virtualized */
> > +	u32	write;		/* writeable bits - virt if read virt */
> > +};
> > +
> > +static struct perm_bits pci_cap_basic_perm[] = {
> > +	{ 0xFFFFFFFF,	0, },		/* 0x00 vendor & device id - RO */
> > +	{ 0x00000003,	0xFFFFFFFF, },	/* 0x04 cmd - mem & io bits virt */
> > +	{ 0,		0, },		/* 0x08 class code & revision id */
> > +	{ 0,		0xFF00FFFF, },	/* 0x0c bist, htype, lat, cache */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x10 bar */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x14 bar */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x18 bar */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x1c bar */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x20 bar */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x24 bar */
> > +	{ 0,		0, },		/* 0x28 cardbus - not yet */
> > +	{ 0,		0, },		/* 0x2c subsys vendor & dev */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x30 rom bar */
> > +	{ 0,		0, },		/* 0x34 capability ptr & resv */
> > +	{ 0,		0, },		/* 0x38 resv */
> > +	{ 0x000000FF,	0x000000FF, },	/* 0x3c max_lat ... irq */
> > +};
> > +
> > +static struct perm_bits pci_cap_pm_perm[] = {
> > +	{ 0,		0, },		/* 0x00 PM capabilities */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x04 PM control/status */
> > +};
> > +
> > +static struct perm_bits pci_cap_vpd_perm[] = {
> > +	{ 0,		0xFFFF0000, },	/* 0x00 address */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x04 data */
> > +};
> > +
> > +static struct perm_bits pci_cap_slotid_perm[] = {
> > +	{ 0,		0, },		/* 0x00 all read only */
> > +};
> > +
> > +/* 4 different possible layouts of MSI capability */
> > +static struct perm_bits pci_cap_msi_10_perm[] = {
> > +	{ 0x00FF0000,	0x00FF0000, },	/* 0x00 MSI message control */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x04 MSI message address */
> > +	{ 0x0000FFFF,	0x0000FFFF, },	/* 0x08 MSI message data */
> > +};
> > +static struct perm_bits pci_cap_msi_14_perm[] = {
> > +	{ 0x00FF0000,	0x00FF0000, },	/* 0x00 MSI message control */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x04 MSI message address */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x08 MSI message upper addr */
> > +	{ 0x0000FFFF,	0x0000FFFF, },	/* 0x0c MSI message data */
> > +};
> > +static struct perm_bits pci_cap_msi_20_perm[] = {
> > +	{ 0x00FF0000,	0x00FF0000, },	/* 0x00 MSI message control */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x04 MSI message address */
> > +	{ 0x0000FFFF,	0x0000FFFF, },	/* 0x08 MSI message data */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x0c MSI mask bits */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x10 MSI pending bits */
> > +};
> > +static struct perm_bits pci_cap_msi_24_perm[] = {
> > +	{ 0x00FF0000,	0x00FF0000, },	/* 0x00 MSI message control */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x04 MSI message address */
> > +	{ 0xFFFFFFFF,	0xFFFFFFFF, },	/* 0x08 MSI message upper addr */
> > +	{ 0x0000FFFF,	0x0000FFFF, },	/* 0x0c MSI message data */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x10 MSI mask bits */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x14 MSI pending bits */
> > +};
> > +
> > +static struct perm_bits pci_cap_pcix_perm[] = {
> > +	{ 0,		0xFFFF0000, },	/* 0x00 PCI_X_CMD */
> > +	{ 0,		0, },		/* 0x04 PCI_X_STATUS */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x08 ECC ctlr & status */
> > +	{ 0,		0, },		/* 0x0c ECC first addr */
> > +	{ 0,		0, },		/* 0x10 ECC second addr */
> > +	{ 0,		0, },		/* 0x14 ECC attr */
> > +};
> > +
> > +/* pci express capabilities */
> > +static struct perm_bits pci_cap_exp_perm[] = {
> > +	{ 0,		0, },		/* 0x00 PCIe capabilities */
> > +	{ 0,		0, },		/* 0x04 PCIe device capabilities */
> > +	{ 0,		0xFFFFFFFF, },	/* 0x08 PCIe device control & status */
> > +	{ 0,		0, },		/* 0x0c PCIe link capabilities */
> > +	{ 0,		0x000000FF, },	/* 0x10 PCIe link ctl/stat - SAFE? */
> > +	{ 0,		0, },		/* 0x14 PCIe slot capabilities */
> > +	{ 0,		0x00FFFFFF, },	/* 0x18 PCIe link ctl/stat - SAFE? */
> > +	{ 0,		0, },		/* 0x1c PCIe root port stuff */
> > +	{ 0,		0, },		/* 0x20 PCIe root port stuff */
> > +};
> > +
> > +static struct perm_bits pci_cap_msix_perm[] = {
> > +	{ 0,		0, },		/* 0x00 MSI-X Enable */
> > +	{ 0,		0, },		/* 0x04 table offset & bir */
> > +	{ 0,		0, },		/* 0x08 pba offset & bir */
> > +};
> > +
> > +static struct perm_bits pci_cap_af_perm[] = {
> > +	{ 0,		0, },		/* 0x00 af capability */
> > +	{ 0,		0x0001,	 },	/* 0x04 af flr bit */
> > +};
> > +
> > +static struct perm_bits *pci_cap_perms[] = {
> > +	[PCI_CAP_ID_BASIC]	= pci_cap_basic_perm,
> > +	[PCI_CAP_ID_PM]		= pci_cap_pm_perm,
> > +	[PCI_CAP_ID_VPD]	= pci_cap_vpd_perm,
> > +	[PCI_CAP_ID_SLOTID]	= pci_cap_slotid_perm,
> > +	[PCI_CAP_ID_MSI]	= NULL,			/* special */
> > +	[PCI_CAP_ID_PCIX]	= pci_cap_pcix_perm,
> > +	[PCI_CAP_ID_EXP]	= pci_cap_exp_perm,
> > +	[PCI_CAP_ID_MSIX]	= pci_cap_msix_perm,
> > +	[PCI_CAP_ID_AF]		= pci_cap_af_perm,
> > +};
> > +
> > +static int vfio_msi_cap_len(struct vfio_dev *vdev, u8 pos)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int len;
> > +	int ret;
> > +	u16 flags;
> > +
> > +	ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
> > +	if (ret < 0)
> > +		return ret;
> > +	if (flags & PCI_MSI_FLAGS_64BIT)
> > +		len = 14;
> > +	else
> > +		len = 10;
> > +	if (flags & PCI_MSI_FLAGS_MASKBIT)
> > +		len += 10;
> > +
> > +	switch (len) {
> > +	case 10:
> > +		vdev->msi_perm = pci_cap_msi_10_perm;
> > +		break;
> > +	case 14:
> > +		vdev->msi_perm = pci_cap_msi_14_perm;
> > +		break;
> > +	case 20:
> > +		vdev->msi_perm = pci_cap_msi_20_perm;
> > +		break;
> > +	case 24:
> > +		vdev->msi_perm = pci_cap_msi_24_perm;
> > +		break;
> > +	}
> > +	return len;
> > +}
> > +
> > +/*
> > + * We build a map of the config space that tells us where
> > + * and what capabilities exist, so that we can map reads and
> > + * writes back to capabilities, and thus figure out what to
> > + * allow, deny, or virtualize
> > + */
> > +int vfio_build_config_map(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	u8 *map;
> > +	int i, len;
> > +	u8 pos, cap, tmp;
> > +	u16 flags;
> > +	int ret;
> > +#ifndef PCI_FIND_CAP_TTL
> > +#define PCI_FIND_CAP_TTL	48
> > +#endif
> > +	int loops = PCI_FIND_CAP_TTL;
> > +
> > +	map = kmalloc(pdev->cfg_size, GFP_KERNEL);
> > +	if (map == NULL)
> > +		return -ENOMEM;
> > +	for (i = 0; i < pdev->cfg_size; i++)
> > +		map[i] = 0xFF;
> > +	vdev->pci_config_map = map;
> > +
> > +	/* default config space */
> > +	for (i = 0; i < pci_capability_length[0]; i++)
> > +		map[i] = 0;
> > +
> > +	/* any capabilities? */
> > +	ret = pci_read_config_word(pdev, PCI_STATUS, &flags);
> > +	if (ret < 0)
> > +		return ret;
> > +	if ((flags & PCI_STATUS_CAP_LIST) == 0)
> > +		return 0;
> > +
> > +	ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
> > +	if (ret < 0)
> > +		return ret;
> > +	while (pos && --loops > 0) {
> > +		ret = pci_read_config_byte(pdev, pos, &cap);
> > +		if (ret < 0)
> > +			return ret;
> > +		if (cap == 0) {
> > +			printk(KERN_WARNING "%s: cap 0\n", __func__);
> > +			break;
> > +		}
> > +		if (cap > PCI_CAP_ID_MAX) {
> > +			printk(KERN_WARNING "%s: unknown pci capability id %x\n",
> > +					__func__, cap);
> > +			len = 0;
> > +		} else
> > +			len = pci_capability_length[cap];
> > +		if (len == 0) {
> > +			printk(KERN_WARNING "%s: unknown length for pci cap %x\n",
> > +					__func__, cap);
> > +			len = 4;
> > +		}
> > +		if (len == 0xFF) {
> > +			switch (cap) {
> > +			case PCI_CAP_ID_MSI:
> > +				len = vfio_msi_cap_len(vdev, pos);
> > +				if (len < 0)
> > +					return len;
> > +				break;
> > +			case PCI_CAP_ID_PCIX:
> > +				ret = pci_read_config_word(pdev, pos + 2,
> > +					&flags);
> > +				if (ret < 0)
> > +					return ret;
> > +				if (flags & 0x3000)
> > +					len = 24;
> > +				else
> > +					len = 8;
> > +				break;
> > +			case PCI_CAP_ID_VNDR:
> > +				/* length follows next field */
> > +				ret = pci_read_config_byte(pdev, pos + 2, &tmp);
> > +				if (ret < 0)
> > +					return ret;
> > +				len = tmp;
> > +				break;
> > +			default:
> > +				len = 0;
> > +				break;
> > +			}
> > +		}
> > +
> > +		for (i = 0; i < len; i++) {
> > +			if (map[pos+i] != 0xFF)
> > +				printk(KERN_WARNING
> > +					"%s: pci config conflict at %x, "
> > +					"caps %x %x\n",
> > +					__func__, i, map[pos+i], cap);
> > +			map[pos+i] = cap;
> > +		}
> > +		ret = pci_read_config_byte(pdev, pos + PCI_CAP_LIST_NEXT, &pos);
> > +		if (ret < 0)
> > +			return ret;
> > +	}
> > +	if (loops <= 0)
> > +		printk(KERN_ERR "%s: config space loop!\n", __func__);
> > +	return 0;
> > +}
> > +
> > +static int vfio_virt_init(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	u32 *lp;
> > +	int i;
> > +
> > +	vdev->vconfig = kmalloc(256, GFP_KERNEL);
> > +	if (vdev->vconfig == NULL)
> > +		return -ENOMEM;
> > +
> > +	lp = (u32 *)vdev->vconfig;
> > +	for (i = 0; i < 256/sizeof(u32); i++, lp++)
> > +		pci_read_config_dword(pdev, i * sizeof(u32), lp);
> > +	vdev->bardirty = 1;
> > +
> > +	vdev->rbar[0] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
> > +	vdev->rbar[1] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_1];
> > +	vdev->rbar[2] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_2];
> > +	vdev->rbar[3] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_3];
> > +	vdev->rbar[4] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_4];
> > +	vdev->rbar[5] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_5];
> > +	vdev->rbar[6] = *(u32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
> > +
> > +	/* for sr-iov devices */
> > +	vdev->vconfig[PCI_VENDOR_ID] = pdev->vendor & 0xFF;
> > +	vdev->vconfig[PCI_VENDOR_ID+1] = pdev->vendor >> 8;
> > +	vdev->vconfig[PCI_DEVICE_ID] = pdev->device & 0xFF;
> > +	vdev->vconfig[PCI_DEVICE_ID+1] = pdev->device >> 8;
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Restore the *real* BARs after we detect a backdoor reset.
> > + * (backdoor = some device specific technique that we didn't catch)
> > + */
> > +static void vfio_bar_restore(struct vfio_dev *vdev)
> > +{
> > +	printk(KERN_WARNING "%s: restoring real bars\n", __func__);
> > +
> > +#define do_bar(off, which) \
> > +	pci_user_write_config_dword(vdev->pdev, off, vdev->rbar[which])
> > +
> > +	do_bar(PCI_BASE_ADDRESS_0, 0);
> > +	do_bar(PCI_BASE_ADDRESS_1, 1);
> > +	do_bar(PCI_BASE_ADDRESS_2, 2);
> > +	do_bar(PCI_BASE_ADDRESS_3, 3);
> > +	do_bar(PCI_BASE_ADDRESS_4, 4);
> > +	do_bar(PCI_BASE_ADDRESS_5, 5);
> > +	do_bar(PCI_ROM_ADDRESS, 6);
> > +#undef do_bar
> > +}
> > +
> > +/*
> > + * Pretend we're hardware and tweak the values
> > + * of the *virtual* pci BARs to reflect the hardware
> > + * capabilities
> > + */
> > +static void vfio_bar_fixup(struct vfio_dev *vdev)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int bar;
> > +	u32 *lp;
> > +	u64 mask;
> > +
> > +	for (bar = 0; bar <= 5; bar++) {
> > +		if (pci_resource_start(pdev, bar))
> > +			mask = ~(pci_resource_len(pdev, bar) - 1);
> > +		else
> > +			mask = 0;
> > +		lp = (u32 *)vdev->vconfig + PCI_BASE_ADDRESS_0 + 4*bar;
> > +		*lp &= (u32)mask;
> > +
> > +		if (pci_resource_flags(pdev, bar) & IORESOURCE_IO)
> > +			*lp |= PCI_BASE_ADDRESS_SPACE_IO;
> > +		else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) {
> > +			*lp |= PCI_BASE_ADDRESS_SPACE_MEMORY;
> > +			if (pci_resource_flags(pdev, bar) & IORESOURCE_PREFETCH)
> > +				*lp |= PCI_BASE_ADDRESS_MEM_PREFETCH;
> > +			if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM_64) {
> > +				*lp |= PCI_BASE_ADDRESS_MEM_TYPE_64;
> > +				lp++;
> > +				*lp &= (u32)(mask >> 32);
> > +				bar++;
> > +			}
> > +		}
> > +	}
> > +
> > +	if (pci_resource_start(pdev, PCI_ROM_RESOURCE))
> > +		mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
> > +	else
> > +		mask = 0;
> > +	lp = (u32 *)vdev->vconfig + PCI_ROM_ADDRESS;
> > +	*lp &= (u32)mask;
> > +
> > +	vdev->bardirty = 0;
> > +}
> > +
> > +static inline int vfio_read_config_byte(struct vfio_dev *vdev,
> > +					int pos, u8 *valp)
> > +{
> > +	return pci_user_read_config_byte(vdev->pdev, pos, valp);
> > +}
> > +
> > +static inline int vfio_write_config_byte(struct vfio_dev *vdev,
> > +					int pos, u8 val)
> > +{
> > +	vdev->vconfig[pos] = val;
> > +	return pci_user_write_config_byte(vdev->pdev, pos, val);
> > +}
> > +
> > +static int vfio_config_rwbyte(int write,
> > +				struct vfio_dev *vdev,
> > +				int pos,
> > +				char __user *buf)
> > +{
> > +	u8 *map = vdev->pci_config_map;
> > +	u8 cap, val, newval;
> > +	u16 start, off;
> > +	int p;
> > +	struct perm_bits *perm;
> > +	u8 wr, virt;
> > +	int ret;
> > +
> > +	cap = map[pos];
> > +	if (cap == 0xFF) {	/* unknown region */
> > +		if (write)
> > +			return 0;	/* silent no-op */
> > +		val = 0;
> > +		if (pos <= pci_capability_length[0])	/* ok to read */
> > +			(void) vfio_read_config_byte(vdev, pos, &val);
> > +		if (copy_to_user(buf, &val, 1))
> > +			return -EFAULT;
> > +		return 0;
> > +	}
> > +
> > +	/* scan back to start of cap region */
> > +	for (p = pos; p >= 0; p--) {
> > +		if (map[p] != cap)
> > +			break;
> > +		start = p;
> > +	}
> > +	off = pos - start;	/* offset within capability */
> > +
> > +	if (cap == PCI_CAP_ID_MSI)
> > +		perm = vdev->msi_perm;
> > +	else
> > +		perm = pci_cap_perms[cap];
> > +	if (perm == NULL) {
> > +		wr = 0;
> > +		virt = 0;
> > +	} else {
> > +		perm += (off >> 2);
> > +		wr = perm->write >> ((off & 3) * 8);
> > +		virt = perm->rvirt >> ((off & 3) * 8);
> > +	}
> > +	if (write && !wr)		/* no writeable bits */
> > +		return 0;
> > +	if (!virt) {
> > +		if (write) {
> > +			if (copy_from_user(&val, buf, 1))
> > +				return -EFAULT;
> > +			val &= wr;
> > +			if (wr != 0xFF) {
> > +				u8 existing;
> > +
> > +				ret = vfio_read_config_byte(vdev, pos,
> > +							&existing);
> > +				if (ret < 0)
> > +					return ret;
> > +				val |= (existing & ~wr);
> > +			}
> > +			vfio_write_config_byte(vdev, pos, val);
> > +		} else {
> > +			ret = vfio_read_config_byte(vdev, pos, &val);
> > +			if (ret < 0)
> > +				return ret;
> > +			if (copy_to_user(buf, &val, 1))
> > +				return -EFAULT;
> > +		}
> > +		return 0;
> > +	}
> > +
> > +	if (write) {
> > +		if (copy_from_user(&newval, buf, 1))
> > +			return -EFAULT;
> > +	}
> > +	/*
> > +	 * We get here if there are some virt bits
> > +	 * handle remaining real bits, if any
> > +	 */
> > +	if (~virt) {
> > +		u8 rbits = (~virt) & wr;
> > +
> > +		ret = vfio_read_config_byte(vdev, pos, &val);
> > +		if (ret < 0)
> > +			return ret;
> > +		if (write && rbits) {
> > +			val &= ~rbits;
> > +			val |= (newval & rbits);
> > +			vfio_write_config_byte(vdev, pos, val);
> > +		}
> > +	}
> > +	/*
> > +	 * Now handle entirely virtual fields
> > +	 */
> > +	switch (cap) {
> > +	case PCI_CAP_ID_BASIC:		/* virtualize BARs */
> > +		switch (off) {
> > +		/*
> > +		 * vendor and device are virt because they don't
> > +		 * show up otherwise for sr-iov vfs
> > +		 */
> > +		case PCI_VENDOR_ID:
> > +		case PCI_VENDOR_ID + 1:
> > +		case PCI_DEVICE_ID:
> > +		case PCI_DEVICE_ID + 1:
> > +			/* read only */
> > +			val = vdev->vconfig[pos];
> > +			break;
> > +		case PCI_COMMAND:
> > +			/*
> > +			 * If the real mem or IO enable bits are zero
> > +			 * then there may have been a backdoor reset.
> > +			 * Restore the real BARs before allowing those
> > +			 * bits to re-enable
> > +			 */
> > +			if (vdev->pdev->is_virtfn)
> > +				val |= PCI_COMMAND_MEMORY;
> > +			if (write) {
> > +				int upd = 0;
> > +
> > +				upd = (newval & PCI_COMMAND_MEMORY) >
> > +				      (val & PCI_COMMAND_MEMORY);
> > +				upd += (newval & PCI_COMMAND_IO) >
> > +				       (val & PCI_COMMAND_IO);
> > +				if (upd)
> > +					vfio_bar_restore(vdev);
> > +				vfio_write_config_byte(vdev, pos, newval);
> > +			}
> > +			break;
> > +		case PCI_INTERRUPT_LINE:
> > +			if (write)
> > +				vdev->vconfig[pos] = newval;
> > +			else
> > +				val = vdev->vconfig[pos];
> > +			break;
> > +		case PCI_BASE_ADDRESS_0:
> > +		case PCI_BASE_ADDRESS_0+1:
> > +		case PCI_BASE_ADDRESS_0+2:
> > +		case PCI_BASE_ADDRESS_0+3:
> > +		case PCI_BASE_ADDRESS_1:
> > +		case PCI_BASE_ADDRESS_1+1:
> > +		case PCI_BASE_ADDRESS_1+2:
> > +		case PCI_BASE_ADDRESS_1+3:
> > +		case PCI_BASE_ADDRESS_2:
> > +		case PCI_BASE_ADDRESS_2+1:
> > +		case PCI_BASE_ADDRESS_2+2:
> > +		case PCI_BASE_ADDRESS_2+3:
> > +		case PCI_BASE_ADDRESS_3:
> > +		case PCI_BASE_ADDRESS_3+1:
> > +		case PCI_BASE_ADDRESS_3+2:
> > +		case PCI_BASE_ADDRESS_3+3:
> > +		case PCI_BASE_ADDRESS_4:
> > +		case PCI_BASE_ADDRESS_4+1:
> > +		case PCI_BASE_ADDRESS_4+2:
> > +		case PCI_BASE_ADDRESS_4+3:
> > +		case PCI_BASE_ADDRESS_5:
> > +		case PCI_BASE_ADDRESS_5+1:
> > +		case PCI_BASE_ADDRESS_5+2:
> > +		case PCI_BASE_ADDRESS_5+3:
> > +		case PCI_ROM_ADDRESS:
> > +		case PCI_ROM_ADDRESS+1:
> > +		case PCI_ROM_ADDRESS+2:
> > +		case PCI_ROM_ADDRESS+3:
> > +			if (write) {
> > +				vdev->vconfig[pos] = newval;
> > +				vdev->bardirty = 1;
> > +			} else {
> > +				if (vdev->bardirty)
> > +					vfio_bar_fixup(vdev);
> > +				val = vdev->vconfig[pos];
> > +			}
> > +			break;
> > +		}
> > +		break;
> > +	case PCI_CAP_ID_MSI:		/* virtualize (parts of) MSI */
> > +		if (off == PCI_MSI_FLAGS) {
> > +			u8 num;
> > +
> > +			if (write) {
> > +				if (vdev->ev_msi == NULL)
> > +					newval &= ~PCI_MSI_FLAGS_ENABLE;
> > +				num = (newval & PCI_MSI_FLAGS_QSIZE) >> 4;
> > +				if (num > vdev->msi_qmax)
> > +					num = vdev->msi_qmax;
> > +				newval &= ~PCI_MSI_FLAGS_QSIZE;
> > +				newval |= num << 4;
> > +				vfio_write_config_byte(vdev, pos, newval);
> > +			} else {
> > +				ret = vfio_read_config_byte(vdev, pos, &val);
> > +				if (ret < 0)
> > +					return ret;
> > +				val &= ~PCI_MSI_FLAGS_QMASK;
> > +				val |= vdev->msi_qmax << 1;
> > +			}
> > +		} else {
> > +			if (write)
> > +				vdev->vconfig[pos] = newval;
> > +			else
> > +				val = vdev->vconfig[pos];
> > +		}
> > +		break;
> > +	}
> > +	if (!write && copy_to_user(buf, &val, 1))
> > +		return -EFAULT;
> > +	return 0;
> > +}
> > +
> > +ssize_t vfio_config_readwrite(int write,
> > +		struct vfio_dev *vdev,
> > +		char __user *buf,
> > +		size_t count,
> > +		loff_t *ppos)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int done = 0;
> > +	int ret;
> > +	u16 pos;
> > +
> > +
> > +	if (vdev->pci_config_map == NULL) {
> > +		ret = vfio_build_config_map(vdev);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +	if (vdev->vconfig == NULL) {
> > +		ret = vfio_virt_init(vdev);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	while (count > 0) {
> > +		pos = *ppos;
> > +		if (pos == pdev->cfg_size)
> > +			break;
> > +		if (pos > pdev->cfg_size) {
> > +			ret = -EINVAL;
> > +			goto out;
> > +		}
> > +
> > +		ret = vfio_config_rwbyte(write, vdev, pos, buf);
> > +
> > +		if (ret < 0)
> > +			goto out;
> > +		buf++;
> > +		done++;
> > +		count--;
> > +		(*ppos)++;
> > +	}
> > +	ret = done;
> > +out:
> > +	return ret;
> > +}
> > diff --git a/drivers/vfio/vfio_rdwr.c b/drivers/vfio/vfio_rdwr.c
> > new file mode 100644
> > index 0000000..1fd50a6
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_rdwr.c
> > @@ -0,0 +1,158 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * This code handles normal read and write system calls; allowing
> > + * access to device memory or I/O registers
> > + * without the need for mmap'ing.
> > + */
> > +
> > +#include <linux/fs.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/pci.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/io.h>
> > +
> > +#include <linux/vfio.h>
> > +
> > +ssize_t vfio_io_readwrite(
> > +		int write,
> > +		struct vfio_dev *vdev,
> > +		char __user *buf,
> > +		size_t count,
> > +		loff_t *ppos)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	size_t done = 0;
> > +	resource_size_t end;
> > +	void __iomem *io;
> > +	loff_t pos;
> > +	int pci_space;
> > +	int unit;
> > +
> > +	pci_space = vfio_offset_to_pci_space(*ppos);
> > +	pos = vfio_offset_to_pci_offset(*ppos);
> > +
> > +	if (!pci_resource_start(pdev, pci_space))
> > +		return -EINVAL;
> > +	end = pci_resource_len(pdev, pci_space);
> > +	if (pos + count > end)
> > +		return -EINVAL;
> > +	if (vdev->barmap[pci_space] == NULL)
> > +		vdev->barmap[pci_space] = pci_iomap(pdev, pci_space, 0);
> > +	io = vdev->barmap[pci_space];
> > +
> > +	while (count > 0) {
> > +		if ((pos % 4) == 0 && count >= 4) {
> > +			u32 val;
> > +
> > +			if (write) {
> > +				if (copy_from_user(&val, buf, 4))
> > +					return -EFAULT;
> > +				iowrite32(val, io + pos);
> > +			} else {
> > +				val = ioread32(io + pos);
> > +				if (copy_to_user(buf, &val, 4))
> > +					return -EFAULT;
> > +			}
> > +			unit = 4;
> > +		} else if ((pos % 2) == 0 && count >= 2) {
> > +			u16 val;
> > +
> > +			if (write) {
> > +				if (copy_from_user(&val, buf, 2))
> > +					return -EFAULT;
> > +				iowrite16(val, io + pos);
> > +			} else {
> > +				val = ioread16(io + pos);
> > +				if (copy_to_user(buf, &val, 2))
> > +					return -EFAULT;
> > +			}
> > +			unit = 2;
> > +		} else {
> > +			u8 val;
> > +
> > +			if (write) {
> > +				if (copy_from_user(&val, buf, 1))
> > +					return -EFAULT;
> > +				iowrite8(val, io + pos);
> > +			} else {
> > +				val = ioread8(io + pos);
> > +				if (copy_to_user(buf, &val, 1))
> > +					return -EFAULT;
> > +			}
> > +			unit = 1;
> > +		}
> > +		pos += unit;
> > +		buf += unit;
> > +		count -= unit;
> > +		done += unit;
> > +	}
> > +	*ppos += done;
> > +	return done;
> > +}
> 
> Can we export and use pci_write_legacy_io? Same for read.
> Drivers don't do unaligned accesses, do they?
The PCI legacy I/O routines only exist for weird platforms, not x86.

> 
> > +
> > +ssize_t vfio_mem_readwrite(
> > +		int write,
> > +		struct vfio_dev *vdev,
> > +		char __user *buf,
> > +		size_t count,
> > +		loff_t *ppos)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	resource_size_t end;
> > +	void __iomem *io;
> > +	loff_t pos;
> > +	int pci_space;
> > +
> > +	pci_space = vfio_offset_to_pci_space(*ppos);
> > +	pos = vfio_offset_to_pci_offset(*ppos);
> > +
> > +	if (!pci_resource_start(pdev, pci_space))
> > +		return -EINVAL;
> > +	end = pci_resource_len(pdev, pci_space);
> > +	if (vdev->barmap[pci_space] == NULL)
> > +		vdev->barmap[pci_space] = pci_iomap(pdev, pci_space, 0);
> > +	io = vdev->barmap[pci_space];
> > +
> > +	if (pos > end)
> > +		return -EINVAL;
> > +	if (pos == end)
> > +		return 0;
> > +	if (pos + count > end)
> > +		count = end - pos;
> > +	if (write) {
> > +		if (copy_from_user(io + pos, buf, count))
> > +			return -EFAULT;
> > +	} else {
> > +		if (copy_to_user(buf, io + pos, count))
> > +			return -EFAULT;
> > +	}
> > +	*ppos += count;
> > +	return count;
> > +}
> > diff --git a/drivers/vfio/vfio_sysfs.c b/drivers/vfio/vfio_sysfs.c
> > new file mode 100644
> > index 0000000..a3ddba1
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_sysfs.c
> > @@ -0,0 +1,118 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +
> > +/*
> > + * This code handles vfio related files in sysfs
> > + * (not much useful yet)
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/kobject.h>
> > +#include <linux/sysfs.h>
> > +#include <linux/mm.h>
> > +#include <linux/fs.h>
> > +#include <linux/pci.h>
> > +#include <linux/mmu_notifier.h>
> > +
> > +#include <linux/vfio.h>
> > +
> > +struct vfio_class *vfio_class;
> > +
> > +int vfio_class_init(void)
> > +{
> > +	int ret = 0;
> > +
> > +	if (vfio_class != NULL) {
> > +		kref_get(&vfio_class->kref);
> > +		goto exit;
> > +	}
> > +
> > +	vfio_class = kzalloc(sizeof(*vfio_class), GFP_KERNEL);
> > +	if (!vfio_class) {
> > +		ret = -ENOMEM;
> > +		goto err_kzalloc;
> > +	}
> > +
> > +	kref_init(&vfio_class->kref);
> > +	vfio_class->class = class_create(THIS_MODULE, "vfio");
> > +	if (IS_ERR(vfio_class->class)) {
> > +		ret = IS_ERR(vfio_class->class);
> > +		printk(KERN_ERR "class_create failed for vfio\n");
> > +		goto err_class_create;
> > +	}
> > +	return 0;
> > +
> > +err_class_create:
> > +	kfree(vfio_class);
> > +	vfio_class = NULL;
> > +err_kzalloc:
> > +exit:
> > +	return ret;
> > +}
> > +
> > +static void vfio_class_release(struct kref *kref)
> > +{
> > +	/* Ok, we cheat as we know we only have one vfio_class */
> > +	class_destroy(vfio_class->class);
> > +	kfree(vfio_class);
> > +	vfio_class = NULL;
> > +}
> > +
> > +void vfio_class_destroy(void)
> > +{
> > +	if (vfio_class)
> > +		kref_put(&vfio_class->kref, vfio_class_release);
> > +}
> > +
> > +static ssize_t show_locked_pages(struct device *dev,
> > +				 struct device_attribute *attr,
> > +				 char *buf)
> > +{
> > +	struct vfio_dev *vdev = dev_get_drvdata(dev);
> > +
> > +	if (vdev == NULL)
> > +		return -ENODEV;
> > +	return sprintf(buf, "%u\n", vdev->locked_pages);
> > +}
> > +
> > +static DEVICE_ATTR(locked_pages, S_IRUGO, show_locked_pages, NULL);
> > +
> > +static struct attribute *vfio_attrs[] = {
> > +	&dev_attr_locked_pages.attr,
> > +	NULL,
> > +};
> > +
> > +static struct attribute_group vfio_attr_grp = {
> > +	.attrs = vfio_attrs,
> > +};
> > +
> > +int vfio_dev_add_attributes(struct vfio_dev *vdev)
> > +{
> > +	return sysfs_create_group(&vdev->dev->kobj, &vfio_attr_grp);
> > +}
> > diff --git a/include/linux/Kbuild b/include/linux/Kbuild
> > index 2fc8e14..3121529 100644
> > --- a/include/linux/Kbuild
> > +++ b/include/linux/Kbuild
> > @@ -167,6 +167,7 @@ header-y += ultrasound.h
> >  header-y += un.h
> >  header-y += utime.h
> >  header-y += veth.h
> > +header-y += vfio.h
> >  header-y += videotext.h
> >  header-y += x25.h
> > 
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > new file mode 100644
> > index 0000000..b7dd524
> > --- /dev/null
> > +++ b/include/linux/vfio.h
> > @@ -0,0 +1,267 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@xxxxxxxxx
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@xxxxxxxxxxxxx>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@xxxxxxxxxxxxx>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@xxxxxxxxx>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > + */
> > +#include <linux/types.h>
> > +
> > +/*
> > + * VFIO driver - allow mapping and use of certain PCI devices
> > + * in unprivileged user processes. (If IOMMU is present)
> > + * Especially useful for Virtual Function parts of SR-IOV devices
> > + */
> > +
> > +#ifdef __KERNEL__
> > +
> > +struct vfio_nl_client {
> > +	struct list_head	list;
> > +	u64			msgcap;
> > +	struct net		*net;
> > +	u32			pid;
> > +};
> > +
> > +struct perm_bits;
> > +struct vfio_dev {
> > +	struct device	*dev;
> > +	struct pci_dev	*pdev;
> > +	char		name[8];
> > +	u8		*pci_config_map;
> > +	int		pci_config_size;
> > +	int		devnum;
> > +	void __iomem	*barmap[PCI_ROM_RESOURCE+1];
> > +	spinlock_t	irqlock;	/* guards command register accesses */
> > +	int		listeners;
> > +	u32		locked_pages;
> > +	struct mutex	lgate;		/* listener gate */
> > +	struct mutex	dgate;		/* dma op gate */
> > +	struct mutex	igate;		/* intr op gate */
> > +	struct mutex	ngate;		/* netlink op gate */
> > +	struct list_head nlc_list;	/* netlink clients */
> > +	wait_queue_head_t dev_idle_q;
> > +	wait_queue_head_t nl_wait_q;
> > +	u32		nl_reply_seq;
> > +	u32		nl_reply_value;
> > +	int		mapcount;
> > +	struct uiommu_domain	*udomain;
> > +	int			cachec;
> > +	struct msix_entry	*msix;
> > +	struct eventfd_ctx	*ev_irq;
> > +	struct eventfd_ctx	**ev_msi;
> > +	struct eventfd_ctx	**ev_msix;
> > +	int			msi_nvec;
> > +	int			msix_nvec;
> > +	u8		*vconfig;
> > +	u32		rbar[7];	/* copies of real bars */
> > +	u8		msi_qmax;
> > +	u8		bardirty;
> > +	struct perm_bits	*msi_perm;
> > +};
> > +
> > +struct vfio_listener {
> > +	struct vfio_dev	*vdev;
> > +	struct list_head	dm_list;
> > +	struct mm_struct	*mm;
> > +	struct mmu_notifier	mmu_notifier;
> > +};
> > +
> > +/*
> > + * Structure for keeping track of memory nailed down by the
> > + * user for DMA
> > + */
> > +struct dma_map_page {
> > +	struct list_head list;
> > +	struct page     **pages;
> > +	dma_addr_t      daddr;
> > +	unsigned long	vaddr;
> > +	int		npage;
> > +	int		rdwr;
> > +};
> > +
> > +/* VFIO class infrastructure */
> > +struct vfio_class {
> > +	struct kref kref;
> > +	struct class *class;
> > +};
> > +extern struct vfio_class *vfio_class;
> > +
> > +ssize_t vfio_io_readwrite(int, struct vfio_dev *,
> > +			char __user *, size_t, loff_t *);
> > +ssize_t vfio_mem_readwrite(int, struct vfio_dev *,
> > +			char __user *, size_t, loff_t *);
> > +ssize_t vfio_config_readwrite(int, struct vfio_dev *,
> > +			char __user *, size_t, loff_t *);
> > +
> > +void vfio_drop_msi(struct vfio_dev *);
> > +void vfio_drop_msix(struct vfio_dev *);
> > +int vfio_setup_msi(struct vfio_dev *, int, void __user *);
> > +int vfio_setup_msix(struct vfio_dev *, int, void __user *);
> > +
> > +#ifndef PCI_MSIX_ENTRY_SIZE
> > +#define	PCI_MSIX_ENTRY_SIZE	16
> > +#endif
> > +#ifndef PCI_STATUS_INTERRUPT
> > +#define	PCI_STATUS_INTERRUPT	0x08
> > +#endif
> > +
> > +struct vfio_dma_map;
> > +void vfio_dma_unmapall(struct vfio_listener *);
> > +int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *);
> > +int vfio_dma_map_common(struct vfio_listener *, unsigned int,
> > +			struct vfio_dma_map *);
> > +int vfio_domain_set(struct vfio_dev *, int, int);
> > +int vfio_domain_unset(struct vfio_dev *);
> > +
> > +int vfio_class_init(void);
> > +void vfio_class_destroy(void);
> > +int vfio_dev_add_attributes(struct vfio_dev *);
> > +int vfio_build_config_map(struct vfio_dev *);
> > +
> > +int vfio_nl_init(void);
> > +void vfio_nl_freeclients(struct vfio_dev *);
> > +void vfio_nl_exit(void);
> > +int vfio_nl_remove(struct vfio_dev *);
> > +int vfio_validate(struct vfio_dev *);
> > +int vfio_nl_upcall(struct vfio_dev *, u8, int, int);
> > +void vfio_pm_process_reply(int);
> > +pci_ers_result_t vfio_error_detected(struct pci_dev *,
> > +			pci_channel_state_t);
> > +pci_ers_result_t vfio_mmio_enabled(struct pci_dev *);
> > +pci_ers_result_t vfio_link_reset(struct pci_dev *);
> > +pci_ers_result_t vfio_slot_reset(struct pci_dev *);
> > +void vfio_error_resume(struct pci_dev *);
> > +#define VFIO_ERROR_REPLY_TIMEOUT	(3*HZ)
> > +#define VFIO_SUSPEND_REPLY_TIMEOUT	(5*HZ)
> > +
> > +irqreturn_t vfio_interrupt(int, void *);
> > +
> > +#endif	/* __KERNEL__ */
> > +
> > +/* Kernel & User level defines for ioctls */
> > +
> > +/*
> > + * Structure for DMA mapping of user buffers
> > + * vaddr, dmaaddr, and size must all be page aligned
> > + * buffer may only be larger than 1 page if (a) there is
> > + * an iommu in the system, or (b) buffer is part of a huge page
> > + */
> > +struct vfio_dma_map {
> > +	__u64	vaddr;		/* process virtual addr */
> > +	__u64	dmaaddr;	/* desired and/or returned dma address */
> > +	__u64	size;		/* size in bytes */
> > +	__u64	flags;		/* bool: 0 for r/o; 1 for r/w */
> > +#define	VFIO_FLAG_WRITE		0x1	/* req writeable DMA mem */
> > +};
> > +
> > +/* map user pages at specific dma address */
> > +/* requires previous VFIO_DOMAIN_SET */
> > +#define	VFIO_DMA_MAP_IOVA	_IOWR(';', 101, struct vfio_dma_map)
> > +
> > +/* unmap user pages */
> > +#define	VFIO_DMA_UNMAP		_IOW(';', 102, struct vfio_dma_map)
> > +
> > +/* request IRQ interrupts; use given eventfd */
> > +#define	VFIO_EVENTFD_IRQ	_IOW(';', 103, int)
> > +
> > +/* Request MSI interrupts: arg[0] is #, arg[1-n] are eventfds */
> > +#define	VFIO_EVENTFDS_MSI	_IOW(';', 104, int)
> > +
> > +/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */
> > +#define	VFIO_EVENTFDS_MSIX	_IOW(';', 105, int)
> > +
> > +/* Get length of a BAR */
> > +#define	VFIO_BAR_LEN		_IOWR(';', 167, __u32)
> > +
> > +/* Set the IOMMU domain - arg is fd from uiommu driver */
> > +#define	VFIO_DOMAIN_SET		_IOW(';', 107, int)
> > +
> > +/* Unset the IOMMU domain */
> > +#define	VFIO_DOMAIN_UNSET	_IO(';', 108)
> > +
> > +/*
> > + * Reads, writes, and mmaps determine which PCI BAR (or config space)
> > + * from the high level bits of the file offset
> > + */
> > +#define	VFIO_PCI_BAR0_RESOURCE		0x0
> > +#define	VFIO_PCI_BAR1_RESOURCE		0x1
> > +#define	VFIO_PCI_BAR2_RESOURCE		0x2
> > +#define	VFIO_PCI_BAR3_RESOURCE		0x3
> > +#define	VFIO_PCI_BAR4_RESOURCE		0x4
> > +#define	VFIO_PCI_BAR5_RESOURCE		0x5
> > +#define	VFIO_PCI_ROM_RESOURCE		0x6
> > +#define	VFIO_PCI_CONFIG_RESOURCE	0xF
> > +#define	VFIO_PCI_SPACE_SHIFT	32
> > +#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE)
> > +
> > +static inline int vfio_offset_to_pci_space(__u64 off)
> > +{
> > +	return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF;
> > +}
> > +
> > +static inline __u32 vfio_offset_to_pci_offset(__u64 off)
> > +{
> > +	return off & (__u32)0xFFFFFFFF;
> 
> You don't really need the cast, do you?
> 
> > +}
> > +
> > +static inline __u64 vfio_pci_space_to_offset(int sp)
> > +{
> > +	return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT;
> > +}
> > +
> 
> Is this ever used besides VFIO_PCI_CONFIG_OFF?
> If not it's likely an overkill.
> If yes note that sp will get sign extended when cast.
It can also be used when accessing the different BAR areas.

> 
> > +/*
> > + * Netlink defines:
> > + */
> > +#define VFIO_GENL_NAME	"VFIO"
> > +
> > +/* message types */
> > +enum {
> > +	VFIO_MSG_INVAL = 0,
> > +	/* kernel to user */
> > +	VFIO_MSG_REMOVE,		/* unbind, module or hotplug remove */
> > +	VFIO_MSG_ERROR_DETECTED,	/* pci err handling - error detected */
> > +	VFIO_MSG_MMIO_ENABLED,		/* pci err handling - mmio enabled */
> > +	VFIO_MSG_LINK_RESET,		/* pci err handling - link reset */
> > +	VFIO_MSG_SLOT_RESET,		/* pci err handling - slot reset */
> > +	VFIO_MSG_ERROR_RESUME,		/* pci err handling - resume normal */
> > +	VFIO_MSG_PM_SUSPEND,		/* suspend or hibernate notification */
> > +	VFIO_MSG_PM_RESUME,		/* resume after suspend or hibernate */
> > +	/* user to kernel */
> > +	VFIO_MSG_REGISTER,
> > +	VFIO_MSG_ERROR_HANDLING_REPLY,	/* err handling reply */
> > +	VFIO_MSG_PM_SUSPEND_REPLY,	/* suspend notify reply */
> > +};
> > +
> > +/* attributes */
> > +enum {
> > +	VFIO_ATTR_UNSPEC,
> > +	VFIO_ATTR_MSGCAP,	/* bitmask of messages desired */
> > +	VFIO_ATTR_PCI_DOMAIN,
> > +	VFIO_ATTR_PCI_BUS,
> > +	VFIO_ATTR_PCI_SLOT,
> > +	VFIO_ATTR_PCI_FUNC,
> > +	VFIO_ATTR_CHANNEL_STATE,
> > +	VFIO_ATTR_ERROR_HANDLING_REPLY,
> > +	VFIO_ATTR_PM_SUSPEND_REPLY,
> > +	__VFIO_NL_ATTR_MAX
> > +};
> > +#define VFIO_NL_ATTR_MAX (__VFIO_NL_ATTR_MAX - 1)
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html