> -----Original Message----- > From: linux-kernel-owner@xxxxxxxxxxxxxxx [mailto:linux-kernel- > owner@xxxxxxxxxxxxxxx] On Behalf Of Alex Williamson > Sent: Tuesday, November 20, 2012 11:50 PM > To: Alexey Kardashevskiy > Cc: Benjamin Herrenschmidt; Paul Mackerras; linuxppc- > dev@xxxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; > David Gibson > Subject: Re: [PATCH] vfio powerpc: enabled and supported on powernv > platform > > On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote: > > VFIO implements platform independent stuff such as a PCI driver, BAR > > access (via read/write on a file descriptor or direct mapping when > > possible) and IRQ signaling. > > The platform dependent part includes IOMMU initialization and > > handling. > > > > This patch initializes IOMMU groups based on the IOMMU configuration > > discovered during the PCI scan, only POWERNV platform is supported at > > the moment. > > > > Also the patch implements an VFIO-IOMMU driver which manages DMA > > mapping/unmapping requests coming from the client (now QEMU). It also > > returns a DMA window information to let the guest initialize the > > device tree for a guest OS properly. Although this driver has been > > tested only on POWERNV, it should work on any platform supporting TCE > > tables. > > > > To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option. > > > > Cc: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > > Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> > > --- > > arch/powerpc/include/asm/iommu.h | 6 + > > arch/powerpc/kernel/iommu.c | 140 +++++++++++++++++++ > > arch/powerpc/platforms/powernv/pci.c | 135 +++++++++++++++++++ > > drivers/iommu/Kconfig | 8 ++ > > drivers/vfio/Kconfig | 6 + > > drivers/vfio/Makefile | 1 + > > drivers/vfio/vfio_iommu_spapr_tce.c | 247 > ++++++++++++++++++++++++++++++++++ > > include/linux/vfio.h | 20 +++ > > 8 files changed, 563 insertions(+) > > create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c > > > > diff --git a/arch/powerpc/include/asm/iommu.h > > b/arch/powerpc/include/asm/iommu.h > > index cbfe678..5ba66cb 100644 > > --- a/arch/powerpc/include/asm/iommu.h > > +++ b/arch/powerpc/include/asm/iommu.h > > @@ -64,30 +64,33 @@ struct iommu_pool { } > > ____cacheline_aligned_in_smp; > > > > struct iommu_table { > > unsigned long it_busno; /* Bus number this table belongs to */ > > unsigned long it_size; /* Size of iommu table in entries */ > > unsigned long it_offset; /* Offset into global table */ > > unsigned long it_base; /* mapped address of tce table */ > > unsigned long it_index; /* which iommu table this is */ > > unsigned long it_type; /* type: PCI or Virtual Bus */ > > unsigned long it_blocksize; /* Entries in each block (cacheline) > */ > > unsigned long poolsize; > > unsigned long nr_pools; > > struct iommu_pool large_pool; > > struct iommu_pool pools[IOMMU_NR_POOLS]; > > unsigned long *it_map; /* A simple allocation bitmap for now > */ > > +#ifdef CONFIG_IOMMU_API > > + struct iommu_group *it_group; > > +#endif > > }; > > > > struct scatterlist; > > > > static inline void set_iommu_table_base(struct device *dev, void > > *base) { > > dev->archdata.dma_data.iommu_table_base = base; } > > > > static inline void *get_iommu_table_base(struct device *dev) { > > return dev->archdata.dma_data.iommu_table_base; > > } > > > > /* Frees table for an individual device node */ @@ -135,17 +138,20 @@ > > static inline void pci_iommu_init(void) { } extern void > > alloc_dart_table(void); #if defined(CONFIG_PPC64) && > > defined(CONFIG_PM) static inline void iommu_save(void) { > > if (ppc_md.iommu_save) > > ppc_md.iommu_save(); > > } > > > > static inline void iommu_restore(void) { > > if (ppc_md.iommu_restore) > > ppc_md.iommu_restore(); > > } > > #endif > > > > +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long > entry, uint64_t tce, > > + enum dma_data_direction direction, unsigned long pages); > > + > > #endif /* __KERNEL__ */ > > #endif /* _ASM_IOMMU_H */ > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c > > index ff5a6ce..94f614b 100644 > > --- a/arch/powerpc/kernel/iommu.c > > +++ b/arch/powerpc/kernel/iommu.c > > @@ -32,30 +32,31 @@ > > #include <linux/dma-mapping.h> > > #include <linux/bitmap.h> > > #include <linux/iommu-helper.h> > > #include <linux/crash_dump.h> > > #include <linux/hash.h> > > #include <linux/fault-inject.h> > > #include <linux/pci.h> > > #include <asm/io.h> > > #include <asm/prom.h> > > #include <asm/iommu.h> > > #include <asm/pci-bridge.h> > > #include <asm/machdep.h> > > #include <asm/kdump.h> > > #include <asm/fadump.h> > > #include <asm/vio.h> > > +#include <asm/tce.h> > > > > #define DBG(...) > > > > static int novmerge; > > > > static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned > > int); > > > > static int __init setup_iommu(char *str) { > > if (!strcmp(str, "novmerge")) > > novmerge = 1; > > else if (!strcmp(str, "vmerge")) > > novmerge = 0; > > return 1; > > } > > @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, > > struct iommu_table *tbl, } > > > > void iommu_free_coherent(struct iommu_table *tbl, size_t size, > > void *vaddr, dma_addr_t dma_handle) { > > if (tbl) { > > unsigned int nio_pages; > > > > size = PAGE_ALIGN(size); > > nio_pages = size >> IOMMU_PAGE_SHIFT; > > iommu_free(tbl, dma_handle, nio_pages); > > size = PAGE_ALIGN(size); > > free_pages((unsigned long)vaddr, get_order(size)); > > } > > } > > + > > +#ifdef CONFIG_IOMMU_API > > +/* > > + * SPAPR TCE API > > + */ > > +static struct page *free_tce(struct iommu_table *tbl, unsigned long > > +entry) { > > + struct page *page = NULL; > > NULL initialization doesn't appear to be necessary > > > + unsigned long oldtce; > > + > > + oldtce = ppc_md.tce_get(tbl, entry); > > + > > + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) > > + return NULL; > > + > > + page = pfn_to_page(oldtce >> PAGE_SHIFT); > > + > > + WARN_ON(!page); > > + if (page && (oldtce & TCE_PCI_WRITE)) > > + SetPageDirty(page); > > + ppc_md.tce_free(tbl, entry, 1); > > + > > + return page; > > +} > > + > > +static int put_tce(struct iommu_table *tbl, unsigned long entry, > > + uint64_t tce, enum dma_data_direction direction) { > > + int ret; > > + struct page *page = NULL; > > + unsigned long kva, offset; > > + > > + /* Map new TCE */ > > + offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK); > > + ret = get_user_pages_fast(tce & PAGE_MASK, 1, > > + direction != DMA_TO_DEVICE, &page); > > + if (ret < 1) { > > + printk(KERN_ERR "tce_vfio: get_user_pages_fast failed > tce=%llx ioba=%lx ret=%d\n", > > + tce, entry << IOMMU_PAGE_SHIFT, ret); > > + if (!ret) > > + ret = -EFAULT; > > Missing return ret? Otherwise we've got some bogus uses of page below > and we're setting ret for no reason here. > > > + } > > + > > + kva = (unsigned long) page_address(page); > > + kva += offset; > > + > > + /* tce_build receives a virtual address */ > > + entry += tbl->it_offset; /* Offset into real TCE table */ > > + ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL); > > + > > + /* tce_build() only returns non-zero for transient errors */ > > + if (unlikely(ret)) { > > + printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx > ioba=%lx kva=%lx ret=%d\n", > > + tce, entry << IOMMU_PAGE_SHIFT, kva, ret); > > + put_page(page); > > + return -EIO; > > + } > > + > > + return 0; > > +} > > + > > +static void tce_flush(struct iommu_table *tbl) { > > + /* Flush/invalidate TLB caches if necessary */ > > + if (ppc_md.tce_flush) > > + ppc_md.tce_flush(tbl); > > + > > + /* Make sure updates are seen by hardware */ > > + mb(); > > +} > > + > > +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, > uint64_t tce, > > + enum dma_data_direction direction, unsigned long pages) { > > + int i, ret = 0, pages_to_put = 0; > > + struct page *page; > > + struct iommu_pool *pool = get_pool(tbl, entry); > > + struct page **oldpages; > > + const int oldpagesnum = PAGE_SIZE/sizeof(*oldpages); > > + > > + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE); > > + > > + /* Handle a single page request without allocation > > + of pages-to-release array */ > > + if (pages == 1) { > > + spin_lock(&(pool->lock)); > > + page = free_tce(tbl, entry); > > + > > + if (direction != DMA_NONE) > > + ret = put_tce(tbl, entry, tce, direction); > > + > > + tce_flush(tbl); > > + > > + if (page) > > + put_page(page); > > + > > + spin_unlock(&(pool->lock)); > > + return ret; > > + } > > + > > + /* Releasing multiple pages */ > > + /* Allocate an array for pages to be released after TCE table > > + is updated */ > > + oldpages = kmalloc(PAGE_SIZE, GFP_KERNEL); > > + if (!oldpages) > > + return -ENOMEM; > > + > > + spin_lock(&(pool->lock)); > > + > > + for (i = 0; (i < pages) && !ret; ++i, ++entry, tce += > IOMMU_PAGE_SIZE) { > > + page = free_tce(tbl, entry); > > + if (page) { > > + oldpages[pages_to_put] = page; > > + ++pages_to_put; > > + } > > + > > + if (direction != DMA_NONE) > > + ret = put_tce(tbl, entry, tce, direction); > > + > > + /* Release old pages if we reached the end of oldpages[] or > > + it is the last page or we are about to exit the loop */ > > + if ((pages_to_put == oldpagesnum) || (i == pages - 1) || ret) > { > > + tce_flush(tbl); > > Avoiding tce_flush() is the reason for all this extra overhead, right? > I wonder if it'd be cleaner separating map vs unmap, where the map case > can avoid the oldpages array... but that means inserting new mappings on > top of old ones wouldn't put the pages. > > > + > > + /* Release pages after removing them from TCE table */ > > + while (pages_to_put) { > > + --pages_to_put; > > + put_page(oldpages[pages_to_put]); > > + } > > + } > > + } > > + > > + spin_unlock(&(pool->lock)); > > + kfree(oldpages); > > + > > + return ret; > > +} > > +EXPORT_SYMBOL_GPL(iommu_put_tces); > > +#endif /* CONFIG_IOMMU_API */ > > diff --git a/arch/powerpc/platforms/powernv/pci.c > > b/arch/powerpc/platforms/powernv/pci.c > > index 05205cf..676f4d9 100644 > > --- a/arch/powerpc/platforms/powernv/pci.c > > +++ b/arch/powerpc/platforms/powernv/pci.c > > @@ -8,30 +8,31 @@ > > * This program is free software; you can redistribute it and/or > > * modify it under the terms of the GNU General Public License > > * as published by the Free Software Foundation; either version > > * 2 of the License, or (at your option) any later version. > > */ > > > > #include <linux/kernel.h> > > #include <linux/pci.h> > > #include <linux/delay.h> > > #include <linux/string.h> > > #include <linux/init.h> > > #include <linux/bootmem.h> > > #include <linux/irq.h> > > #include <linux/io.h> > > #include <linux/msi.h> > > +#include <linux/iommu.h> > > > > #include <asm/sections.h> > > #include <asm/io.h> > > #include <asm/prom.h> > > #include <asm/pci-bridge.h> > > #include <asm/machdep.h> > > #include <asm/ppc-pci.h> > > #include <asm/opal.h> > > #include <asm/iommu.h> > > #include <asm/tce.h> > > #include <asm/abs_addr.h> > > #include <asm/firmware.h> > > > > #include "powernv.h" > > #include "pci.h" > > @@ -601,15 +602,149 @@ void __init pnv_pci_init(void) > > /* Configure IOMMU DMA hooks */ > > ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup; > > ppc_md.tce_build = pnv_tce_build; > > ppc_md.tce_free = pnv_tce_free; > > ppc_md.tce_get = pnv_tce_get; > > ppc_md.pci_probe_mode = pnv_pci_probe_mode; > > set_pci_dma_ops(&dma_iommu_ops); > > > > /* Configure MSIs */ > > #ifdef CONFIG_PCI_MSI > > ppc_md.msi_check_device = pnv_msi_check_device; > > ppc_md.setup_msi_irqs = pnv_setup_msi_irqs; > > ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif } > > + > > +#ifdef CONFIG_IOMMU_API > > +/* > > + * IOMMU groups support required by VFIO */ static int > > +add_device(struct device *dev) { > > + struct iommu_table *tbl; > > + int ret = 0; > > + > > + if (WARN_ON(dev->iommu_group)) { > > + printk(KERN_WARNING "tce_vfio: device %s is already in iommu > group %d, skipping\n", > > + dev->kobj.name, > > dev_name(dev) > > > + iommu_group_id(dev->iommu_group)); > > + return -EBUSY; > > + } > > + > > + tbl = get_iommu_table_base(dev); > > + if (!tbl) { > > + pr_debug("tce_vfio: skipping device %s with no tbl\n", > > + dev->kobj.name); > > + return 0; > > + } > > + > > + pr_debug("tce_vfio: adding %s to iommu group %d\n", > > + dev->kobj.name, iommu_group_id(tbl->it_group)); > > + > > + ret = iommu_group_add_device(tbl->it_group, dev); > > + if (ret < 0) > > + printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n", > > + dev->kobj.name, ret); > > + > > + return ret; > > +} > > + > > +static void del_device(struct device *dev) { > > + iommu_group_remove_device(dev); > > +} > > + > > +static int iommu_bus_notifier(struct notifier_block *nb, > > + unsigned long action, void *data) { > > + struct device *dev = data; > > + > > + switch (action) { > > + case BUS_NOTIFY_ADD_DEVICE: > > + return add_device(dev); > > + case BUS_NOTIFY_DEL_DEVICE: > > + del_device(dev); > > + return 0; > > + default: > > + return 0; > > + } > > +} > > + > > +static struct notifier_block tce_iommu_bus_nb = { > > + .notifier_call = iommu_bus_notifier, }; > > + > > +static void group_release(void *iommu_data) { > > + struct iommu_table *tbl = iommu_data; > > + tbl->it_group = NULL; > > +} > > + > > +static int __init tce_iommu_init(void) { > > + struct pci_dev *pdev = NULL; > > + struct iommu_table *tbl; > > + struct iommu_group *grp; > > + > > + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); > > There's already a notifier in the iommu code if you were to register an > iommu_ops with the add/remove_device entries. That would allow you to > remove the notifier block and notifier function below and the second loop > below. Are you avoiding that to avoid the rest of iommu_ops? > [Sethi Varun-B16395] Could be one reason, also they are associating the iommu group with the tce table entry and not the device. > Also, shouldn't this notifier only be registered after the first loop > below? Otherwise ADD_DEVICE could race with setting up groups, which we > assume are present in the add_device() above. [Sethi Varun-B16395] Isn't this similar to how how the notifier is registered in iommu_bus_init? First a notifier is registered and then we check for devices that have already been probed. -Varun ��.n��������+%������w��{.n�����o�^n�r������&��z�ޗ�zf���h���~����������_��+v���)ߣ�