The idea of this patch is to demonstrate what POWER needs to support VFIO.
It adds this support on POWER, which includes:

1) An IOMMU API driver for POWER. It also includes
subsys_initcall_sync(power_pci_iommu_init), which walks through all PCI
devices, creates IOMMU groups and adds the devices to these groups.

2) A prototype for an additional IOMMU API call, tce_iommu_get_dma_window.
We need to tell the POWER guest a DMA window location and size, so I tried
to add this to struct iommu_ops:

static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
		phys_addr_t *start_address, size_t *allocated_size)

The idea is that it returns the 32-bit DMA window for index==0 and the
64-bit DMA window for index==1. This is what we need now.

However, I noticed a move to implement an IOMMU chardev for every platform
separately (in the kernel: drivers/vfio/vfio_iommu_x86.c). I.e. QEMU does
an ioctl to the host, this call gets into a _platform_specific_ IOMMU
chardev, then the chardev calls a _platform_independent_ IOMMU API function
(let's call it iommu_get_dma_window()), and this iommu_get_dma_window()
calls the _platform_specific_ tce_iommu_get_dma_window().
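To make the layering concrete, here is a minimal sketch of what that
platform-independent iommu_get_dma_window() could look like. This is an
assumption of the proposal, not existing code: neither this wrapper nor a
get_dma_window member of struct iommu_ops exists in the current IOMMU API.

int iommu_get_dma_window(struct iommu_domain *domain, int index,
		phys_addr_t *start_address, size_t *allocated_size)
{
	/* Proposed callback; does not exist in struct iommu_ops today */
	if (unlikely(domain->ops->get_dma_window == NULL))
		return -ENODEV;

	return domain->ops->get_dma_window(domain, index,
			start_address, allocated_size);
}

Only the callback itself would then be per-platform, mirroring how
iommu_map()/iommu_unmap() already dispatch through struct iommu_ops.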
Another example: the DMA map/unmap implementation on x86 is split between
two pieces of code - drivers/vfio/vfio_iommu_x86.c and
drivers/iommu/intel-iommu.c. And drivers/vfio/vfio_iommu_x86.c works
perfectly for POWER except for the DMA window setup, which I dropped for
now; I simply use a quite popular configuration on POWER (a 1GB DMA window
starting at 0x0).

As for me, it is too complicated. We do not need either:
- a platform-specific IOMMU chardev, or
- the IOMMU API at all.

What do I miss?

Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
---
 arch/powerpc/include/asm/iommu.h |    3 +
 arch/powerpc/kernel/iommu.c      |  302 ++++++++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig            |    2 +
 drivers/vfio/Kconfig             |    5 +-
 4 files changed, 310 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index edfc980..92aeb57 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
 	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
 	spinlock_t     it_lock;      /* Protects it_map */
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98..6e40870 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -39,6 +39,9 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
+#include <asm-generic/sizes.h>
+
+#include <linux/iommu.h>
 
 #define DBG(...)
 
@@ -677,3 +680,302 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+
+/*
+ * IOMMU API implementation.
+ *
+ * Note: only one domain per IOMMU group is enabled at the moment.
+ */
+struct tce_domain {
+	struct iommu_table *tbl;
+};
+
+static int tce_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct tce_domain *tcedom;
+
+	tcedom = kzalloc(sizeof(*tcedom), GFP_KERNEL);
+	if (!tcedom)
+		return -ENOMEM;
+
+	dom->priv = tcedom;
+	printk("TCE domain %p (IOMMU domain %p) initialized\n", tcedom, dom);
+
+	return 0;
+}
+
+static void tce_iommu_domain_destroy(struct iommu_domain *dom)
+{
+	struct tce_domain *tcedom = dom->priv;
+
+	printk("TCE domain %p (IOMMU domain %p) destroyed\n", tcedom, dom);
+	if (!tcedom)
+		return;
+
+	if (tcedom->tbl) {
+		/*
+		 * At the moment the kernel cleans the TCE table up before
+		 * use anyway, but it would be nice to clean it when it is
+		 * released as well.
+		 */
+		printk("TODO: clean the TCE table\n");
+	}
+
+	kfree(tcedom);
+
+	dom->priv = NULL;
+}
+
+static int tce_iommu_attach_device(struct iommu_domain *dom,
+		struct device *dev)
+{
+	struct tce_domain *tcedom = dom->priv;
+
+	if (!tcedom->tbl) {
+		tcedom->tbl = get_iommu_table_base(dev);
+		printk("TCE domain %p (IOMMU domain %p) - "
+				"device %p is first in a domain\n",
+				tcedom, dom, dev);
+	} else if (tcedom->tbl != get_iommu_table_base(dev)) {
+		printk("TCE domain %p (IOMMU domain %p) - "
+				"device %p NOT attached, wrong group\n",
+				tcedom, dom, dev);
+		return -EBUSY;
+	}
+
+	printk("TCE domain %p (IOMMU domain %p) - device %p attached\n",
+			tcedom, dom, dev);
+
+	return 0;
+}
+
+static void tce_iommu_detach_device(struct iommu_domain *dom,
+		struct device *dev)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+
+	printk("TCE domain %p (IOMMU domain %p) - device %p detached\n",
+			tcedom, dom, dev);
+	BUG_ON(tbl != get_iommu_table_base(dev));
+}
+
+static int tce_iommu_map(struct iommu_domain *dom, unsigned long iova,
+		phys_addr_t paddr, size_t size, int prot)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry, flags;
+	int build_fail;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	entry = iova >> IOMMU_PAGE_SHIFT;
+	build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+			(unsigned long)paddr & IOMMU_PAGE_MASK,
+			DMA_BIDIRECTIONAL, NULL/*attrs*/);
+
+	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * DMA_ERROR_CODE. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(build_fail)) {
+		printk("Failed to add TCE\n");
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return -EFAULT;
+	}
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return 0;
+}
+
+static size_t tce_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+		size_t size)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry, flags;
+
+	entry = iova >> IOMMU_PAGE_SHIFT;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	ppc_md.tce_free(tbl, entry, 1);
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return size;
+}
+
+static phys_addr_t tce_iommu_iova_to_phys(struct iommu_domain *dom,
+		unsigned long iova)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+	phys_addr_t ret = 0;
+
+	if (ppc_md.tce_get)
+		ret = ppc_md.tce_get(tbl, entry);
+
+	return ret;
+}
+
+static int tce_iommu_domain_has_cap(struct iommu_domain *dom,
+		unsigned long cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+	case IOMMU_CAP_INTR_REMAP:
+		/* FIXME: not sure if these are correct */
+		return 1;
+/*	case IOMMU_CAP_SETUP_REQUIRED:
+		return 1;*/
+	}
+
+	return 0;
+}
+
+#if 0
+static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
+		phys_addr_t *start_address, size_t *allocated_size)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+
+	if (!tbl) {
+		printk(KERN_ERR "tce_iommu: not initialized\n");
+		return -EFAULT;
+	}
+	if (!allocated_size || !start_address) {
+		printk(KERN_ERR "tce_iommu: invalid parameters\n");
+		return -EFAULT;
+	}
+	if (index > 0) {
+		printk(KERN_ERR "tce_iommu: %u is out of boundary\n", index);
+		return -EINVAL;
+	}
+	*start_address = tbl->it_offset << IOMMU_PAGE_SHIFT;
+	*allocated_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+
+	return 0;
+}
+#endif
+
+static int device_notifier(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	printk("device_notifier(%p) ", dev);
+	/*if (action == BUS_NOTIFY_ADD_DEVICE)
+		return add_iommu_group(dev, NULL);*/
+	switch (action) {
+#define __x(s) case s: printk("action=" #s " %u 0x%x\n", (s), (s)); break;
+	__x(BUS_NOTIFY_ADD_DEVICE);
+	__x(BUS_NOTIFY_DEL_DEVICE);
+	__x(BUS_NOTIFY_BIND_DRIVER);
+	__x(BUS_NOTIFY_BOUND_DRIVER);
+	__x(BUS_NOTIFY_UNBIND_DRIVER);
+	__x(BUS_NOTIFY_UNBOUND_DRIVER);
+	default: printk("action=%lu 0x%lx\n", action, action);
+	}
+	return 0;
+}
+
+static struct notifier_block device_nb = {
+	.notifier_call = device_notifier,
+};
+
+static int tce_iommu_add_device_dummy(struct device *dev)
+{
+	printk(KERN_ERR "%s: not implemented!\n", __func__);
+	return -EINVAL;
+}
+
+static void tce_iommu_remove_device_dummy(struct device *dev)
+{
+	printk(KERN_ERR "%s: not implemented!\n", __func__);
+}
+
+static struct iommu_ops tce_iommu_ops = {
+	.domain_init	= tce_iommu_domain_init,
+	.domain_destroy	= tce_iommu_domain_destroy,
+	.attach_dev	= tce_iommu_attach_device,
+	.detach_dev	= tce_iommu_detach_device,
+	.map		= tce_iommu_map,
+	.unmap		= tce_iommu_unmap,
+	.iova_to_phys	= tce_iommu_iova_to_phys,
+	.domain_has_cap	= tce_iommu_domain_has_cap,
+	.add_device	= tce_iommu_add_device_dummy,
+	.remove_device	= tce_iommu_remove_device_dummy,
+/*	.get_dma_window	= tce_iommu_get_dma_window,*/
+	.pgsize_bitmap	= SZ_4K /*| SZ_64K | SZ_1M | SZ_16M*/
+};
+
+/*
+ * Setup the IOMMU API.
+ */
+static int __init power_pci_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl = NULL;
+	int ret = 0;
+
+	bus_set_iommu(&pci_bus_type, &tce_iommu_ops);
+	bus_register_notifier(&pci_bus_type, &device_nb);
+
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (NULL == tbl) {
+			printk("Skipping device %s\n", pdev->dev.kobj.name);
+			continue;
+		}
+		if (!tbl->it_group) {
+			struct iommu_group *tmp = iommu_group_alloc();
+			if (IS_ERR(tmp)) {
+				printk("Failed to create new IOMMU group, "
+						"ret = %ld\n", PTR_ERR(tmp));
+				break;
+			}
+			tbl->it_group = tmp;
+		}
+
+		ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+		if (ret < 0)
+			printk("iommu_group_add_device(%s) failed with %d\n",
+					pdev->dev.kobj.name, ret);
+	}
+
+	return 0;
+}
+
+/*
+ * Must be initialized after subsys_initcall(iommu_init) and
+ * subsys_initcall(pcibios_init).
+ */
+subsys_initcall_sync(power_pci_iommu_init);
+
+#endif /* CONFIG_IOMMU_API */
+
+/* WORKAROUND */
+struct kvm;
+struct kvm_memory_slot;
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	return 0;
+}
+
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6bea696..885ebe1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -1,5 +1,7 @@
 # IOMMU_API always gets selected by whoever wants it.
 config IOMMU_API
+	bool "IOMMU API Support for PCI pass through"
+	default n
 	bool
 
 menuconfig IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 77b754c..5788194 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,6 +1,7 @@
 config VFIO_IOMMU_X86
-	tristate
-	depends on VFIO && X86
+	tristate "X86 IOMMU API"
+	depends on VFIO
+# && X86
 	default n
 
 menuconfig VFIO
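For completeness, below is a rough sketch - not part of the patch - of how
a consumer such as a platform-independent VFIO chardev could drive this
driver purely through the generic IOMMU API. iommu_get_dma_window() is the
proposed wrapper from above and does not exist yet; the function name and
flow here are illustrative only, everything else is the existing API:

static int example_map_one_page(struct device *dev, unsigned long pfn)
{
	struct iommu_domain *dom;
	phys_addr_t win_start;
	size_t win_size;
	int ret;

	dom = iommu_domain_alloc(&pci_bus_type);
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, dev);
	if (ret)
		goto free_domain;

	/* The proposed call: index 0 means the 32-bit DMA window */
	ret = iommu_get_dma_window(dom, 0, &win_start, &win_size);
	if (ret)
		goto detach;

	/* Map one IOMMU page at the bottom of the DMA window */
	ret = iommu_map(dom, win_start, (phys_addr_t)pfn << PAGE_SHIFT,
			IOMMU_PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);

detach:
	iommu_detach_device(dom, dev);
free_domain:
	iommu_domain_free(dom);
	return ret;
}

If something like this works, the only platform-specific piece the chardev
really needs is the DMA window lookup, which is the point of the proposal.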
--
Alexey