The patch introduces support of VFIO on POWER.

The patch consists of:

1. IOMMU driver for VFIO.
It does not use IOMMU API at all, instead it calls POWER
IOMMU API directly (ppc_md callbacks).

2. A piece of code (module_init) which creates IOMMU groups.
TBD: what is a better place for it?

The patch is made on top of
git://github.com/awilliam/linux-vfio.git iommu-group-vfio-20120523
(which is iommu-group-vfio-20120521 + some fixes)

Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
---
 arch/powerpc/Kconfig             |    6 +
 arch/powerpc/include/asm/iommu.h |    3 +
 arch/powerpc/kernel/Makefile     |    1 +
 arch/powerpc/kernel/iommu_vfio.c |  469 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 479 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kernel/iommu_vfio.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index feab3ba..13d12ac 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -319,6 +319,12 @@ config 8XX_MINIMAL_FPEMU
 config IOMMU_HELPER
 	def_bool PPC64
 
+config IOMMU_VFIO
+	select IOMMU_API
+	depends on PPC64
+	tristate "Enable IOMMU chardev to support user-space PCI"
+	default n
+
 config SWIOTLB
 	bool "SWIOTLB support"
 	default n
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f..c64bce7 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
 	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
 	spinlock_t     it_lock;      /* Protects it_map */
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f5808a3..7cfd68e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_RELOCATABLE_PPC32)	+= reloc_32.o
 
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
+obj-$(CONFIG_IOMMU_VFIO)	+= iommu_vfio.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 obj-$(CONFIG_MODULES)		+= ppc_ksyms.o
diff --git a/arch/powerpc/kernel/iommu_vfio.c b/arch/powerpc/kernel/iommu_vfio.c
new file mode 100644
index 0000000..68a93dd
--- /dev/null
+++ b/arch/powerpc/kernel/iommu_vfio.c
@@ -0,0 +1,469 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_x86.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@xxxxxxxxxx>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/iommu.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION	"0.1"
+#define DRIVER_AUTHOR	"aik@xxxxxxxxx"
+#define DRIVER_DESC	"POWER IOMMU chardev for VFIO"
+
+#define IOMMU_CHECK_EXTENSION	_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/* -------- API for POWERPC IOMMU -------- */
+
+#define POWERPC_IOMMU		2
+
+/* Argument of POWERPC_IOMMU_GET_INFO: describes the 32-bit DMA window. */
+struct tce_iommu_info {
+	__u32 argsz;
+	__u32 dma32_window_start;
+	__u32 dma32_window_size;
+};
+
+#define POWERPC_IOMMU_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/*
+ * Argument of POWERPC_IOMMU_MAP_DMA and POWERPC_IOMMU_UNMAP_DMA.
+ * On ppc64 there are 4 bytes of padding after argsz; the layout is
+ * user-visible, so the fields must not be reordered.
+ */
+struct tce_iommu_dma_map {
+	__u32 argsz;
+	__u64 va;
+	__u64 dmaaddr;
+};
+
+#define POWERPC_IOMMU_MAP_DMA	_IO(VFIO_TYPE, VFIO_BASE + 13)
+#define POWERPC_IOMMU_UNMAP_DMA	_IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/* ***************************************************************** */
+
+/* Per-open state: the TCE table of the single attached group, or NULL. */
+struct tce_iommu {
+	struct iommu_table *tbl;
+};
+
+/*
+ * Bind the TCE table stored as the group's iommudata to this instance.
+ * Returns -EBUSY if a group is already attached and -ENODEV if the
+ * group carries no table.
+ */
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_iommu *tceiommu = iommu_data;
+	struct iommu_table *tbl;
+
+	if (tceiommu->tbl) {
+		printk(KERN_ERR "Only one group per IOMMU instance is allowed\n");
+		return -EBUSY;
+	}
+
+	tbl = iommu_group_get_iommudata(iommu_group);
+	if (!tbl)
+		return -ENODEV;
+
+	tceiommu->tbl = tbl;
+
+	return 0;
+}
+
+/* Forget the table bound by tce_iommu_attach_group(). */
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_iommu *tceiommu = iommu_data;
+
+	if (!tceiommu->tbl) {
+		printk(KERN_ERR "IOMMU already released\n");
+		return;
+	}
+	/*
+	 * NOTE(review): TCEs installed through POWERPC_IOMMU_MAP_DMA are not
+	 * torn down here, so their pages stay pinned - confirm who cleans up.
+	 */
+	tceiommu->tbl = NULL;
+}
+
+/* Allocate the per-open state; @arg must be POWERPC_IOMMU. */
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_iommu *tceiommu;
+
+	if (arg != POWERPC_IOMMU)
+		return ERR_PTR(-EINVAL);
+
+	tceiommu = kzalloc(sizeof(*tceiommu), GFP_KERNEL);
+	if (!tceiommu)
+		return ERR_PTR(-ENOMEM);
+
+	return tceiommu;
+}
+
+/* Free the state allocated by tce_iommu_open(). */
+static void tce_iommu_release(void *iommu_data)
+{
+	kfree(iommu_data);
+}
+
+/*
+ * Check that the TCE addressed by @iova lies inside the window reported
+ * by POWERPC_IOMMU_GET_INFO, so user space cannot touch entries outside
+ * of @tbl.
+ */
+static bool tce_iommu_iova_valid(struct iommu_table *tbl, uint64_t iova)
+{
+	uint64_t entry = iova >> IOMMU_PAGE_SHIFT;
+
+	return entry >= tbl->it_offset &&
+		entry - tbl->it_offset < tbl->it_size;
+}
+
+/*
+ * Install a single bidirectional TCE at @iova.  Despite its name, @paddr
+ * is the kernel virtual address of the backing memory (the caller passes
+ * a page_address() result), which is what ppc_md.tce_build() receives.
+ * Returns 0 on success or -EFAULT if ppc_md.tce_build() fails.
+ */
+static int tce_iommu_map(struct iommu_table *tbl, unsigned long iova,
+		phys_addr_t paddr)
+{
+	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+	unsigned long flags;
+	int build_fail;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+			(unsigned long)paddr & IOMMU_PAGE_MASK,
+			DMA_BIDIRECTIONAL, NULL/*attrs*/);
+	if (unlikely(build_fail)) {
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		printk(KERN_ERR "Failed to add TCE\n");
+		return -EFAULT;
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return 0;
+}
+
+/* Clear the TCE at @iova and flush the platform's TCE cache, if any. */
+static void tce_iommu_unmap(struct iommu_table *tbl, unsigned long iova)
+{
+	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+	unsigned long flags;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	ppc_md.tce_free(tbl, entry, 1);
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * Read back the raw TCE at @iova.  Returns 0 if the platform has no
+ * tce_get() hook; the callers treat 0 as "not mapped".
+ */
+static phys_addr_t tce_iommu_iova_to_va(struct iommu_table *tbl,
+		unsigned long iova)
+{
+	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+
+	if (!ppc_md.tce_get)
+		return 0;
+
+	return ppc_md.tce_get(tbl, entry);
+}
+
+/*
+ * Translate an address read from a TCE into its struct page, or NULL if
+ * it does not refer to a valid page frame.
+ */
+static struct page *tceaddr_to_page(void *addr)
+{
+	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	return pfn_to_page(pfn);
+}
+
+/*
+ * Pin the user page backing @va and map it for DMA at @dmaaddr.
+ * The page stays pinned until tce_dmaunmap_page() releases it.
+ * Returns 0 on success, -EINVAL if @dmaaddr is outside the DMA window,
+ * -EBUSY if a TCE is already present there and -EFAULT otherwise.
+ */
+static long tce_dmamap_page(struct iommu_table *tbl,
+		uint64_t va, uint64_t dmaaddr)
+{
+	int ret;
+	phys_addr_t addr;
+	struct page *page[1];
+	void *kva;
+
+	if (NULL == tbl) {
+		printk(KERN_ERR"tce_iommu: (map) IOMMU table has not "
+				"been initialized yet!\n");
+		return -EFAULT;
+	}
+	if (!tce_iommu_iova_valid(tbl, dmaaddr))
+		return -EINVAL;
+
+	/*
+	 * Overwriting a live TCE would leak the page pinned for it, so
+	 * refuse and let the caller unmap first.
+	 */
+	addr = tce_iommu_iova_to_va(tbl, dmaaddr);
+	if (addr) {
+		printk(KERN_WARNING"tce_iommu: already mapped va=%llx "
+				"da=%llx addr=%llx\n",
+				va, dmaaddr, addr);
+		return -EBUSY;
+	}
+
+	ret = get_user_pages_fast(va, 1, 1/*write*/, page);
+	if (1 != ret) {
+		printk(KERN_ERR"tce_iommu: get_user_pages_fast failed "
+				"va=%llx da=%llx addr=%llx ret=%d\n",
+				va, dmaaddr, addr, ret);
+		return -EFAULT;
+	}
+
+	ret = -EFAULT;
+	kva = page_address(page[0]);
+	if (kva) {
+		/* Keep the offset: a system page may span several TCEs */
+		kva += va & ~PAGE_MASK;
+		ret = tce_iommu_map(tbl, dmaaddr, (unsigned long) kva);
+	}
+	if (ret) {
+		printk(KERN_ERR"tce_iommu: tce_iommu_map va=%llx "
+				"da=%llx kva=%p\n",
+				va, dmaaddr, kva);
+		/* Nothing was mapped, so just drop the pin */
+		put_page(page[0]);
+	}
+
+	return ret;
+}
+
+/*
+ * Remove the TCE at @dmaaddr and unpin the page it referred to, marking
+ * the page dirty because the device may have written to it.
+ * Returns 0 on success, -EINVAL if @dmaaddr is outside the DMA window
+ * and -EFAULT if the TCE does not point at a valid page.
+ */
+static long tce_dmaunmap_page(struct iommu_table *tbl, uint64_t dmaaddr)
+{
+	int ret = 0;
+	phys_addr_t addr;
+	struct page *page;
+
+	if (NULL == tbl) {
+		printk(KERN_ERR"tce_iommu: (unmap) IOMMU table has not been "
+				"initialized yet!\n");
+		return -EFAULT;
+	}
+	if (!tce_iommu_iova_valid(tbl, dmaaddr))
+		return -EINVAL;
+
+	addr = tce_iommu_iova_to_va(tbl, dmaaddr);
+	if (addr) {
+		page = tceaddr_to_page((void *)(unsigned long)addr);
+		if (!page) {
+			printk(KERN_ERR"DMAUNMAP error: pfn_to_page(%llx) "
+					"failed\n", addr);
+			ret = -EFAULT;
+		} else {
+			SetPageDirty(page);
+			put_page(page);
+		}
+	}
+	tce_iommu_unmap(tbl, dmaaddr);
+	if (ret)
+		printk(KERN_ERR"Failed to DMAUNMAP: da=%llx pfn=%llx\n",
+				dmaaddr, addr);
+	return ret;
+}
+
+/*
+ * ioctl entry point of the VFIO IOMMU driver: extension check, DMA
+ * window query and single-page map/unmap.
+ */
+static long tce_iommu_ioctl(void *iommu_data,
+		unsigned int cmd, unsigned long arg)
+{
+	struct tce_iommu *tceiommu = iommu_data;
+	unsigned long minsz;
+
+	if (cmd == IOMMU_CHECK_EXTENSION) {
+		switch (arg) {
+		case POWERPC_IOMMU:
+			return 1;
+		default:
+			return 0;
+		}
+	} else if (cmd == POWERPC_IOMMU_GET_INFO) {
+		struct tce_iommu_info info;
+
+		minsz = offsetofend(struct tce_iommu_info, dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		/* No group attached yet, so there is no window to report */
+		if (!tceiommu->tbl)
+			return -EFAULT;
+
+		info.dma32_window_start =
+			tceiommu->tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size =
+			tceiommu->tbl->it_size << IOMMU_PAGE_SHIFT;
+
+		/* copy_to_user() returns bytes left, not an errno */
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+
+	} else if (cmd == POWERPC_IOMMU_MAP_DMA) {
+		struct tce_iommu_dma_map map;
+
+		minsz = offsetofend(struct tce_iommu_dma_map, dmaaddr);
+
+		if (copy_from_user(&map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (map.argsz < minsz)
+			return -EINVAL;
+
+		return tce_dmamap_page(tceiommu->tbl, map.va, map.dmaaddr);
+
+	} else if (cmd == POWERPC_IOMMU_UNMAP_DMA) {
+		struct tce_iommu_dma_map unmap;
+
+		minsz = offsetofend(struct tce_iommu_dma_map, dmaaddr);
+
+		if (copy_from_user(&unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (unmap.argsz < minsz)
+			return -EINVAL;
+
+		return tce_dmaunmap_page(tceiommu->tbl, unmap.dmaaddr);
+	}
+
+	return -ENOTTY;
+}
+
+static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "vfio-iommu-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+/*
+ * Create one IOMMU group per TCE table, put every PCI device that has
+ * a table into the matching group, then register the driver with VFIO.
+ */
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+	int ret;
+
+	/* TODO: Do this for all devices, not just for PCI */
+	for_each_pci_dev(pdev) {
+
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (NULL == tbl) {
+			printk(KERN_INFO "Skipping device %s\n",
+					dev_name(&pdev->dev));
+			continue;
+		}
+		if (!tbl->it_group) {
+			grp = iommu_group_alloc();
+			if (IS_ERR(grp)) {
+				printk(KERN_ERR "Failed to create new IOMMU "
+						"group, ret = %ld\n",
+						PTR_ERR(grp));
+				/* The iterator holds a reference on pdev */
+				pci_dev_put(pdev);
+				break;
+			}
+			tbl->it_group = grp;
+			iommu_group_set_iommudata(grp, tbl, NULL);
+		}
+
+		grp = iommu_group_get(&pdev->dev);
+		if (grp) {
+			/* Already grouped; drop the reference just taken */
+			iommu_group_put(grp);
+			continue;
+		}
+
+		ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+		if (ret < 0)
+			printk(KERN_ERR "iommu_group_add_device(%s) failed "
+					"with %d\n",
+					dev_name(&pdev->dev), ret);
+	}
+
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
-- 
1.7.7.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html