Re: [Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power

Alex Williamson <alex.williamson@xxxxxxxxxx> · Sun, 13 May 2012 22:04:47 -0600

On Sat, 2012-05-12 at 17:31 +1000, Alexey Kardashevskiy wrote:
> The idea of the patch is to demonstrate what POWER needs to support VFIO.
> 
> Added support on POWER. Than includes:
> 
> 1) IOMMU API driver for POWER.
> It also includes subsys_initcall_sync(power_pci_iommu_init) which walks through all
> PCI devices and creates IOMMU groups and adds devices to these groups.
> 
> 2) Prototype for an additional IOMMU API call tce_iommu_get_dma_window.
> 
> We need to tell the POWER guest a DMA window location and size. So,
> I tried to add this to struct iommu_ops:
> 
> static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
>                         phys_addr_t *start_address, size_t *allocated_size)
> 
> The idea is that it returns 32-bit DMA window for index==0 and 64-bit
> DMA window for index==1. This is what we need now.
> 
> However I noticed a move to implement IOMMU chardev for every platform separately
> (kernel: drivers/vfio/vfio_iommu_x86.c).
> 
> I. e. QEMU does ioctl to the host, this call gets into a _platform_specific_ IOMMU chardev,
> then the chardev calls a _platform_independend_ IOMMU API function (lets call it
> iommu_get_dma_window()), and this iommu_get_dma_window() calls a _platform_specific_
> tce_iommu_get_dma_window().
> 
> 
> Another example, DMA map/unmap implementation on X86 is split between 2 pieces of code -
> drivers/vfio/vfio_iommu_x86.c and drivers/iommu/intel-iommu.c.

or amd_iommu.c

> And drivers/vfio/vfio_iommu_x86.c works perfect for POWER except a DMA window setup
> which I dropped for now and simply use quite popular configuration on power (1Gb DMA window starting
> from 0x0).
> 
> As for me, it is too complicated. We do not need either
> - platform specific IOMMU chardev  or
> - IOMMU API at all
> 
> What do I miss?

In an ideal world, I'd like to share as much IOMMU infrastructure as
possible.  Unfortunately x86 is the main driver of the IOMMU API and
other architectures require concepts that don't necessarily make sense
for everybody else.  vfio_iommu_x86 may work for you on POWER, but aiui,
you don't need all the page pinning, you need to optimize for streaming
DMA vs static mappings, and an ioctl may not be the preferred interface
for high frequency mapping and unmapping.  "x86" is admittedly a
terrible name for the file, but the usage is for how x86 currently makes
use of the iommu
(vfio_iommu_static_pinned_page_tables_with_no_mapping_restrictions.c
made me vomit a little and probably still doesn't specify it
sufficiently).

BenH actually suggested making the IOMMU more modular and focusing on
platform specific versions.

> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
> ---
>  arch/powerpc/include/asm/iommu.h |    3 +
>  arch/powerpc/kernel/iommu.c      |  302 ++++++++++++++++++++++++++++++++++++++
>  drivers/iommu/Kconfig            |    2 +
>  drivers/vfio/Kconfig             |    5 +-
>  4 files changed, 310 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index edfc980..92aeb57 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -66,6 +66,9 @@ struct iommu_table {
>  	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
>  	spinlock_t     it_lock;      /* Protects it_map */
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
> 
>  struct scatterlist;
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0cfcf98..6e40870 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -39,6 +39,9 @@
>  #include <asm/pci-bridge.h>
>  #include <asm/machdep.h>
>  #include <asm/kdump.h>
> +#include <asm-generic/sizes.h>
> +
> +#include <linux/iommu.h>
> 
>  #define DBG(...)
> 
> @@ -677,3 +680,302 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +
> +/*
> + * IOMMU API implementation.
> + *
> + * Note: only one domain per IOMMU group is enabled at the moment.
> + *
> + */
> +struct tce_domain {
> +	struct iommu_table *tbl;
> +};
> +
> +static int tce_iommu_domain_init(struct iommu_domain *dom)
> +{
> +	struct tce_domain *tcedom;
> +
> +	tcedom = kzalloc(sizeof(*tcedom), GFP_KERNEL);
> +	if (!tcedom)
> +		return -ENOMEM;
> +
> +	dom->priv = tcedom;
> +	printk("TCE Domain %p (IOMMU tcedom %p) initialized\n", tcedom, dom);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_domain_destroy(struct iommu_domain *dom)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +
> +	printk("TCE Domain %p (IOMMU tcedom %p) destroyed\n", tcedom, dom);
> +	if (!tcedom)
> +		return;
> +
> +	if (tcedom->tbl) {
> +		/*
> +		 * At the moment the kernel cleans the TCE table up before use
> +		 * anyway but it would be just nice to clean when it is just
> +		 * released.
> +		 */
> +		printk("TODO: clean the TCE table\n");
> +	}
> +
> +	kfree(tcedom);
> +
> +	dom->priv = NULL;
> +}
> +
> +static int tce_iommu_attach_device(struct iommu_domain *dom,
> +				   struct device *dev)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +
> +	if (!tcedom->tbl) {
> +		tcedom->tbl = get_iommu_table_base(dev);
> +		printk("TCE Domain %p (IOMMU tcedom %p) - "
> +				"device %p is first in a domain\n",
> +				tcedom, dom, dev);
> +	} else if (tcedom->tbl != get_iommu_table_base(dev)) {
> +		printk("TCE Domain %p (IOMMU tcedom %p) - "
> +				"device %p NOT attached, wrong group\n",
> +				tcedom, dom, dev);
> +		return -EBUSY;
> +	}
> +
> +	printk("TCE Domain %p (IOMMU tcedom %p) - device %p attached\n",
> +			tcedom, dom, dev);
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_detach_device(struct iommu_domain *dom,
> +				    struct device *dev)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +	struct iommu_table *tbl = tcedom->tbl;
> +
> +	printk("TCE Domain %p (IOMMU tcedom %p) - device %p DEtached\n",
> +			tcedom, dom, dev);
> +	BUG_ON(tbl !=get_iommu_table_base(dev));
> +}
> +
> +static int tce_iommu_map(struct iommu_domain *dom, unsigned long iova,
> +		   phys_addr_t paddr, size_t size, int prot)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +	struct iommu_table *tbl = tcedom->tbl;
> +	unsigned long entry, flags;
> +	int build_fail;
> +
> +	spin_lock_irqsave(&(tbl->it_lock), flags);
> +	entry = iova >> IOMMU_PAGE_SHIFT;
> +	build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
> +			(unsigned long)paddr & IOMMU_PAGE_MASK,
> +			DMA_BIDIRECTIONAL, NULL/*attrs*/);
> +
> +	/* ppc_md.tce_build() only returns non-zero for transient errors.
> +	 * Clean up the table bitmap in this case and return
> +	 * DMA_ERROR_CODE. For all other errors the functionality is
> +	 * not altered.
> +	 */
> +	if (unlikely(build_fail)) {
> +		printk("Failed to add TCE\n");
> +		spin_unlock_irqrestore(&(tbl->it_lock), flags);
> +		return -EFAULT;
> +	}
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	spin_unlock_irqrestore(&(tbl->it_lock), flags);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +
> +	return 0;
> +}
> +
> +static size_t tce_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
> +		     size_t size)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +	struct iommu_table *tbl = tcedom->tbl;
> +	unsigned long entry, flags;
> +	entry = iova >> IOMMU_PAGE_SHIFT;
> +
> +	spin_lock_irqsave(&(tbl->it_lock), flags);
> +	ppc_md.tce_free(tbl, entry, 1);
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	spin_unlock_irqrestore(&(tbl->it_lock), flags);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +
> +	return size;
> +}
> +
> +static phys_addr_t tce_iommu_iova_to_phys(struct iommu_domain *dom,
> +		unsigned long iova)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +	struct iommu_table *tbl = tcedom->tbl;
> +	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
> +	phys_addr_t ret = 0;
> +
> +	if (ppc_md.tce_get)
> +		ret = ppc_md.tce_get(tbl, entry);
> +
> +	return ret;
> +}
> +
> +static int tce_iommu_domain_has_cap(struct iommu_domain *dom,
> +				    unsigned long cap)
> +{
> +	switch (cap) {
> +	case IOMMU_CAP_CACHE_COHERENCY:
> +	case IOMMU_CAP_INTR_REMAP:
> +		/* FIXME: not sure if these are correct */
> +		return 1;
> +/*	case IOMMU_CAP_SETUP_REQUIRED:
> +		return 1;*/
> +	}
> +
> +	return 0;
> +}
> +#if 0
> +static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
> +			phys_addr_t *start_address, size_t *allocated_size)
> +{
> +	struct tce_domain *tcedom = dom->priv;
> +	struct iommu_table *tbl = tcedom->tbl;
> +
> +	if (!tbl) {
> +		printk(KERN_ERR"tce_iommu: not initialized\n");
> +		return -EFAULT;
> +	}
> +	if (!allocated_size || !start_address) {
> +		printk(KERN_ERR"tce_iommu: invalid parameters\n");
> +		return -EFAULT;
> +	}
> +	if (index > 0) {
> +		printk(KERN_ERR"tce_iommu: %u is out of boundary\n", index);
> +		return -EINVAL;
> +	}
> +	*start_address = tbl->it_offset << IOMMU_PAGE_SHIFT;
> +	*allocated_size = tbl->it_size << IOMMU_PAGE_SHIFT;
> +
> +	return 0;
> +}
> +#endif
> +static int device_notifier(struct notifier_block *nb,
> +			   unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	printk("device_notifier(%p) ", dev);
> +	/*if (action == BUS_NOTIFY_ADD_DEVICE)
> +		return add_iommu_group(dev, NULL);*/
> +	switch (action) {
> +#define __x(s) case s: printk("action=" #s " %u 0x%x\n", (s), (s)); break;
> +		__x(BUS_NOTIFY_ADD_DEVICE);
> +		__x(BUS_NOTIFY_DEL_DEVICE);
> +		__x(BUS_NOTIFY_BIND_DRIVER);
> +		__x(BUS_NOTIFY_BOUND_DRIVER);
> +		__x(BUS_NOTIFY_UNBIND_DRIVER);
> +		__x(BUS_NOTIFY_UNBOUND_DRIVER);
> +		default: printk("action=%lu 0x%lx\n", action, action);
> +	}
> +	return 0;
> +}
> +
> +static struct notifier_block device_nb = {
> +	.notifier_call = device_notifier,
> +};
> +
> +static int tce_iommu_add_device_dummy(struct device *dev)
> +{
> +	printk(KERN_ERR"%s: not implemented!\n", __func__);
> +	return -EINVAL;
> +}
> +
> +static void tce_iommu_remove_device_dummy(struct device *dev)
> +{
> +	printk(KERN_ERR"%s: not implemented!\n", __func__);
> +}

These are optional, don't implement if you handle hotplug on your own...

> +
> +static struct iommu_ops tce_iommu_ops = {
> +	.domain_init    = tce_iommu_domain_init,
> +	.domain_destroy = tce_iommu_domain_destroy,
> +	.attach_dev     = tce_iommu_attach_device,
> +	.detach_dev     = tce_iommu_detach_device,
> +	.map            = tce_iommu_map,
> +	.unmap          = tce_iommu_unmap,
> +	.iova_to_phys   = tce_iommu_iova_to_phys,
> +	.domain_has_cap = tce_iommu_domain_has_cap,
> +	.add_device     = tce_iommu_add_device_dummy,
> +	.remove_device  = tce_iommu_remove_device_dummy,
> +/*	.get_dma_window = tce_iommu_get_dma_window,*/
> +	.pgsize_bitmap  = SZ_4K /*| SZ_64K | SZ_1M | SZ_16M;*/
> +};
> +
> +/*
> + * Setup IOMMU API.
> + */
> +static int __init power_pci_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl = NULL;
> +	int ret = 0;
> +
> +	bus_set_iommu(&pci_bus_type, &tce_iommu_ops);

...but if you did implement .add_device, this bus_register_notifier
wouldn't be necessary

> +	bus_register_notifier(&pci_bus_type, &device_nb);

And this for_each would happen as part of bus_set_iommu

> +	for_each_pci_dev(pdev) {

And this code below would be your .add_device function.

> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (NULL == tbl) {
> +			printk("Skipping device %s\n", pdev->dev.kobj.name);
> +			continue;
> +		}
> +		if (!tbl->it_group) {
> +			struct iommu_group *tmp = iommu_group_alloc();
> +			if (IS_ERR(tmp)) {
> +				printk("Failed to create new IOMMU group, "
> +						"ret = %ld\n", PTR_ERR(tmp));
> +				break;
> +			}
> +			tbl->it_group = tmp;
> +		}
> +
> +		ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
> +		if (ret < 0)
> +			printk("iommu_group_add_device(%s) failed with %d\n",
> +					pdev->dev.kobj.name, ret);
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Must be initialized after subsys_initcall(iommu_init) and
> + * subsys_initcall(pcibios_init).
> + */
> +subsys_initcall_sync(power_pci_iommu_init);
> +
> +#endif /* CONFIG_IOMMU_API */
> +
> +/* WORKAROUND */
> +struct kvm;
> +struct kvm_memory_slot;
> +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
> +{
> +	return 0;
> +}
> +
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 6bea696..885ebe1 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -1,5 +1,7 @@
>  # IOMMU_API always gets selected by whoever wants it.
>  config IOMMU_API
> +	bool "IOMMU API Support for PCI pass through"
> +	default n
>  	bool

I think there's a goal to switch dma_ops over to IOMMU_API, so while
it's the primary use, maybe not the sole user.

>  menuconfig IOMMU_SUPPORT
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 77b754c..5788194 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -1,6 +1,7 @@
>  config VFIO_IOMMU_X86
> -	tristate
> -	depends on VFIO && X86
> +	tristate "X86 IOMMU API"
> +	depends on VFIO
> +#	&& X86
>  	default n

We need a new name if that's going to happen.  Thanks, glad to see it
works without too much trouble.

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html