[RFC PATCH] kernel vfio: enabled and supported on power

The idea of the patch is to demonstrate what POWER needs to support VFIO.

This adds support on POWER. That includes:

1) An IOMMU API driver for POWER.
It also includes subsys_initcall_sync(power_pci_iommu_init), which walks
all PCI devices, creates IOMMU groups, and adds the devices to those groups.

2) A prototype for an additional IOMMU API call, tce_iommu_get_dma_window().

We need to tell the POWER guest the location and size of a DMA window, so
I tried adding this to struct iommu_ops:

static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
                        phys_addr_t *start_address, size_t *allocated_size)

The idea is that it returns the 32-bit DMA window for index==0 and the
64-bit DMA window for index==1. This is what we need now.
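
For illustration, here is roughly how the host-side code could report both
windows to userspace. This is only a sketch: report_dma_windows() is a
made-up helper, and the prototype further below only implements index==0:

static void report_dma_windows(struct iommu_domain *dom)
{
	phys_addr_t start;
	size_t size;
	int index;

	/* index 0 is the 32-bit window, index 1 is the 64-bit window */
	for (index = 0; index < 2; index++) {
		if (tce_iommu_get_dma_window(dom, index, &start, &size))
			break;
		/* copy start/size into the ioctl reply for QEMU */
	}
}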

However, I noticed a move towards implementing the IOMMU chardev separately
for every platform (in the kernel: drivers/vfio/vfio_iommu_x86.c).

I.e. QEMU does an ioctl to the host, the call lands in a _platform-specific_
IOMMU chardev, the chardev calls a _platform-independent_ IOMMU API function
(let's call it iommu_get_dma_window()), and iommu_get_dma_window() in turn
calls the _platform-specific_ tce_iommu_get_dma_window().
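
That is, the platform-independent piece would be a trivial dispatcher
through struct iommu_ops, just like iommu_map() is today. A sketch (the
get_dma_window member is the proposed extension, it does not exist in
struct iommu_ops yet):

int iommu_get_dma_window(struct iommu_domain *domain, int index,
			phys_addr_t *start_address, size_t *allocated_size)
{
	if (unlikely(domain->ops->get_dma_window == NULL))
		return -ENODEV;

	return domain->ops->get_dma_window(domain, index,
			start_address, allocated_size);
}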


Another example: the DMA map/unmap implementation on x86 is split between
two pieces of code - drivers/vfio/vfio_iommu_x86.c and drivers/iommu/intel-iommu.c.
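
That is, the chardev pins the user pages and calls the generic iommu_map(),
which dispatches into the Intel driver. Simplified, the call path looks
like this (a sketch of the flow, not the actual code):

	/* drivers/vfio/vfio_iommu_x86.c side */
	ret = iommu_map(domain, iova, page_to_phys(page), PAGE_SIZE, prot);

	/* drivers/iommu/iommu.c then dispatches */
	ret = domain->ops->map(domain, iova, paddr, size, prot);

	/* ...which ends up in intel_iommu_map() on x86 */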

And drivers/vfio/vfio_iommu_x86.c works perfectly for POWER except for the
DMA window setup, which I dropped for now; instead I simply use a common
configuration on POWER (a 1GB DMA window starting at 0x0).

As I see it, this is too complicated; the two layers are redundant, and we could drop either
- the platform-specific IOMMU chardev, or
- the IOMMU API entirely.

What am I missing?


Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
---
 arch/powerpc/include/asm/iommu.h |    3 +
 arch/powerpc/kernel/iommu.c      |  302 ++++++++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig            |    2 +
 drivers/vfio/Kconfig             |    5 +-
 4 files changed, 310 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index edfc980..92aeb57 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
 	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
 	spinlock_t     it_lock;      /* Protects it_map */
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };

 struct scatterlist;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98..6e40870 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -39,6 +39,9 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
+#include <asm-generic/sizes.h>
+
+#include <linux/iommu.h>

 #define DBG(...)

@@ -677,3 +680,302 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+
+/*
+ * IOMMU API implementation.
+ *
+ * Note: only one domain per IOMMU group is enabled at the moment.
+ *
+ */
+struct tce_domain {
+	struct iommu_table *tbl;
+};
+
+static int tce_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct tce_domain *tcedom;
+
+	tcedom = kzalloc(sizeof(*tcedom), GFP_KERNEL);
+	if (!tcedom)
+		return -ENOMEM;
+
+	dom->priv = tcedom;
+	printk("TCE Domain %p (IOMMU tcedom %p) initialized\n", tcedom, dom);
+
+	return 0;
+}
+
+static void tce_iommu_domain_destroy(struct iommu_domain *dom)
+{
+	struct tce_domain *tcedom = dom->priv;
+
+	printk("TCE Domain %p (IOMMU tcedom %p) destroyed\n", tcedom, dom);
+	if (!tcedom)
+		return;
+
+	if (tcedom->tbl) {
+		/*
+		 * At the moment the kernel cleans the TCE table up before
+		 * use anyway, but it would be nicer to clean it here, when
+		 * the table is released.
+		 */
+		printk("TODO: clean the TCE table\n");
+	}
+
+	kfree(tcedom);
+
+	dom->priv = NULL;
+}
+
+static int tce_iommu_attach_device(struct iommu_domain *dom,
+				   struct device *dev)
+{
+	struct tce_domain *tcedom = dom->priv;
+
+	if (!tcedom->tbl) {
+		tcedom->tbl = get_iommu_table_base(dev);
+		printk("TCE Domain %p (IOMMU tcedom %p) - "
+				"device %p is first in a domain\n",
+				tcedom, dom, dev);
+	} else if (tcedom->tbl != get_iommu_table_base(dev)) {
+		printk("TCE Domain %p (IOMMU tcedom %p) - "
+				"device %p NOT attached, wrong group\n",
+				tcedom, dom, dev);
+		return -EBUSY;
+	}
+
+	printk("TCE Domain %p (IOMMU tcedom %p) - device %p attached\n",
+			tcedom, dom, dev);
+
+	return 0;
+}
+
+static void tce_iommu_detach_device(struct iommu_domain *dom,
+				    struct device *dev)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+
+	printk("TCE Domain %p (IOMMU tcedom %p) - device %p DEtached\n",
+			tcedom, dom, dev);
+	BUG_ON(tbl !=get_iommu_table_base(dev));
+}
+
+static int tce_iommu_map(struct iommu_domain *dom, unsigned long iova,
+		   phys_addr_t paddr, size_t size, int prot)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry, flags;
+	int build_fail;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	entry = iova >> IOMMU_PAGE_SHIFT;
+	build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+			(unsigned long)paddr & IOMMU_PAGE_MASK,
+			DMA_BIDIRECTIONAL, NULL/*attrs*/);
+
+	/*
+	 * ppc_md.tce_build() only returns non-zero for transient
+	 * errors, so there is nothing to clean up here; just report
+	 * -EFAULT to the caller in that case.
+	 */
+	if (unlikely(build_fail)) {
+		printk("Failed to add TCE\n");
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return -EFAULT;
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return 0;
+}
+
+static size_t tce_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+		     size_t size)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry, flags;
+	entry = iova >> IOMMU_PAGE_SHIFT;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+	ppc_md.tce_free(tbl, entry, 1);
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return size;
+}
+
+static phys_addr_t tce_iommu_iova_to_phys(struct iommu_domain *dom,
+		unsigned long iova)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+	unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+	phys_addr_t ret = 0;
+
+	if (ppc_md.tce_get)
+		ret = ppc_md.tce_get(tbl, entry);
+
+	return ret;
+}
+
+static int tce_iommu_domain_has_cap(struct iommu_domain *dom,
+				    unsigned long cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+	case IOMMU_CAP_INTR_REMAP:
+		/* FIXME: not sure if these are correct */
+		return 1;
+/*	case IOMMU_CAP_SETUP_REQUIRED:
+		return 1;*/
+	}
+
+	return 0;
+}
+#if 0
+static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
+			phys_addr_t *start_address, size_t *allocated_size)
+{
+	struct tce_domain *tcedom = dom->priv;
+	struct iommu_table *tbl = tcedom->tbl;
+
+	if (!tbl) {
+		printk(KERN_ERR"tce_iommu: not initialized\n");
+		return -EFAULT;
+	}
+	if (!allocated_size || !start_address) {
+		printk(KERN_ERR"tce_iommu: invalid parameters\n");
+		return -EFAULT;
+	}
+	if (index > 0) {
+		printk(KERN_ERR"tce_iommu: %u is out of boundary\n", index);
+		return -EINVAL;
+	}
+	*start_address = tbl->it_offset << IOMMU_PAGE_SHIFT;
+	*allocated_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+
+	return 0;
+}
+#endif
+static int device_notifier(struct notifier_block *nb,
+			   unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	printk("device_notifier(%p) ", dev);
+	/*if (action == BUS_NOTIFY_ADD_DEVICE)
+		return add_iommu_group(dev, NULL);*/
+	switch (action) {
+#define __x(s) case s: printk("action=" #s " %u 0x%x\n", (s), (s)); break;
+		__x(BUS_NOTIFY_ADD_DEVICE);
+		__x(BUS_NOTIFY_DEL_DEVICE);
+		__x(BUS_NOTIFY_BIND_DRIVER);
+		__x(BUS_NOTIFY_BOUND_DRIVER);
+		__x(BUS_NOTIFY_UNBIND_DRIVER);
+		__x(BUS_NOTIFY_UNBOUND_DRIVER);
+		default: printk("action=%lu 0x%lx\n", action, action);
+	}
+	return 0;
+}
+
+static struct notifier_block device_nb = {
+	.notifier_call = device_notifier,
+};
+
+static int tce_iommu_add_device_dummy(struct device *dev)
+{
+	printk(KERN_ERR"%s: not implemented!\n", __func__);
+	return -EINVAL;
+}
+
+static void tce_iommu_remove_device_dummy(struct device *dev)
+{
+	printk(KERN_ERR"%s: not implemented!\n", __func__);
+}
+
+static struct iommu_ops tce_iommu_ops = {
+	.domain_init    = tce_iommu_domain_init,
+	.domain_destroy = tce_iommu_domain_destroy,
+	.attach_dev     = tce_iommu_attach_device,
+	.detach_dev     = tce_iommu_detach_device,
+	.map            = tce_iommu_map,
+	.unmap          = tce_iommu_unmap,
+	.iova_to_phys   = tce_iommu_iova_to_phys,
+	.domain_has_cap = tce_iommu_domain_has_cap,
+	.add_device     = tce_iommu_add_device_dummy,
+	.remove_device  = tce_iommu_remove_device_dummy,
+/*	.get_dma_window = tce_iommu_get_dma_window,*/
+	.pgsize_bitmap  = SZ_4K /* | SZ_64K | SZ_1M | SZ_16M */
+};
+
+/*
+ * Setup IOMMU API.
+ */
+static int __init power_pci_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl = NULL;
+	int ret = 0;
+
+	bus_set_iommu(&pci_bus_type, &tce_iommu_ops);
+	bus_register_notifier(&pci_bus_type, &device_nb);
+
+	for_each_pci_dev(pdev) {
+
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl) {
+			printk("Skipping device %s\n", dev_name(&pdev->dev));
+			continue;
+		}
+		if (!tbl->it_group) {
+			struct iommu_group *tmp = iommu_group_alloc();
+			if (IS_ERR(tmp)) {
+				printk("Failed to create new IOMMU group, "
+						"ret = %ld\n", PTR_ERR(tmp));
+				break;
+			}
+			tbl->it_group = tmp;
+		}
+
+		ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+		if (ret < 0)
+			printk("iommu_group_add_device(%s) failed with %d\n",
+					pdev->dev.kobj.name, ret);
+	}
+
+	return 0;
+}
+
+/*
+ * Must be initialized after subsys_initcall(iommu_init) and
+ * subsys_initcall(pcibios_init).
+ */
+subsys_initcall_sync(power_pci_iommu_init);
+
+#endif /* CONFIG_IOMMU_API */
+
+/* WORKAROUND */
+struct kvm;
+struct kvm_memory_slot;
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	return 0;
+}
+
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6bea696..885ebe1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -1,5 +1,7 @@
 # IOMMU_API always gets selected by whoever wants it.
 config IOMMU_API
+	bool "IOMMU API Support for PCI pass through"
+	default n
 	bool

 menuconfig IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 77b754c..5788194 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,6 +1,7 @@
 config VFIO_IOMMU_X86
-	tristate
-	depends on VFIO && X86
+	tristate "X86 IOMMU API"
+	depends on VFIO
+#	&& X86
 	default n

 menuconfig VFIO


-- 
Alexey