Hi Jean,

> -----Original Message-----
> From: virtio-dev@xxxxxxxxxxxxxxxxxxxx [mailto:virtio-dev@lists.oasis-open.org] On Behalf Of Jean-Philippe Brucker
> Sent: Saturday, April 08, 2017 12:53 AM
> To: iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx; virtio-dev@xxxxxxxxxxxxxxxxxxxx
> Cc: cdall@xxxxxxxxxx; will.deacon@xxxxxxx; robin.murphy@xxxxxxx; lorenzo.pieralisi@xxxxxxx; joro@xxxxxxxxxx; mst@xxxxxxxxxx; jasowang@xxxxxxxxxx; alex.williamson@xxxxxxxxxx; marc.zyngier@xxxxxxx
> Subject: [virtio-dev] [RFC PATCH linux] iommu: Add virtio-iommu driver
>
> The virtio IOMMU is a para-virtualized device that allows IOMMU requests
> such as map/unmap to be sent over the virtio-mmio transport. This driver
> illustrates the initial proposal for virtio-iommu, which you hopefully
> received with it. It handles attach, detach, map and unmap requests.
>
> The bulk of the code is to create requests and send them through virtio.
> Implementing the IOMMU API is fairly straightforward since the
> virtio-iommu MAP/UNMAP interface is almost identical. I threw in a custom
> map_sg() function, which takes up some space but is optional. The core
> fallback would send a sequence of map requests, waiting for a reply
> between each mapping; the custom map_sg() instead prepares a batch of
> requests in the virtio ring and kicks the host once, avoiding a yield to
> the host after each map.
>
> It must be applied on top of the probe deferral work for IOMMU, currently
> under discussion. This allows dissociating early driver detection from
> device probing: device-tree or ACPI is parsed early to find which devices
> are translated by the IOMMU, but the IOMMU itself cannot be probed until
> the core virtio module is loaded.
>
> Enabling DEBUG makes it extremely verbose at the moment, but it should be
> calmer in future versions.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx>
> ---
>  drivers/iommu/Kconfig             |  11 +
>  drivers/iommu/Makefile            |   1 +
>  drivers/iommu/virtio-iommu.c      | 980 ++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/Kbuild         |   1 +
>  include/uapi/linux/virtio_ids.h   |   1 +
>  include/uapi/linux/virtio_iommu.h | 142 ++++++
>  6 files changed, 1136 insertions(+)
>  create mode 100644 drivers/iommu/virtio-iommu.c
>  create mode 100644 include/uapi/linux/virtio_iommu.h
>
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 37e204f3d9be..8cd56ee9a93a 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -359,4 +359,15 @@ config MTK_IOMMU_V1
>
>  	  if unsure, say N here.
>
> +config VIRTIO_IOMMU
> +	tristate "Virtio IOMMU driver"
> +	depends on VIRTIO_MMIO
> +	select IOMMU_API
> +	select INTERVAL_TREE
> +	select ARM_DMA_USE_IOMMU if ARM
> +	help
> +	  Para-virtualised IOMMU driver with virtio.
> +
> +	  Say Y here if you intend to run this kernel as a guest.
> + > endif # IOMMU_SUPPORT > diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index > 195f7b997d8e..1199d8475802 100644 > --- a/drivers/iommu/Makefile > +++ b/drivers/iommu/Makefile > @@ -27,3 +27,4 @@ obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra- > smmu.o > obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o > obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o > obj-$(CONFIG_S390_IOMMU) += s390-iommu.o > +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o > diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c > new file mode 100644 index 000000000000..1cf4f57b7817 > --- /dev/null > +++ b/drivers/iommu/virtio-iommu.c > @@ -0,0 +1,980 @@ > +/* > + * Virtio driver for the paravirtualized IOMMU > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, > USA. > + * > + * Copyright (C) 2017 ARM Limited > + * > + * Author: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> */ > + > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/amba/bus.h> > +#include <linux/delay.h> > +#include <linux/dma-iommu.h> > +#include <linux/freezer.h> > +#include <linux/interval_tree.h> > +#include <linux/iommu.h> > +#include <linux/module.h> > +#include <linux/of_iommu.h> > +#include <linux/of_platform.h> > +#include <linux/platform_device.h> > +#include <linux/virtio.h> > +#include <linux/virtio_config.h> > +#include <linux/virtio_ids.h> > +#include <linux/wait.h> > + > +#include <uapi/linux/virtio_iommu.h> > + > +struct viommu_dev { > + struct iommu_device iommu; > + struct device *dev; > + struct virtio_device *vdev; > + > + struct virtqueue *vq; > + struct list_head pending_requests; > + /* Serialize anything touching the vq and the request list */ > + spinlock_t vq_lock; > + > + struct list_head list; > + > + /* Device configuration */ > + u64 pgsize_bitmap; > + u64 aperture_start; > + u64 aperture_end; > +}; > + > +struct viommu_mapping { > + phys_addr_t paddr; > + struct interval_tree_node iova; > +}; > + > +struct viommu_domain { > + struct iommu_domain domain; > + struct viommu_dev *viommu; > + struct mutex mutex; > + u64 id; > + > + spinlock_t mappings_lock; > + struct rb_root mappings; > + > + /* Number of devices attached to this domain */ > + unsigned long attached; > +}; > + > +struct viommu_endpoint { > + struct viommu_dev *viommu; > + struct viommu_domain *vdomain; > +}; > + > +struct viommu_request { > + struct scatterlist head; > + struct scatterlist tail; > + > + int written; > + struct list_head list; > +}; > + > +/* TODO: use an IDA */ > +static atomic64_t viommu_domain_ids_gen; > + > +#define to_viommu_domain(domain) container_of(domain, struct > +viommu_domain, domain) > + > +/* Virtio transport */ > + > +static int viommu_status_to_errno(u8 status) { > + switch (status) { > + case VIRTIO_IOMMU_S_OK: > + return 0; > + case VIRTIO_IOMMU_S_UNSUPP: > + return -ENOSYS; > + case VIRTIO_IOMMU_S_INVAL: > + return -EINVAL; > + case 
VIRTIO_IOMMU_S_RANGE: > + return -ERANGE; > + case VIRTIO_IOMMU_S_NOENT: > + return -ENOENT; > + case VIRTIO_IOMMU_S_FAULT: > + return -EFAULT; > + case VIRTIO_IOMMU_S_IOERR: > + case VIRTIO_IOMMU_S_DEVERR: > + default: > + return -EIO; > + } > +} > + > +static int viommu_get_req_size(struct virtio_iommu_req_head *req, size_t > *head, > + size_t *tail) > +{ > + size_t size; > + union virtio_iommu_req r; > + > + *tail = sizeof(struct virtio_iommu_req_tail); > + > + switch (req->type) { > + case VIRTIO_IOMMU_T_ATTACH: > + size = sizeof(r.attach); > + break; > + case VIRTIO_IOMMU_T_DETACH: > + size = sizeof(r.detach); > + break; > + case VIRTIO_IOMMU_T_MAP: > + size = sizeof(r.map); > + break; > + case VIRTIO_IOMMU_T_UNMAP: > + size = sizeof(r.unmap); > + break; > + default: > + return -EINVAL; > + } > + > + *head = size - *tail; > + return 0; > +} > + > +static int viommu_receive_resp(struct viommu_dev *viommu, int > +nr_expected) { > + > + unsigned int len; > + int nr_received = 0; > + struct viommu_request *req, *pending, *next; > + > + pending = list_first_entry_or_null(&viommu->pending_requests, > + struct viommu_request, list); > + if (WARN_ON(!pending)) > + return 0; > + > + while ((req = virtqueue_get_buf(viommu->vq, &len)) != NULL) { > + if (req != pending) { > + dev_warn(viommu->dev, "discarding stale > request\n"); > + continue; > + } > + > + pending->written = len; > + > + if (++nr_received == nr_expected) { > + list_del(&pending->list); > + /* > + * In an ideal world, we'd wake up the waiter for this > + * group of requests here. But everything is painfully > + * synchronous, so waiter is the caller. > + */ > + break; > + } > + > + next = list_next_entry(pending, list); > + list_del(&pending->list); > + > + if (WARN_ON(list_empty(&viommu->pending_requests))) > + return 0; > + > + pending = next; > + } > + > + return nr_received; > +} > + > +/* Must be called with vq_lock held */ > +static int _viommu_send_reqs_sync(struct viommu_dev *viommu, > + struct viommu_request *req, int nr, > + int *nr_sent) > +{ > + int i, ret; > + ktime_t timeout; > + int nr_received = 0; > + struct scatterlist *sg[2]; > + /* > + * FIXME: as it stands, 1s timeout per request. This is a voluntary > + * exaggeration because I have no idea how real our ktime is. Are we > + * using a RTC? Are we aware of steal time? I don't know much about > + * this, need to do some digging. > + */ > + unsigned long timeout_ms = 1000; > + > + *nr_sent = 0; > + > + for (i = 0; i < nr; i++, req++) { > + /* > + * The backend will allocate one indirect descriptor for each > + * request, which allows to double the ring consumption, but > + * might be slower. > + */ > + req->written = 0; > + > + sg[0] = &req->head; > + sg[1] = &req->tail; > + > + ret = virtqueue_add_sgs(viommu->vq, sg, 1, 1, req, > + GFP_ATOMIC); > + if (ret) > + break; > + > + list_add_tail(&req->list, &viommu->pending_requests); > + } > + > + if (i && !virtqueue_kick(viommu->vq)) > + return -EPIPE; > + > + /* > + * Absolutely no wiggle room here. We're not allowed to sleep as > callers > + * might be holding spinlocks, so we have to poll like savages until > + * something appears. Hopefully the host already handled the > request > + * during the above kick and returned it to us. > + * > + * A nice improvement would be for the caller to tell us if we can > sleep > + * whilst mapping, but this has to go through the IOMMU/DMA API. 
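> +	 * Until then, the loop below polls viommu_receive_resp() every 10us
> +	 * and gives up once the budget of timeout_ms per sent request expires.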
> + */ > + timeout = ktime_add_ms(ktime_get(), timeout_ms * i); > + while (nr_received < i && ktime_before(ktime_get(), timeout)) { > + nr_received += viommu_receive_resp(viommu, i - > nr_received); > + if (nr_received < i) { > + /* > + * FIXME: what's a good way to yield to host? A > second > + * virtqueue_kick won't have any effect since we > haven't > + * added any descriptor. > + */ > + udelay(10); > + } > + } > + dev_dbg(viommu->dev, "request took %lld us\n", > + ktime_us_delta(ktime_get(), ktime_sub_ms(timeout, > timeout_ms * i))); > + > + if (nr_received != i) > + ret = -ETIMEDOUT; > + > + if (ret == -ENOSPC && nr_received) > + /* > + * We've freed some space since virtio told us that the ring is > + * full, tell the caller to come back later (after releasing the > + * lock first, to be fair to other threads) > + */ > + ret = -EAGAIN; > + > + *nr_sent = nr_received; > + > + return ret; > +} > + > +/** > + * viommu_send_reqs_sync - add a batch of requests, kick the host and > wait for > + * them to return > + * > + * @req: array of requests > + * @nr: size of the array > + * @nr_sent: contains the number of requests actually sent after this > function > + * returns > + * > + * Return 0 on success, or an error if we failed to send some of the > requests. > + */ > +static int viommu_send_reqs_sync(struct viommu_dev *viommu, > + struct viommu_request *req, int nr, > + int *nr_sent) > +{ > + int ret; > + int sent = 0; > + unsigned long flags; > + > + *nr_sent = 0; > + do { > + spin_lock_irqsave(&viommu->vq_lock, flags); > + ret = _viommu_send_reqs_sync(viommu, req, nr, &sent); > + spin_unlock_irqrestore(&viommu->vq_lock, flags); > + > + *nr_sent += sent; > + req += sent; > + nr -= sent; > + } while (ret == -EAGAIN); > + > + return ret; > +} > + > +/** > + * viommu_send_req_sync - send one request and wait for reply > + * > + * @head_ptr: pointer to a virtio_iommu_req_* structure > + * > + * Returns 0 if the request was successful, or an error number > +otherwise. No > + * distinction is done between transport and request errors. 
> + */ > +static int viommu_send_req_sync(struct viommu_dev *viommu, void > +*head_ptr) { > + int ret; > + int nr_sent; > + struct viommu_request req; > + size_t head_size, tail_size; > + struct virtio_iommu_req_tail *tail; > + struct virtio_iommu_req_head *head = head_ptr; > + > + ret = viommu_get_req_size(head, &head_size, &tail_size); > + if (ret) > + return ret; > + > + dev_dbg(viommu->dev, "Sending request 0x%x, %zu bytes\n", > head->type, > + head_size + tail_size); > + > + tail = head_ptr + head_size; > + > + sg_init_one(&req.head, head, head_size); > + sg_init_one(&req.tail, tail, tail_size); > + > + ret = viommu_send_reqs_sync(viommu, &req, 1, &nr_sent); > + if (ret || !req.written || nr_sent != 1) { > + dev_err(viommu->dev, "failed to send command\n"); > + return -EIO; > + } > + > + ret = -viommu_status_to_errno(tail->status); > + > + if (ret) > + dev_dbg(viommu->dev, " completed with %d\n", ret); > + > + return ret; > +} > + > +static int viommu_tlb_map(struct viommu_domain *vdomain, unsigned > long iova, > + phys_addr_t paddr, size_t size) > +{ > + unsigned long flags; > + struct viommu_mapping *mapping; > + > + mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC); > + if (!mapping) > + return -ENOMEM; > + > + mapping->paddr = paddr; > + mapping->iova.start = iova; > + mapping->iova.last = iova + size - 1; > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + interval_tree_insert(&mapping->iova, &vdomain->mappings); > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + return 0; > +} > + > +static size_t viommu_tlb_unmap(struct viommu_domain *vdomain, > + unsigned long iova, size_t size) { > + size_t unmapped = 0; > + unsigned long flags; > + unsigned long last = iova + size - 1; > + struct viommu_mapping *mapping = NULL; > + struct interval_tree_node *node, *next; > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + next = interval_tree_iter_first(&vdomain->mappings, iova, last); > + while (next) { > + node = next; > + mapping = container_of(node, struct viommu_mapping, > iova); > + > + next = interval_tree_iter_next(node, iova, last); > + > + /* > + * Note that for a partial range, this will return the full > + * mapping so we avoid sending split requests to the device. 
> + */ > + unmapped += mapping->iova.last - mapping->iova.start + 1; > + > + interval_tree_remove(node, &vdomain->mappings); > + kfree(mapping); > + } > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + return unmapped; > +} > + > +/* IOMMU API */ > + > +static bool viommu_capable(enum iommu_cap cap) { > + return false; /* :( */ > +} > + > +static struct iommu_domain *viommu_domain_alloc(unsigned type) { > + struct viommu_domain *vdomain; > + > + if (type != IOMMU_DOMAIN_UNMANAGED && type != > IOMMU_DOMAIN_DMA) > + return NULL; > + > + vdomain = kzalloc(sizeof(struct viommu_domain), GFP_KERNEL); > + if (!vdomain) > + return NULL; > + > + vdomain->id = > atomic64_inc_return_relaxed(&viommu_domain_ids_gen); > + > + mutex_init(&vdomain->mutex); > + spin_lock_init(&vdomain->mappings_lock); > + vdomain->mappings = RB_ROOT; > + > + pr_debug("alloc domain of type %d -> %llu\n", type, vdomain->id); > + > + if (type == IOMMU_DOMAIN_DMA && > + iommu_get_dma_cookie(&vdomain->domain)) { > + kfree(vdomain); > + return NULL; > + } > + > + return &vdomain->domain; > +} > + > +static void viommu_domain_free(struct iommu_domain *domain) { > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + pr_debug("free domain %llu\n", vdomain->id); > + > + iommu_put_dma_cookie(domain); > + > + /* Free all remaining mappings (size 2^64) */ > + viommu_tlb_unmap(vdomain, 0, 0); > + > + kfree(vdomain); > +} > + > +static int viommu_attach_dev(struct iommu_domain *domain, struct > device > +*dev) { > + int i; > + int ret = 0; > + struct iommu_fwspec *fwspec = dev->iommu_fwspec; > + struct viommu_endpoint *vdev = fwspec->iommu_priv; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + struct virtio_iommu_req_attach req = { > + .head.type = VIRTIO_IOMMU_T_ATTACH, > + .address_space = cpu_to_le32(vdomain->id), > + }; > + > + mutex_lock(&vdomain->mutex); > + if (!vdomain->viommu) { > + struct viommu_dev *viommu = vdev->viommu; > + > + vdomain->viommu = viommu; > + > + domain->pgsize_bitmap = viommu- > >pgsize_bitmap; > + domain->geometry.aperture_start = viommu- > >aperture_start; > + domain->geometry.aperture_end = viommu- > >aperture_end; > + domain->geometry.force_aperture = true; > + > + } else if (vdomain->viommu != vdev->viommu) { > + dev_err(dev, "cannot attach to foreign VIOMMU\n"); > + ret = -EXDEV; > + } > + mutex_unlock(&vdomain->mutex); > + > + if (ret) > + return ret; > + > + /* > + * When attaching the device to a new domain, it will be detached > from > + * the old one and, if as as a result the old domain isn't attached to > + * any device, all mappings are removed from the old domain and it is > + * freed. (Note that we can't use get_domain_for_dev here, it > returns > + * the default domain during initial attach.) > + * > + * Take note of the device disappearing, so we can ignore unmap > request > + * on stale domains (that is, between this detach and the upcoming > + * free.) 
> +	 *
> +	 * vdev->vdomain is protected by group->mutex
> +	 */
> +	if (vdev->vdomain) {
> +		dev_dbg(dev, "detach from domain %llu\n", vdev->vdomain->id);
> +		vdev->vdomain->attached--;
> +	}
> +
> +	dev_dbg(dev, "attach to domain %llu\n", vdomain->id);
> +
> +	for (i = 0; i < fwspec->num_ids; i++) {
> +		req.device = cpu_to_le32(fwspec->ids[i]);
> +
> +		ret = viommu_send_req_sync(vdomain->viommu, &req);
> +		if (ret)
> +			break;
> +	}
> +
> +	vdomain->attached++;
> +	vdev->vdomain = vdomain;
> +
> +	return ret;
> +}
> +
> +static int viommu_map(struct iommu_domain *domain, unsigned long iova,
> +		      phys_addr_t paddr, size_t size, int prot)
> +{
> +	int ret;
> +	struct viommu_domain *vdomain = to_viommu_domain(domain);
> +	struct virtio_iommu_req_map req = {
> +		.head.type = VIRTIO_IOMMU_T_MAP,
> +		.address_space = cpu_to_le32(vdomain->id),
> +		.virt_addr = cpu_to_le64(iova),
> +		.phys_addr = cpu_to_le64(paddr),
> +		.size = cpu_to_le64(size),
> +	};
> +
> +	pr_debug("map %llu 0x%lx -> 0x%llx (%zu)\n", vdomain->id, iova,
> +		 paddr, size);

A query: when I trace the above prints, I see the same physical address
being mapped at two different virtual addresses. Do you know why the
kernel does this?

Thanks
-Bharat

> +
> +	if (!vdomain->attached)
> +		return -ENODEV;
> +
> +	if (prot & IOMMU_READ)
> +		req.flags |= cpu_to_le32(VIRTIO_IOMMU_MAP_F_READ);
> +
> +	if (prot & IOMMU_WRITE)
> +		req.flags |= cpu_to_le32(VIRTIO_IOMMU_MAP_F_WRITE);
> +
> +	ret = viommu_tlb_map(vdomain, iova, paddr, size);
> +	if (ret)
> +		return ret;
> +
> +	ret = viommu_send_req_sync(vdomain->viommu, &req);
> +	if (ret)
> +		viommu_tlb_unmap(vdomain, iova, size);
> +
> +	return ret;
> +}
> +
> +static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
> +			   size_t size)
> +{
> +	int ret;
> +	size_t unmapped;
> +	struct viommu_domain *vdomain = to_viommu_domain(domain);
> +	struct virtio_iommu_req_unmap req = {
> +		.head.type = VIRTIO_IOMMU_T_UNMAP,
> +		.address_space = cpu_to_le32(vdomain->id),
> +		.virt_addr = cpu_to_le64(iova),
> +	};
> +
> +	pr_debug("unmap %llu 0x%lx (%zu)\n", vdomain->id, iova, size);
> +
> +	/* Callers may unmap after detach, but device already took care of it.
> */ > + if (!vdomain->attached) > + return size; > + > + unmapped = viommu_tlb_unmap(vdomain, iova, size); > + if (unmapped < size) > + return 0; > + > + req.size = cpu_to_le64(unmapped); > + > + ret = viommu_send_req_sync(vdomain->viommu, &req); > + if (ret) > + return 0; > + > + return unmapped; > +} > + > +static size_t viommu_map_sg(struct iommu_domain *domain, unsigned > long iova, > + struct scatterlist *sg, unsigned int nents, int prot) { > + int i, ret; > + int nr_sent; > + size_t mapped; > + size_t min_pagesz; > + size_t total_size; > + struct scatterlist *s; > + unsigned int flags = 0; > + unsigned long cur_iova; > + unsigned long mapped_iova; > + size_t head_size, tail_size; > + struct viommu_request reqs[nents]; > + struct virtio_iommu_req_map map_reqs[nents]; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + if (!vdomain->attached) > + return 0; > + > + pr_debug("map_sg %llu %u 0x%lx\n", vdomain->id, nents, iova); > + > + if (prot & IOMMU_READ) > + flags |= VIRTIO_IOMMU_MAP_F_READ; > + > + if (prot & IOMMU_WRITE) > + flags |= VIRTIO_IOMMU_MAP_F_WRITE; > + > + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); > + tail_size = sizeof(struct virtio_iommu_req_tail); > + head_size = sizeof(*map_reqs) - tail_size; > + > + cur_iova = iova; > + > + for_each_sg(sg, s, nents, i) { > + size_t size = s->length; > + phys_addr_t paddr = sg_phys(s); > + void *tail = (void *)&map_reqs[i] + head_size; > + > + if (!IS_ALIGNED(paddr | size, min_pagesz)) { > + ret = -EFAULT; > + break; > + } > + > + /* TODO: merge physically-contiguous mappings if any */ > + map_reqs[i] = (struct virtio_iommu_req_map) { > + .head.type = VIRTIO_IOMMU_T_MAP, > + .address_space = cpu_to_le32(vdomain->id), > + .flags = cpu_to_le32(flags), > + .virt_addr = cpu_to_le64(cur_iova), > + .phys_addr = cpu_to_le64(paddr), > + .size = cpu_to_le64(size), > + }; > + > + ret = viommu_tlb_map(vdomain, cur_iova, paddr, size); > + if (ret) > + break; > + > + sg_init_one(&reqs[i].head, &map_reqs[i], head_size); > + sg_init_one(&reqs[i].tail, tail, tail_size); > + > + cur_iova += size; > + } > + > + total_size = cur_iova - iova; > + > + if (ret) { > + viommu_tlb_unmap(vdomain, iova, total_size); > + return 0; > + } > + > + ret = viommu_send_reqs_sync(vdomain->viommu, reqs, i, > &nr_sent); > + > + if (nr_sent != nents) > + goto err_rollback; > + > + for (i = 0; i < nents; i++) { > + if (!reqs[i].written || map_reqs[i].tail.status) > + goto err_rollback; > + } > + > + return total_size; > + > +err_rollback: > + /* > + * Any request in the range might have failed. Unmap what was > + * successful. 
> + */ > + cur_iova = iova; > + mapped_iova = iova; > + mapped = 0; > + for_each_sg(sg, s, nents, i) { > + size_t size = s->length; > + > + cur_iova += size; > + > + if (!reqs[i].written || map_reqs[i].tail.status) { > + if (mapped) > + viommu_unmap(domain, mapped_iova, > mapped); > + > + mapped_iova = cur_iova; > + mapped = 0; > + } else { > + mapped += size; > + } > + } > + > + viommu_tlb_unmap(vdomain, iova, total_size); > + > + return 0; > +} > + > +static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, > + dma_addr_t iova) > +{ > + u64 paddr = 0; > + unsigned long flags; > + struct viommu_mapping *mapping; > + struct interval_tree_node *node; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + node = interval_tree_iter_first(&vdomain->mappings, iova, iova); > + if (node) { > + mapping = container_of(node, struct viommu_mapping, > iova); > + paddr = mapping->paddr + (iova - mapping->iova.start); > + } > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + pr_debug("iova_to_phys %llu 0x%llx->0x%llx\n", vdomain->id, iova, > + paddr); > + > + return paddr; > +} > + > +static struct iommu_ops viommu_ops; > +static struct virtio_driver virtio_iommu_drv; > + > +static int viommu_match_node(struct device *dev, void *data) { > + return dev->parent->fwnode == data; > +} > + > +static struct viommu_dev *viommu_get_by_fwnode(struct > fwnode_handle > +*fwnode) { > + struct device *dev = driver_find_device(&virtio_iommu_drv.driver, > NULL, > + fwnode, > viommu_match_node); > + put_device(dev); > + > + return dev ? dev_to_virtio(dev)->priv : NULL; } > + > +static int viommu_add_device(struct device *dev) { > + struct iommu_group *group; > + struct viommu_endpoint *vdev; > + struct viommu_dev *viommu = NULL; > + struct iommu_fwspec *fwspec = dev->iommu_fwspec; > + > + if (!fwspec || fwspec->ops != &viommu_ops) > + return -ENODEV; > + > + viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); > + if (!viommu) > + return -ENODEV; > + > + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); > + if (!vdev) > + return -ENOMEM; > + > + vdev->viommu = viommu; > + fwspec->iommu_priv = vdev; > + > + /* > + * Last step creates a default domain and attaches to it. Everything > + * must be ready. > + */ > + group = iommu_group_get_for_dev(dev); > + > + return PTR_ERR_OR_ZERO(group); > +} > + > +static void viommu_remove_device(struct device *dev) { > + kfree(dev->iommu_fwspec->iommu_priv); > +} > + > +static struct iommu_group * > +viommu_device_group(struct device *dev) { > + if (dev_is_pci(dev)) > + return pci_device_group(dev); > + else > + return generic_device_group(dev); > +} > + > +static int viommu_of_xlate(struct device *dev, struct of_phandle_args > +*args) { > + u32 *id = args->args; > + > + dev_dbg(dev, "of_xlate 0x%x\n", *id); > + return iommu_fwspec_add_ids(dev, args->args, 1); } > + > +/* > + * (Maybe) temporary hack for device pass-through into guest userspace. > +On ARM > + * with an ITS, VFIO will look for a region where to map the doorbell, > +even > + * though the virtual doorbell is never written to by the device, and > +instead > + * the host injects interrupts directly. TODO: sort this out in VFIO. 
> + */ > +#define MSI_IOVA_BASE 0x8000000 > +#define MSI_IOVA_LENGTH 0x100000 > + > +static void viommu_get_resv_regions(struct device *dev, struct > +list_head *head) { > + struct iommu_resv_region *region; > + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; > + > + region = iommu_alloc_resv_region(MSI_IOVA_BASE, > MSI_IOVA_LENGTH, prot, > + IOMMU_RESV_MSI); > + if (!region) > + return; > + > + list_add_tail(®ion->list, head); > +} > + > +static void viommu_put_resv_regions(struct device *dev, struct > +list_head *head) { > + struct iommu_resv_region *entry, *next; > + > + list_for_each_entry_safe(entry, next, head, list) > + kfree(entry); > +} > + > +static struct iommu_ops viommu_ops = { > + .capable = viommu_capable, > + .domain_alloc = viommu_domain_alloc, > + .domain_free = viommu_domain_free, > + .attach_dev = viommu_attach_dev, > + .map = viommu_map, > + .unmap = viommu_unmap, > + .map_sg = viommu_map_sg, > + .iova_to_phys = viommu_iova_to_phys, > + .add_device = viommu_add_device, > + .remove_device = viommu_remove_device, > + .device_group = viommu_device_group, > + .of_xlate = viommu_of_xlate, > + .get_resv_regions = viommu_get_resv_regions, > + .put_resv_regions = viommu_put_resv_regions, > +}; > + > +static int viommu_init_vq(struct viommu_dev *viommu) { > + struct virtio_device *vdev = dev_to_virtio(viommu->dev); > + vq_callback_t *callback = NULL; > + const char *name = "request"; > + int ret; > + > + ret = vdev->config->find_vqs(vdev, 1, &viommu->vq, &callback, > + &name, NULL); > + if (ret) > + dev_err(viommu->dev, "cannot find VQ\n"); > + > + return ret; > +} > + > +static int viommu_probe(struct virtio_device *vdev) { > + struct device *parent_dev = vdev->dev.parent; > + struct viommu_dev *viommu = NULL; > + struct device *dev = &vdev->dev; > + int ret; > + > + viommu = kzalloc(sizeof(*viommu), GFP_KERNEL); > + if (!viommu) > + return -ENOMEM; > + > + spin_lock_init(&viommu->vq_lock); > + INIT_LIST_HEAD(&viommu->pending_requests); > + viommu->dev = dev; > + viommu->vdev = vdev; > + > + ret = viommu_init_vq(viommu); > + if (ret) > + goto err_free_viommu; > + > + virtio_cread(vdev, struct virtio_iommu_config, page_sizes, > + &viommu->pgsize_bitmap); > + > + viommu->aperture_end = -1UL; > + > + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, > + struct virtio_iommu_config, input_range.start, > + &viommu->aperture_start); > + > + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, > + struct virtio_iommu_config, input_range.end, > + &viommu->aperture_end); > + > + if (!viommu->pgsize_bitmap) { > + ret = -EINVAL; > + goto err_free_viommu; > + } > + > + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; > + > + /* > + * Not strictly necessary, virtio would enable it later. This allows to > + * start using the request queue early. 
> + */ > + virtio_device_ready(vdev); > + > + ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s", > + virtio_bus_name(vdev)); > + if (ret) > + goto err_free_viommu; > + > + iommu_device_set_ops(&viommu->iommu, &viommu_ops); > + iommu_device_set_fwnode(&viommu->iommu, parent_dev- > >fwnode); > + > + iommu_device_register(&viommu->iommu); > + > +#ifdef CONFIG_PCI > + if (pci_bus_type.iommu_ops != &viommu_ops) { > + pci_request_acs(); > + ret = bus_set_iommu(&pci_bus_type, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > +#endif > +#ifdef CONFIG_ARM_AMBA > + if (amba_bustype.iommu_ops != &viommu_ops) { > + ret = bus_set_iommu(&amba_bustype, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > +#endif > + if (platform_bus_type.iommu_ops != &viommu_ops) { > + ret = bus_set_iommu(&platform_bus_type, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > + > + vdev->priv = viommu; > + > + dev_info(viommu->dev, "probe successful\n"); > + > + return 0; > + > +err_unregister: > + iommu_device_unregister(&viommu->iommu); > + > +err_free_viommu: > + kfree(viommu); > + > + return ret; > +} > + > +static void viommu_remove(struct virtio_device *vdev) { > + struct viommu_dev *viommu = vdev->priv; > + > + iommu_device_unregister(&viommu->iommu); > + kfree(viommu); > + > + dev_info(&vdev->dev, "device removed\n"); } > + > +static void viommu_config_changed(struct virtio_device *vdev) { > + dev_warn(&vdev->dev, "config changed\n"); } > + > +static unsigned int features[] = { > + VIRTIO_IOMMU_F_INPUT_RANGE, > +}; > + > +static struct virtio_device_id id_table[] = { > + { VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID }, > + { 0 }, > +}; > + > +static struct virtio_driver virtio_iommu_drv = { > + .driver.name = KBUILD_MODNAME, > + .driver.owner = THIS_MODULE, > + .id_table = id_table, > + .feature_table = features, > + .feature_table_size = ARRAY_SIZE(features), > + .probe = viommu_probe, > + .remove = viommu_remove, > + .config_changed = viommu_config_changed, > +}; > + > +module_virtio_driver(virtio_iommu_drv); > + > +IOMMU_OF_DECLARE(viommu, "virtio,mmio", NULL); > + > +MODULE_DESCRIPTION("virtio-iommu driver"); MODULE_AUTHOR("Jean- > Philippe > +Brucker <jean-philippe.brucker@xxxxxxx>"); > +MODULE_LICENSE("GPL v2"); > diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index > 1f25c86374ad..c0cb0f173258 100644 > --- a/include/uapi/linux/Kbuild > +++ b/include/uapi/linux/Kbuild > @@ -467,6 +467,7 @@ header-y += virtio_console.h header-y += > virtio_gpu.h header-y += virtio_ids.h header-y += virtio_input.h > +header-y += virtio_iommu.h > header-y += virtio_mmio.h > header-y += virtio_net.h > header-y += virtio_pci.h > diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h > index 6d5c3b2d4f4d..934ed3d3cd3f 100644 > --- a/include/uapi/linux/virtio_ids.h > +++ b/include/uapi/linux/virtio_ids.h > @@ -43,5 +43,6 @@ > #define VIRTIO_ID_INPUT 18 /* virtio input */ > #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ > #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ > +#define VIRTIO_ID_IOMMU 61216 /* virtio IOMMU (temporary) */ > > #endif /* _LINUX_VIRTIO_IDS_H */ > diff --git a/include/uapi/linux/virtio_iommu.h > b/include/uapi/linux/virtio_iommu.h > new file mode 100644 > index 000000000000..ec74c9a727d4 > --- /dev/null > +++ b/include/uapi/linux/virtio_iommu.h > @@ -0,0 +1,142 @@ > +/* > + * Copyright (C) 2017 ARM Ltd. 
> + * > + * This header is BSD licensed so anyone can use the definitions > + * to implement compatible drivers/servers: > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 3. Neither the name of ARM Ltd. nor the names of its contributors > + * may be used to endorse or promote products derived from this > software > + * without specific prior written permission. > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS > + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM > OR > + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS > OF > + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER > CAUSED AND > + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, > + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > OUT > + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > +#ifndef _UAPI_LINUX_VIRTIO_IOMMU_H > +#define _UAPI_LINUX_VIRTIO_IOMMU_H > + > +/* Feature bits */ > +#define VIRTIO_IOMMU_F_INPUT_RANGE 0 > +#define VIRTIO_IOMMU_F_IOASID_BITS 1 > +#define VIRTIO_IOMMU_F_MAP_UNMAP 2 > +#define VIRTIO_IOMMU_F_BYPASS 3 > + > +__packed > +struct virtio_iommu_config { > + /* Supported page sizes */ > + __u64 page_sizes; > + struct virtio_iommu_range { > + __u64 start; > + __u64 end; > + } input_range; > + __u8 ioasid_bits; > +}; > + > +/* Request types */ > +#define VIRTIO_IOMMU_T_ATTACH 0x01 > +#define VIRTIO_IOMMU_T_DETACH 0x02 > +#define VIRTIO_IOMMU_T_MAP 0x03 > +#define VIRTIO_IOMMU_T_UNMAP 0x04 > + > +/* Status types */ > +#define VIRTIO_IOMMU_S_OK 0x00 > +#define VIRTIO_IOMMU_S_IOERR 0x01 > +#define VIRTIO_IOMMU_S_UNSUPP 0x02 > +#define VIRTIO_IOMMU_S_DEVERR 0x03 > +#define VIRTIO_IOMMU_S_INVAL 0x04 > +#define VIRTIO_IOMMU_S_RANGE 0x05 > +#define VIRTIO_IOMMU_S_NOENT 0x06 > +#define VIRTIO_IOMMU_S_FAULT 0x07 > + > +__packed > +struct virtio_iommu_req_head { > + __u8 type; > + __u8 reserved[3]; > +}; > + > +__packed > +struct virtio_iommu_req_tail { > + __u8 status; > + __u8 reserved[3]; > +}; > + > +__packed > +struct virtio_iommu_req_attach { > + struct virtio_iommu_req_head head; > + > + __le32 address_space; > + __le32 device; > + __le32 reserved; > + > + struct virtio_iommu_req_tail tail; > +}; > + > +__packed > +struct virtio_iommu_req_detach { > + struct virtio_iommu_req_head head; > + > + __le32 device; > + __le32 reserved; > + > + struct virtio_iommu_req_tail tail; > +}; > + > +#define VIRTIO_IOMMU_MAP_F_READ (1 << 0) > +#define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) > +#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) > + > +#define VIRTIO_IOMMU_MAP_F_MASK > (VIRTIO_IOMMU_MAP_F_READ | \ > + > VIRTIO_IOMMU_MAP_F_WRITE | \ > + > VIRTIO_IOMMU_MAP_F_EXEC) > + > +__packed > +struct virtio_iommu_req_map { 
> + struct virtio_iommu_req_head head; > + > + __le32 address_space; > + __le32 flags; > + __le64 virt_addr; > + __le64 phys_addr; > + __le64 size; > + > + struct virtio_iommu_req_tail tail; > +}; > + > +__packed > +struct virtio_iommu_req_unmap { > + struct virtio_iommu_req_head head; > + > + __le32 address_space; > + __le32 flags; > + __le64 virt_addr; > + __le64 size; > + > + struct virtio_iommu_req_tail tail; > +}; > + > +union virtio_iommu_req { > + struct virtio_iommu_req_head head; > + > + struct virtio_iommu_req_attach attach; > + struct virtio_iommu_req_detach detach; > + struct virtio_iommu_req_map map; > + struct virtio_iommu_req_unmap unmap; > +}; > + > +#endif > -- > 2.12.1 > > > --------------------------------------------------------------------- > To unsubscribe, e-mail: virtio-dev-unsubscribe@xxxxxxxxxxxxxxxxxxxx > For additional commands, e-mail: virtio-dev-help@xxxxxxxxxxxxxxxxxxxx
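P.S. For anyone decoding the wire format above: every request is a packed
structure made of a 4-byte head written by the driver, a type-specific
payload, and a 4-byte tail where the device writes the status. Below is a
rough, untested sketch of how one MAP request could be built and queued,
using only the definitions from virtio_iommu.h plus the scatterlist and
virtqueue APIs the driver already relies on. The helper name, IOASID and
addresses are made-up example values, not part of the patch, and the wait
for the reply is elided.

/* Hypothetical helper, for illustration only -- not from the patch. */
static int viommu_example_map_one_page(struct virtqueue *vq)
{
	struct scatterlist sg[2];
	struct scatterlist *sgs[2] = { &sg[0], &sg[1] };
	struct virtio_iommu_req_map req = {
		.head.type     = VIRTIO_IOMMU_T_MAP,
		.address_space = cpu_to_le32(1),          /* example IOASID */
		.flags         = cpu_to_le32(VIRTIO_IOMMU_MAP_F_READ |
					     VIRTIO_IOMMU_MAP_F_WRITE),
		.virt_addr     = cpu_to_le64(0x10000000), /* example IOVA */
		.phys_addr     = cpu_to_le64(0x80000000), /* example PA */
		.size          = cpu_to_le64(0x1000),     /* one 4K page */
	};
	int ret;

	/* Device reads head + payload, then writes the 4-byte status tail. */
	sg_init_one(&sg[0], &req, sizeof(req) - sizeof(req.tail));
	sg_init_one(&sg[1], &req.tail, sizeof(req.tail));

	/* One out-sg and one in-sg, as in viommu_send_reqs_sync(). */
	ret = virtqueue_add_sgs(vq, sgs, 1, 1, &req, GFP_ATOMIC);
	if (ret)
		return ret;

	virtqueue_kick(vq);
	/* ... wait for the response as viommu_send_req_sync() does ... */

	return req.tail.status == VIRTIO_IOMMU_S_OK ? 0 : -EIO;
}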