Hi Jean-Philippe, On 10/12/18 6:35 PM, Michael S. Tsirkin wrote: > On Fri, Oct 12, 2018 at 03:59:15PM +0100, Jean-Philippe Brucker wrote: >> The virtio IOMMU is a para-virtualized device, allowing to send IOMMU >> requests such as map/unmap over virtio transport without emulating page >> tables. This implementation handles ATTACH, DETACH, MAP and UNMAP >> requests. >> >> The bulk of the code transforms calls coming from the IOMMU API into >> corresponding virtio requests. Mappings are kept in an interval tree >> instead of page tables. >> >> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> >> --- >> MAINTAINERS | 7 + >> drivers/iommu/Kconfig | 11 + >> drivers/iommu/Makefile | 1 + >> drivers/iommu/virtio-iommu.c | 938 ++++++++++++++++++++++++++++++ >> include/uapi/linux/virtio_ids.h | 1 + >> include/uapi/linux/virtio_iommu.h | 101 ++++ >> 6 files changed, 1059 insertions(+) >> create mode 100644 drivers/iommu/virtio-iommu.c >> create mode 100644 include/uapi/linux/virtio_iommu.h >> >> diff --git a/MAINTAINERS b/MAINTAINERS >> index 48a65c3a4189..f02fa65f47e2 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -15599,6 +15599,13 @@ S: Maintained >> F: drivers/virtio/virtio_input.c >> F: include/uapi/linux/virtio_input.h >> >> +VIRTIO IOMMU DRIVER >> +M: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> >> +L: virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx >> +S: Maintained >> +F: drivers/iommu/virtio-iommu.c >> +F: include/uapi/linux/virtio_iommu.h >> + >> VIRTUAL BOX GUEST DEVICE DRIVER >> M: Hans de Goede <hdegoede@xxxxxxxxxx> >> M: Arnd Bergmann <arnd@xxxxxxxx> >> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig >> index c60395b7470f..2dc016dc2b92 100644 >> --- a/drivers/iommu/Kconfig >> +++ b/drivers/iommu/Kconfig >> @@ -414,4 +414,15 @@ config QCOM_IOMMU >> help >> Support for IOMMU on certain Qualcomm SoCs. 
>> >> +config VIRTIO_IOMMU >> + bool "Virtio IOMMU driver" >> + depends on VIRTIO=y >> + select IOMMU_API >> + select INTERVAL_TREE >> + select ARM_DMA_USE_IOMMU if ARM >> + help >> + Para-virtualised IOMMU driver with virtio. >> + >> + Say Y here if you intend to run this kernel as a guest. >> + >> endif # IOMMU_SUPPORT >> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile >> index ab5eba6edf82..4cd643408e49 100644 >> --- a/drivers/iommu/Makefile >> +++ b/drivers/iommu/Makefile >> @@ -31,3 +31,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o >> obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o >> obj-$(CONFIG_S390_IOMMU) += s390-iommu.o >> obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o >> +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o >> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c >> new file mode 100644 >> index 000000000000..9fb38cd3b727 >> --- /dev/null >> +++ b/drivers/iommu/virtio-iommu.c >> @@ -0,0 +1,938 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* >> + * Virtio driver for the paravirtualized IOMMU >> + * >> + * Copyright (C) 2018 Arm Limited >> + */ >> + >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> +#include <linux/amba/bus.h> >> +#include <linux/delay.h> >> +#include <linux/dma-iommu.h> >> +#include <linux/freezer.h> >> +#include <linux/interval_tree.h> >> +#include <linux/iommu.h> >> +#include <linux/module.h> >> +#include <linux/of_iommu.h> >> +#include <linux/of_platform.h> >> +#include <linux/pci.h> >> +#include <linux/platform_device.h> >> +#include <linux/virtio.h> >> +#include <linux/virtio_config.h> >> +#include <linux/virtio_ids.h> >> +#include <linux/wait.h> >> + >> +#include <uapi/linux/virtio_iommu.h> >> + >> +#define MSI_IOVA_BASE 0x8000000 >> +#define MSI_IOVA_LENGTH 0x100000 >> + >> +#define VIOMMU_REQUEST_VQ 0 >> +#define VIOMMU_NR_VQS 1 >> + >> +/* >> + * During development, it is convenient to time out rather than wait >> + * indefinitely in atomic context when a device misbehaves and 
a request doesn't >> + * return. In production however, some requests shouldn't return until they are >> + * successful. >> + */ >> +#ifdef DEBUG >> +#define VIOMMU_REQUEST_TIMEOUT 10000 /* 10s */ >> +#endif >> + >> +struct viommu_dev { >> + struct iommu_device iommu; >> + struct device *dev; >> + struct virtio_device *vdev; >> + >> + struct ida domain_ids; >> + >> + struct virtqueue *vqs[VIOMMU_NR_VQS]; >> + spinlock_t request_lock; >> + struct list_head requests; >> + >> + /* Device configuration */ >> + struct iommu_domain_geometry geometry; >> + u64 pgsize_bitmap; >> + u8 domain_bits; >> +}; >> + >> +struct viommu_mapping { >> + phys_addr_t paddr; >> + struct interval_tree_node iova; >> + u32 flags; >> +}; >> + >> +struct viommu_domain { >> + struct iommu_domain domain; >> + struct viommu_dev *viommu; >> + struct mutex mutex; >> + unsigned int id; >> + >> + spinlock_t mappings_lock; >> + struct rb_root_cached mappings; >> + >> + unsigned long nr_endpoints; >> +}; >> + >> +struct viommu_endpoint { >> + struct viommu_dev *viommu; >> + struct viommu_domain *vdomain; >> +}; >> + >> +struct viommu_request { >> + struct list_head list; >> + void *writeback; >> + unsigned int write_offset; >> + unsigned int len; >> + char buf[]; >> +}; >> + >> +#define to_viommu_domain(domain) \ >> + container_of(domain, struct viommu_domain, domain) >> + >> +static int viommu_get_req_errno(void *buf, size_t len) >> +{ >> + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); >> + >> + switch (tail->status) { >> + case VIRTIO_IOMMU_S_OK: >> + return 0; >> + case VIRTIO_IOMMU_S_UNSUPP: >> + return -ENOSYS; >> + case VIRTIO_IOMMU_S_INVAL: >> + return -EINVAL; >> + case VIRTIO_IOMMU_S_RANGE: >> + return -ERANGE; >> + case VIRTIO_IOMMU_S_NOENT: >> + return -ENOENT; >> + case VIRTIO_IOMMU_S_FAULT: >> + return -EFAULT; >> + case VIRTIO_IOMMU_S_IOERR: >> + case VIRTIO_IOMMU_S_DEVERR: >> + default: >> + return -EIO; >> + } >> +} >> + >> +static void viommu_set_req_status(void *buf, 
size_t len, int status) >> +{ >> + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); >> + >> + tail->status = status; >> +} >> + >> +static off_t viommu_get_req_offset(struct viommu_dev *viommu, >> + struct virtio_iommu_req_head *req, >> + size_t len) >> +{ >> + size_t tail_size = sizeof(struct virtio_iommu_req_tail); >> + >> + return len - tail_size; >> +} >> + >> +/* >> + * __viommu_sync_req - Complete all in-flight requests >> + * >> + * Wait for all added requests to complete. When this function returns, all >> + * requests that were in-flight at the time of the call have completed. >> + */ >> +static int __viommu_sync_req(struct viommu_dev *viommu) >> +{ >> + int ret = 0; >> + unsigned int len; >> + size_t write_len; >> + ktime_t timeout = 0; >> + struct viommu_request *req; >> + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; >> + >> + assert_spin_locked(&viommu->request_lock); >> +#ifdef DEBUG >> + timeout = ktime_add_ms(ktime_get(), VIOMMU_REQUEST_TIMEOUT); >> +#endif >> + virtqueue_kick(vq); >> + >> + while (!list_empty(&viommu->requests)) { >> + len = 0; >> + req = virtqueue_get_buf(vq, &len); >> + if (req == NULL) { >> + if (!timeout || ktime_before(ktime_get(), timeout)) >> + continue; >> + >> + /* After timeout, remove all requests */ >> + req = list_first_entry(&viommu->requests, >> + struct viommu_request, list); >> + ret = -ETIMEDOUT; >> + } >> + >> + if (!len) >> + viommu_set_req_status(req->buf, req->len, >> + VIRTIO_IOMMU_S_IOERR); >> + >> + write_len = req->len - req->write_offset; >> + if (req->writeback && len >= write_len) >> + memcpy(req->writeback, req->buf + req->write_offset, >> + write_len); >> + >> + list_del(&req->list); >> + kfree(req); > > So with DEBUG set, this will actually free memory that device still > DMA's into. Hardly pretty. I think you want to mark device broken, > queue the request and then wait for device to be reset. 
> > >> + } >> + >> + return ret; >> +} >> + >> +static int viommu_sync_req(struct viommu_dev *viommu) >> +{ >> + int ret; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&viommu->request_lock, flags); >> + ret = __viommu_sync_req(viommu); >> + if (ret) >> + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); >> + spin_unlock_irqrestore(&viommu->request_lock, flags); >> + >> + return ret; >> +} >> + >> +/* >> + * __viommu_add_request - Add one request to the queue >> + * @buf: pointer to the request buffer >> + * @len: length of the request buffer >> + * @writeback: copy data back to the buffer when the request completes. >> + * >> + * Add a request to the queue. Only synchronize the queue if it's already full. >> + * Otherwise don't kick the queue nor wait for requests to complete. >> + * >> + * When @writeback is true, data written by the device, including the request >> + * status, is copied into @buf after the request completes. This is unsafe if >> + * the caller allocates @buf on stack and drops the lock between add_req() and >> + * sync_req(). >> + * >> + * Return 0 if the request was successfully added to the queue. 
>> + */ >> +static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len, >> + bool writeback) >> +{ >> + int ret; >> + off_t write_offset; >> + struct viommu_request *req; >> + struct scatterlist top_sg, bottom_sg; >> + struct scatterlist *sg[2] = { &top_sg, &bottom_sg }; >> + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; >> + >> + assert_spin_locked(&viommu->request_lock); >> + >> + write_offset = viommu_get_req_offset(viommu, buf, len); >> + if (!write_offset) >> + return -EINVAL; >> + >> + req = kzalloc(sizeof(*req) + len, GFP_ATOMIC); >> + if (!req) >> + return -ENOMEM; >> + >> + req->len = len; >> + if (writeback) { >> + req->writeback = buf + write_offset; >> + req->write_offset = write_offset; >> + } >> + memcpy(&req->buf, buf, write_offset); >> + >> + sg_init_one(&top_sg, req->buf, write_offset); >> + sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset); >> + >> + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); >> + if (ret == -ENOSPC) { >> + /* If the queue is full, sync and retry */ >> + if (!__viommu_sync_req(viommu)) >> + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); >> + } >> + if (ret) >> + goto err_free; >> + >> + list_add_tail(&req->list, &viommu->requests); >> + return 0; >> + >> +err_free: >> + kfree(req); >> + return ret; >> +} >> + >> +static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len) >> +{ >> + int ret; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&viommu->request_lock, flags); >> + ret = __viommu_add_req(viommu, buf, len, false); >> + if (ret) >> + dev_dbg(viommu->dev, "could not add request: %d\n", ret); >> + spin_unlock_irqrestore(&viommu->request_lock, flags); >> + >> + return ret; >> +} >> + >> +/* >> + * Send a request and wait for it to complete. 
Return the request status (as an >> + * errno) >> + */ >> +static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf, >> + size_t len) >> +{ >> + int ret; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&viommu->request_lock, flags); >> + >> + ret = __viommu_add_req(viommu, buf, len, true); >> + if (ret) { >> + dev_dbg(viommu->dev, "could not add request (%d)\n", ret); >> + goto out_unlock; >> + } >> + >> + ret = __viommu_sync_req(viommu); >> + if (ret) { >> + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); >> + /* Fall-through (get the actual request status) */ >> + } >> + >> + ret = viommu_get_req_errno(buf, len); >> +out_unlock: >> + spin_unlock_irqrestore(&viommu->request_lock, flags); >> + return ret; >> +} >> + >> +/* >> + * viommu_add_mapping - add a mapping to the internal tree >> + * >> + * On success, return the new mapping. Otherwise return NULL. >> + */ >> +static struct viommu_mapping * >> +viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova, >> + phys_addr_t paddr, size_t size, u32 flags) >> +{ >> + unsigned long irqflags; >> + struct viommu_mapping *mapping; >> + >> + mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC); >> + if (!mapping) >> + return NULL; >> + >> + mapping->paddr = paddr; >> + mapping->iova.start = iova; >> + mapping->iova.last = iova + size - 1; >> + mapping->flags = flags; >> + >> + spin_lock_irqsave(&vdomain->mappings_lock, irqflags); >> + interval_tree_insert(&mapping->iova, &vdomain->mappings); >> + spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags); >> + >> + return mapping; >> +} >> + >> +/* >> + * viommu_del_mappings - remove mappings from the internal tree >> + * >> + * @vdomain: the domain >> + * @iova: start of the range >> + * @size: size of the range. A size of 0 corresponds to the entire address >> + * space. 
>> + * >> + * On success, returns the number of unmapped bytes (>= size) >> + */ >> +static size_t viommu_del_mappings(struct viommu_domain *vdomain, >> + unsigned long iova, size_t size) >> +{ >> + size_t unmapped = 0; >> + unsigned long flags; >> + unsigned long last = iova + size - 1; >> + struct viommu_mapping *mapping = NULL; >> + struct interval_tree_node *node, *next; >> + >> + spin_lock_irqsave(&vdomain->mappings_lock, flags); >> + next = interval_tree_iter_first(&vdomain->mappings, iova, last); >> + while (next) { >> + node = next; >> + mapping = container_of(node, struct viommu_mapping, iova); >> + next = interval_tree_iter_next(node, iova, last); >> + >> + /* Trying to split a mapping? */ >> + if (mapping->iova.start < iova) >> + break; >> + >> + /* >> + * Note that for a partial range, this will return the full >> + * mapping so we avoid sending split requests to the device. >> + */ >> + unmapped += mapping->iova.last - mapping->iova.start + 1; >> + >> + interval_tree_remove(node, &vdomain->mappings); >> + kfree(mapping); >> + } >> + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); >> + >> + return unmapped; >> +} >> + >> +/* >> + * viommu_replay_mappings - re-send MAP requests >> + * >> + * When reattaching a domain that was previously detached from all endpoints, >> + * mappings were deleted from the device. Re-create the mappings available in >> + * the internal tree. >> + */ >> +static int viommu_replay_mappings(struct viommu_domain *vdomain) >> +{ >> + int ret; ret needs to be initialized here. Otherwise this can lead to a crash in viommu_add_device. 
Thanks Eric >> + unsigned long flags; >> + struct viommu_mapping *mapping; >> + struct interval_tree_node *node; >> + struct virtio_iommu_req_map map; >> + >> + spin_lock_irqsave(&vdomain->mappings_lock, flags); >> + node = interval_tree_iter_first(&vdomain->mappings, 0, -1UL); >> + while (node) { >> + mapping = container_of(node, struct viommu_mapping, iova); >> + map = (struct virtio_iommu_req_map) { >> + .head.type = VIRTIO_IOMMU_T_MAP, >> + .domain = cpu_to_le32(vdomain->id), >> + .virt_start = cpu_to_le64(mapping->iova.start), >> + .virt_end = cpu_to_le64(mapping->iova.last), >> + .phys_start = cpu_to_le64(mapping->paddr), >> + .flags = cpu_to_le32(mapping->flags), >> + }; >> + >> + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); >> + if (ret) >> + break; >> + >> + node = interval_tree_iter_next(node, 0, -1UL); >> + } >> + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); >> + >> + return ret; >> +} >> + >> +/* IOMMU API */ >> + >> +static struct iommu_domain *viommu_domain_alloc(unsigned type) >> +{ >> + struct viommu_domain *vdomain; >> + >> + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) >> + return NULL; >> + >> + vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); >> + if (!vdomain) >> + return NULL; >> + >> + mutex_init(&vdomain->mutex); >> + spin_lock_init(&vdomain->mappings_lock); >> + vdomain->mappings = RB_ROOT_CACHED; >> + >> + if (type == IOMMU_DOMAIN_DMA && >> + iommu_get_dma_cookie(&vdomain->domain)) { >> + kfree(vdomain); >> + return NULL; >> + } >> + >> + return &vdomain->domain; >> +} >> + >> +static int viommu_domain_finalise(struct viommu_dev *viommu, >> + struct iommu_domain *domain) >> +{ >> + int ret; >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + unsigned int max_domain = viommu->domain_bits > 31 ? 
~0 : >> + (1U << viommu->domain_bits) - 1; >> + >> + vdomain->viommu = viommu; >> + >> + domain->pgsize_bitmap = viommu->pgsize_bitmap; >> + domain->geometry = viommu->geometry; >> + >> + ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL); >> + if (ret >= 0) >> + vdomain->id = (unsigned int)ret; >> + >> + return ret > 0 ? 0 : ret; >> +} >> + >> +static void viommu_domain_free(struct iommu_domain *domain) >> +{ >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + iommu_put_dma_cookie(domain); >> + >> + /* Free all remaining mappings (size 2^64) */ >> + viommu_del_mappings(vdomain, 0, 0); >> + >> + if (vdomain->viommu) >> + ida_free(&vdomain->viommu->domain_ids, vdomain->id); >> + >> + kfree(vdomain); >> +} >> + >> +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) >> +{ >> + int i; >> + int ret = 0; >> + struct virtio_iommu_req_attach req; >> + struct iommu_fwspec *fwspec = dev->iommu_fwspec; >> + struct viommu_endpoint *vdev = fwspec->iommu_priv; >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + mutex_lock(&vdomain->mutex); >> + if (!vdomain->viommu) { >> + /* >> + * Initialize the domain proper now that we know which viommu >> + * owns it. >> + */ >> + ret = viommu_domain_finalise(vdev->viommu, domain); >> + } else if (vdomain->viommu != vdev->viommu) { >> + dev_err(dev, "cannot attach to foreign vIOMMU\n"); >> + ret = -EXDEV; >> + } >> + mutex_unlock(&vdomain->mutex); >> + >> + if (ret) >> + return ret; >> + >> + /* >> + * In the virtio-iommu device, when attaching the endpoint to a new >> + * domain, it is detached from the old one and, if as a result the >> + * old domain isn't attached to any endpoint, all mappings are removed >> + * from the old domain and it is freed. >> + * >> + * In the driver the old domain still exists, and its mappings will be >> + * recreated if it gets reattached to an endpoint. Otherwise it will be >> + * freed explicitly. 
>> + * >> + * vdev->vdomain is protected by group->mutex >> + */ >> + if (vdev->vdomain) >> + vdev->vdomain->nr_endpoints--; >> + >> + req = (struct virtio_iommu_req_attach) { >> + .head.type = VIRTIO_IOMMU_T_ATTACH, >> + .domain = cpu_to_le32(vdomain->id), >> + }; >> + >> + for (i = 0; i < fwspec->num_ids; i++) { >> + req.endpoint = cpu_to_le32(fwspec->ids[i]); >> + >> + ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)); >> + if (ret) >> + return ret; >> + } >> + >> + if (!vdomain->nr_endpoints) { >> + /* >> + * This endpoint is the first to be attached to the domain. >> + * Replay existing mappings (e.g. SW MSI). >> + */ >> + ret = viommu_replay_mappings(vdomain); >> + if (ret) >> + return ret; >> + } >> + >> + vdomain->nr_endpoints++; >> + vdev->vdomain = vdomain; >> + >> + return 0; >> +} >> + >> +static int viommu_map(struct iommu_domain *domain, unsigned long iova, >> + phys_addr_t paddr, size_t size, int prot) >> +{ >> + int ret; >> + int flags; >> + struct viommu_mapping *mapping; >> + struct virtio_iommu_req_map map; >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + flags = (prot & IOMMU_READ ? VIRTIO_IOMMU_MAP_F_READ : 0) | >> + (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) | >> + (prot & IOMMU_MMIO ? 
VIRTIO_IOMMU_MAP_F_MMIO : 0); >> + >> + mapping = viommu_add_mapping(vdomain, iova, paddr, size, flags); >> + if (!mapping) >> + return -ENOMEM; >> + >> + map = (struct virtio_iommu_req_map) { >> + .head.type = VIRTIO_IOMMU_T_MAP, >> + .domain = cpu_to_le32(vdomain->id), >> + .virt_start = cpu_to_le64(iova), >> + .phys_start = cpu_to_le64(paddr), >> + .virt_end = cpu_to_le64(iova + size - 1), >> + .flags = cpu_to_le32(flags), >> + }; >> + >> + if (!vdomain->nr_endpoints) >> + return 0; >> + >> + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); >> + if (ret) >> + viommu_del_mappings(vdomain, iova, size); >> + >> + return ret; >> +} >> + >> +static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, >> + size_t size) >> +{ >> + int ret = 0; >> + size_t unmapped; >> + struct virtio_iommu_req_unmap unmap; >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + unmapped = viommu_del_mappings(vdomain, iova, size); >> + if (unmapped < size) >> + return 0; >> + >> + /* Device already removed all mappings after detach. */ >> + if (!vdomain->nr_endpoints) >> + return unmapped; >> + >> + unmap = (struct virtio_iommu_req_unmap) { >> + .head.type = VIRTIO_IOMMU_T_UNMAP, >> + .domain = cpu_to_le32(vdomain->id), >> + .virt_start = cpu_to_le64(iova), >> + .virt_end = cpu_to_le64(iova + unmapped - 1), >> + }; >> + >> + ret = viommu_add_req(vdomain->viommu, &unmap, sizeof(unmap)); >> + return ret ? 
0 : unmapped; >> +} >> + >> +static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, >> + dma_addr_t iova) >> +{ >> + u64 paddr = 0; >> + unsigned long flags; >> + struct viommu_mapping *mapping; >> + struct interval_tree_node *node; >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + spin_lock_irqsave(&vdomain->mappings_lock, flags); >> + node = interval_tree_iter_first(&vdomain->mappings, iova, iova); >> + if (node) { >> + mapping = container_of(node, struct viommu_mapping, iova); >> + paddr = mapping->paddr + (iova - mapping->iova.start); >> + } >> + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); >> + >> + return paddr; >> +} >> + >> +static void viommu_iotlb_sync(struct iommu_domain *domain) >> +{ >> + struct viommu_domain *vdomain = to_viommu_domain(domain); >> + >> + viommu_sync_req(vdomain->viommu); >> +} >> + >> +static void viommu_get_resv_regions(struct device *dev, struct list_head *head) >> +{ >> + struct iommu_resv_region *region; >> + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; >> + >> + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot, >> + IOMMU_RESV_SW_MSI); >> + if (!region) >> + return; >> + >> + list_add_tail(&region->list, head); >> + iommu_dma_get_resv_regions(dev, head); >> +} >> + >> +static void viommu_put_resv_regions(struct device *dev, struct list_head *head) >> +{ >> + struct iommu_resv_region *entry, *next; >> + >> + list_for_each_entry_safe(entry, next, head, list) >> + kfree(entry); >> +} >> + >> +static struct iommu_ops viommu_ops; >> +static struct virtio_driver virtio_iommu_drv; >> + >> +static int viommu_match_node(struct device *dev, void *data) >> +{ >> + return dev->parent->fwnode == data; >> +} >> + >> +static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode) >> +{ >> + struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL, >> + fwnode, viommu_match_node); >> + put_device(dev); >> + >> + return dev ? 
dev_to_virtio(dev)->priv : NULL; >> +} >> + >> +static int viommu_add_device(struct device *dev) >> +{ >> + int ret; >> + struct iommu_group *group; >> + struct viommu_endpoint *vdev; >> + struct viommu_dev *viommu = NULL; >> + struct iommu_fwspec *fwspec = dev->iommu_fwspec; >> + >> + if (!fwspec || fwspec->ops != &viommu_ops) >> + return -ENODEV; >> + >> + viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); >> + if (!viommu) >> + return -ENODEV; >> + >> + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); >> + if (!vdev) >> + return -ENOMEM; >> + >> + vdev->viommu = viommu; >> + fwspec->iommu_priv = vdev; >> + >> + ret = iommu_device_link(&viommu->iommu, dev); >> + if (ret) >> + goto err_free_dev; >> + >> + /* >> + * Last step creates a default domain and attaches to it. Everything >> + * must be ready. >> + */ >> + group = iommu_group_get_for_dev(dev); >> + if (IS_ERR(group)) { >> + ret = PTR_ERR(group); >> + goto err_unlink_dev; >> + } >> + >> + iommu_group_put(group); >> + >> + return PTR_ERR_OR_ZERO(group); >> + >> +err_unlink_dev: >> + iommu_device_unlink(&viommu->iommu, dev); >> + >> +err_free_dev: >> + kfree(vdev); >> + >> + return ret; >> +} >> + >> +static void viommu_remove_device(struct device *dev) >> +{ >> + struct viommu_endpoint *vdev; >> + struct iommu_fwspec *fwspec = dev->iommu_fwspec; >> + >> + if (!fwspec || fwspec->ops != &viommu_ops) >> + return; >> + >> + vdev = fwspec->iommu_priv; >> + >> + iommu_group_remove_device(dev); >> + iommu_device_unlink(&vdev->viommu->iommu, dev); >> + kfree(vdev); >> +} >> + >> +static struct iommu_group *viommu_device_group(struct device *dev) >> +{ >> + if (dev_is_pci(dev)) >> + return pci_device_group(dev); >> + else >> + return generic_device_group(dev); >> +} >> + >> +static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) >> +{ >> + return iommu_fwspec_add_ids(dev, args->args, 1); >> +} >> + >> +static struct iommu_ops viommu_ops = { >> + .domain_alloc = viommu_domain_alloc, >> + .domain_free 
= viommu_domain_free, >> + .attach_dev = viommu_attach_dev, >> + .map = viommu_map, >> + .unmap = viommu_unmap, >> + .iova_to_phys = viommu_iova_to_phys, >> + .iotlb_sync = viommu_iotlb_sync, >> + .add_device = viommu_add_device, >> + .remove_device = viommu_remove_device, >> + .device_group = viommu_device_group, >> + .get_resv_regions = viommu_get_resv_regions, >> + .put_resv_regions = viommu_put_resv_regions, >> + .of_xlate = viommu_of_xlate, >> +}; >> + >> +static int viommu_init_vqs(struct viommu_dev *viommu) >> +{ >> + struct virtio_device *vdev = dev_to_virtio(viommu->dev); >> + const char *name = "request"; >> + void *ret; >> + >> + ret = virtio_find_single_vq(vdev, NULL, name); >> + if (IS_ERR(ret)) { >> + dev_err(viommu->dev, "cannot find VQ\n"); >> + return PTR_ERR(ret); >> + } >> + >> + viommu->vqs[VIOMMU_REQUEST_VQ] = ret; >> + >> + return 0; >> +} >> + >> +static int viommu_probe(struct virtio_device *vdev) >> +{ >> + struct device *parent_dev = vdev->dev.parent; >> + struct viommu_dev *viommu = NULL; >> + struct device *dev = &vdev->dev; >> + u64 input_start = 0; >> + u64 input_end = -1UL; >> + int ret; >> + >> + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) >> + return -ENODEV; > > I'm a bit confused about what will happen if this device > happens to be behind an iommu itself. > > If we can't handle that, should we clear PLATFORM_IOMMU > e.g. like the balloon does? 
> > >> + >> + viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL); >> + if (!viommu) >> + return -ENOMEM; >> + >> + spin_lock_init(&viommu->request_lock); >> + ida_init(&viommu->domain_ids); >> + viommu->dev = dev; >> + viommu->vdev = vdev; >> + INIT_LIST_HEAD(&viommu->requests); >> + >> + ret = viommu_init_vqs(viommu); >> + if (ret) >> + return ret; >> + >> + virtio_cread(vdev, struct virtio_iommu_config, page_size_mask, >> + &viommu->pgsize_bitmap); >> + >> + if (!viommu->pgsize_bitmap) { >> + ret = -EINVAL; >> + goto err_free_vqs; >> + } >> + >> + viommu->domain_bits = 32; >> + >> + /* Optional features */ >> + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, >> + struct virtio_iommu_config, input_range.start, >> + &input_start); >> + >> + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, >> + struct virtio_iommu_config, input_range.end, >> + &input_end); >> + >> + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS, >> + struct virtio_iommu_config, domain_bits, >> + &viommu->domain_bits); >> + >> + viommu->geometry = (struct iommu_domain_geometry) { >> + .aperture_start = input_start, >> + .aperture_end = input_end, >> + .force_aperture = true, >> + }; >> + >> + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; >> + >> + virtio_device_ready(vdev); >> + >> + ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s", >> + virtio_bus_name(vdev)); >> + if (ret) >> + goto err_free_vqs; >> + >> + iommu_device_set_ops(&viommu->iommu, &viommu_ops); >> + iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode); >> + >> + iommu_device_register(&viommu->iommu); >> + >> +#ifdef CONFIG_PCI >> + if (pci_bus_type.iommu_ops != &viommu_ops) { >> + pci_request_acs(); >> + ret = bus_set_iommu(&pci_bus_type, &viommu_ops); >> + if (ret) >> + goto err_unregister; >> + } >> +#endif >> +#ifdef CONFIG_ARM_AMBA >> + if (amba_bustype.iommu_ops != &viommu_ops) { >> + ret = bus_set_iommu(&amba_bustype, &viommu_ops); >> + if (ret) >> + goto err_unregister; >> + } >> 
+#endif >> + if (platform_bus_type.iommu_ops != &viommu_ops) { >> + ret = bus_set_iommu(&platform_bus_type, &viommu_ops); >> + if (ret) >> + goto err_unregister; >> + } >> + >> + vdev->priv = viommu; >> + >> + dev_info(dev, "input address: %u bits\n", >> + order_base_2(viommu->geometry.aperture_end)); >> + dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap); >> + >> + return 0; >> + >> +err_unregister: >> + iommu_device_sysfs_remove(&viommu->iommu); >> + iommu_device_unregister(&viommu->iommu); >> +err_free_vqs: >> + vdev->config->del_vqs(vdev); >> + >> + return ret; >> +} >> + >> +static void viommu_remove(struct virtio_device *vdev) >> +{ >> + struct viommu_dev *viommu = vdev->priv; >> + >> + iommu_device_sysfs_remove(&viommu->iommu); >> + iommu_device_unregister(&viommu->iommu); >> + >> + /* Stop all virtqueues */ >> + vdev->config->reset(vdev); >> + vdev->config->del_vqs(vdev); >> + >> + dev_info(&vdev->dev, "device removed\n"); >> +} >> + >> +static void viommu_config_changed(struct virtio_device *vdev) >> +{ >> + dev_warn(&vdev->dev, "config changed\n"); >> +} >> + >> +static unsigned int features[] = { >> + VIRTIO_IOMMU_F_MAP_UNMAP, >> + VIRTIO_IOMMU_F_DOMAIN_BITS, >> + VIRTIO_IOMMU_F_INPUT_RANGE, >> +}; >> + >> +static struct virtio_device_id id_table[] = { >> + { VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID }, >> + { 0 }, >> +}; >> + >> +static struct virtio_driver virtio_iommu_drv = { >> + .driver.name = KBUILD_MODNAME, >> + .driver.owner = THIS_MODULE, >> + .id_table = id_table, >> + .feature_table = features, >> + .feature_table_size = ARRAY_SIZE(features), >> + .probe = viommu_probe, >> + .remove = viommu_remove, >> + .config_changed = viommu_config_changed, >> +}; >> + >> +module_virtio_driver(virtio_iommu_drv); >> + >> +MODULE_DESCRIPTION("Virtio IOMMU driver"); >> +MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx>"); >> +MODULE_LICENSE("GPL v2"); >> diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h >> 
index 6d5c3b2d4f4d..cfe47c5d9a56 100644 >> --- a/include/uapi/linux/virtio_ids.h >> +++ b/include/uapi/linux/virtio_ids.h >> @@ -43,5 +43,6 @@ >> #define VIRTIO_ID_INPUT 18 /* virtio input */ >> #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ >> #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ >> +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ >> >> #endif /* _LINUX_VIRTIO_IDS_H */ >> diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h >> new file mode 100644 >> index 000000000000..e808fc7fbe82 >> --- /dev/null >> +++ b/include/uapi/linux/virtio_iommu.h >> @@ -0,0 +1,101 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause */ >> +/* >> + * Virtio-iommu definition v0.8 >> + * >> + * Copyright (C) 2018 Arm Ltd. >> + */ >> +#ifndef _UAPI_LINUX_VIRTIO_IOMMU_H >> +#define _UAPI_LINUX_VIRTIO_IOMMU_H >> + >> +#include <linux/types.h> >> + >> +/* Feature bits */ >> +#define VIRTIO_IOMMU_F_INPUT_RANGE 0 >> +#define VIRTIO_IOMMU_F_DOMAIN_BITS 1 >> +#define VIRTIO_IOMMU_F_MAP_UNMAP 2 >> +#define VIRTIO_IOMMU_F_BYPASS 3 >> + >> +struct virtio_iommu_config { >> + /* Supported page sizes */ >> + __u64 page_size_mask; >> + /* Supported IOVA range */ >> + struct virtio_iommu_range { > > I'd rather we moved the definition outside even though gcc allows it - > some old userspace compilers might not. > >> + __u64 start; >> + __u64 end; >> + } input_range; >> + /* Max domain ID size */ >> + __u8 domain_bits; > > Let's add explicit padding here as well? 
> >> +}; >> + >> +/* Request types */ >> +#define VIRTIO_IOMMU_T_ATTACH 0x01 >> +#define VIRTIO_IOMMU_T_DETACH 0x02 >> +#define VIRTIO_IOMMU_T_MAP 0x03 >> +#define VIRTIO_IOMMU_T_UNMAP 0x04 >> + >> +/* Status types */ >> +#define VIRTIO_IOMMU_S_OK 0x00 >> +#define VIRTIO_IOMMU_S_IOERR 0x01 >> +#define VIRTIO_IOMMU_S_UNSUPP 0x02 >> +#define VIRTIO_IOMMU_S_DEVERR 0x03 >> +#define VIRTIO_IOMMU_S_INVAL 0x04 >> +#define VIRTIO_IOMMU_S_RANGE 0x05 >> +#define VIRTIO_IOMMU_S_NOENT 0x06 >> +#define VIRTIO_IOMMU_S_FAULT 0x07 >> + >> +struct virtio_iommu_req_head { >> + __u8 type; >> + __u8 reserved[3]; >> +}; >> + >> +struct virtio_iommu_req_tail { >> + __u8 status; >> + __u8 reserved[3]; >> +}; >> + >> +struct virtio_iommu_req_attach { >> + struct virtio_iommu_req_head head; >> + __le32 domain; >> + __le32 endpoint; >> + __u8 reserved[8]; >> + struct virtio_iommu_req_tail tail; >> +}; >> + >> +struct virtio_iommu_req_detach { >> + struct virtio_iommu_req_head head; >> + __le32 domain; >> + __le32 endpoint; >> + __u8 reserved[8]; >> + struct virtio_iommu_req_tail tail; >> +}; >> + >> +#define VIRTIO_IOMMU_MAP_F_READ (1 << 0) >> +#define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) >> +#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) >> +#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 3) >> + >> +#define VIRTIO_IOMMU_MAP_F_MASK (VIRTIO_IOMMU_MAP_F_READ | \ >> + VIRTIO_IOMMU_MAP_F_WRITE | \ >> + VIRTIO_IOMMU_MAP_F_EXEC | \ >> + VIRTIO_IOMMU_MAP_F_MMIO) >> + >> +struct virtio_iommu_req_map { >> + struct virtio_iommu_req_head head; >> + __le32 domain; >> + __le64 virt_start; >> + __le64 virt_end; >> + __le64 phys_start; >> + __le32 flags; >> + struct virtio_iommu_req_tail tail; >> +}; >> + >> +struct virtio_iommu_req_unmap { >> + struct virtio_iommu_req_head head; >> + __le32 domain; >> + __le64 virt_start; >> + __le64 virt_end; >> + __u8 reserved[4]; >> + struct virtio_iommu_req_tail tail; >> +}; >> + >> +#endif >> -- >> 2.19.1