For systems that require MSI pages to be mapped into the IOMMU translation, the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default recommended IOVA window in which to place these mappings. However, there is nothing special about this address. To support the RMR trick in the VMM for nested translation, the VMM needs to know which sw_msi window the kernel is using. As there is no particular reason to force the VMM to adopt the kernel default, provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use to directly specify its desired sw_msi window. This window replaces and disables the default IOMMU_RESV_SW_MSI from the driver, which avoids having to build an API to discover the default IOMMU_RESV_SW_MSI. Since iommufd now has its own sw_msi function, this is easy to implement. Keep these two options per iommufd_device, so each device can set its own desired MSI window. The VMM must set the values before attaching the device to any HWPT/IOAS for them to take effect. Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx> Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx> --- drivers/iommu/iommufd/iommufd_private.h | 2 + include/uapi/linux/iommufd.h | 20 ++++- drivers/iommu/iommufd/io_pagetable.c | 15 +++- drivers/iommu/iommufd/ioas.c | 97 +++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 4 + 5 files changed, 134 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 7a9cc6e61152..2d1aae7c8610 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -279,6 +279,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_sw_msi(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); @@ -423,6 +424,7 @@ struct iommufd_device 
{ struct mutex iopf_lock; unsigned int iopf_enabled; phys_addr_t sw_msi_start; + size_t sw_msi_size; }; static inline struct iommufd_device * diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 78747b24bd0f..310256bc3dbf 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -294,7 +294,9 @@ struct iommu_ioas_unmap { /** * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and - * ioctl(IOMMU_OPTION_HUGE_PAGES) + * ioctl(IOMMU_OPTION_HUGE_PAGES) and + * ioctl(IOMMU_OPTION_SW_MSI_START) and + * ioctl(IOMMU_OPTION_SW_MSI_SIZE) * @IOMMU_OPTION_RLIMIT_MODE: * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege * to invoke this. Value 0 (default) is user based accounting, 1 uses process @@ -304,10 +306,26 @@ struct iommu_ioas_unmap { * iommu mappings. Value 0 disables combining, everything is mapped to * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS * option, the object_id must be the IOAS ID. + * @IOMMU_OPTION_SW_MSI_START: + * Change the base address of the IOMMU mapping region for MSI doorbell(s). + * This option being unset or @IOMMU_OPTION_SW_MSI_SIZE being value 0 tells + * the kernel to pick its default MSI doorbell window, ignoring these two + * options. To set this option, userspace must do so before attaching a device + * to an IOAS/HWPT. Otherwise, the kernel will return an error (-EBUSY). The address + * must be 1MB aligned. This option is per-device, the object_id must be the + * device ID. + * @IOMMU_OPTION_SW_MSI_SIZE: + * Change the size (in MB) of the IOMMU mapping region for MSI doorbell(s). + * The minimum value is 1 MB. A value 0 (default) tells the kernel to ignore + * the base address value set via @IOMMU_OPTION_SW_MSI_START, and to pick its + * default MSI doorbell window. The same requirements apply to this option + * too, so check @IOMMU_OPTION_SW_MSI_START for details. 
*/ enum iommufd_option { IOMMU_OPTION_RLIMIT_MODE = 0, IOMMU_OPTION_HUGE_PAGES = 1, + IOMMU_OPTION_SW_MSI_START = 2, + IOMMU_OPTION_SW_MSI_SIZE = 3, }; /** diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 441da0314a54..6e6dcc480922 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1441,18 +1441,27 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, iommu_get_resv_regions(dev, &resv_regions); list_for_each_entry(resv, &resv_regions, list) { + unsigned long start = PHYS_ADDR_MAX, last = 0; + if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; if (sw_msi_start && resv->type == IOMMU_RESV_MSI) num_hw_msi++; if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { - *sw_msi_start = resv->start; + if (idev->sw_msi_size) { + start = *sw_msi_start; + last = idev->sw_msi_size - 1 + start; + } num_sw_msi++; } - rc = iopt_reserve_iova(iopt, resv->start, - resv->length - 1 + resv->start, dev); + if (start == PHYS_ADDR_MAX) { + start = resv->start; + last = resv->length - 1 + start; + } + + rc = iopt_reserve_iova(iopt, start, last, dev); if (rc) goto out_reserved; } diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 1542c5fd10a8..1fc93bc70bf4 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -620,6 +620,103 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, return -EOPNOTSUPP; } +static inline int iommufd_option_sw_msi_test(struct iommufd_device *idev, + phys_addr_t start, size_t size) +{ + const phys_addr_t alignment = SZ_1M - 1; + struct iommu_resv_region *resv; + phys_addr_t resv_last, last; + LIST_HEAD(resv_regions); + int rc = 0; + + /* Alignment Test */ + if (start & alignment) + return -EINVAL; + + /* Overlap Test */ + if (!size) + size = SZ_1M; + last = size - 1 + start; + /* FIXME: drivers allocate memory but there is no failure propagated */ + iommu_get_resv_regions(idev->dev, 
&resv_regions); + list_for_each_entry(resv, &resv_regions, list) { + if (resv->type == IOMMU_RESV_SW_MSI || /* iommufd will bypass */ + resv->type == IOMMU_RESV_DIRECT_RELAXABLE) + continue; + resv_last = resv->length - 1 + resv->start; + if (resv->start <= last && resv_last >= start) { + rc = -EADDRINUSE; + break; + } + } + iommu_put_resv_regions(idev->dev, &resv_regions); + return rc; +} + +int iommufd_option_sw_msi(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_device *idev; + int rc = 0; + + idev = iommufd_get_device(ucmd, cmd->object_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + mutex_lock(&idev->igroup->lock); + /* Device cannot enforce the sw_msi window if already attached */ + if (idev->igroup->hwpt) { + rc = -EBUSY; + goto out_unlock; + } + + if (cmd->op == IOMMU_OPTION_OP_GET) { + switch (cmd->option_id) { + case IOMMU_OPTION_SW_MSI_START: + cmd->val64 = (u64)idev->sw_msi_start; + break; + case IOMMU_OPTION_SW_MSI_SIZE: + cmd->val64 = (u64)idev->sw_msi_size / SZ_1M; + break; + default: + rc = -EOPNOTSUPP; + break; + } + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + phys_addr_t start = idev->sw_msi_start; + size_t size = idev->sw_msi_size; + + switch (cmd->option_id) { + case IOMMU_OPTION_SW_MSI_START: + start = (phys_addr_t)cmd->val64; + rc = iommufd_option_sw_msi_test(idev, start, size); + if (rc) + break; + idev->sw_msi_start = start; + break; + case IOMMU_OPTION_SW_MSI_SIZE: + size = (size_t)cmd->val64 * SZ_1M; + if (size) { + rc = iommufd_option_sw_msi_test(idev, start, + size); + if (rc) + break; + } + idev->sw_msi_size = size; + break; + default: + rc = -EOPNOTSUPP; + break; + } + } + +out_unlock: + mutex_unlock(&idev->igroup->lock); + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, struct iommufd_ioas *ioas) { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b6fa9fd11bc1..f92fb03ca3c1 100644 
--- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -287,6 +287,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) case IOMMU_OPTION_RLIMIT_MODE: rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); break; + case IOMMU_OPTION_SW_MSI_START: + case IOMMU_OPTION_SW_MSI_SIZE: + rc = iommufd_option_sw_msi(ucmd); + break; case IOMMU_OPTION_HUGE_PAGES: rc = iommufd_ioas_option(ucmd); break; -- 2.43.0