Hi, On 1/11/25 4:32 AM, Nicolin Chen wrote: > For systems that require MSI pages to be mapped into the IOMMU translation > the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default > recommended IOVA window to place these mappings. However, there is nothing > special about this address. And to support the RMR trick in VMM for nested Well, at least it must not overlap the VMM's RAM, so the default address was not chosen at random either. > translation, the VMM needs to know what sw_msi window the kernel is using. > As there is no particular reason to force VMM to adopt the kernel default, > provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use > to directly specify the sw_msi window that it wants to use, which replaces > and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having > to build an API to discover the default IOMMU_RESV_SW_MSI. IIUC, the MSI window will then differ between legacy VFIO assignment and the iommufd backend. MSI reserved regions are exposed in /sys/kernel/iommu_groups/<n>/reserved_regions, e.g.: 0x0000000008000000 0x00000000080fffff msi. Is that configurability reflected there accordingly? How do you make sure the user-specified window does not collide with other reserved regions? I don't see any check here. > > Since iommufd now has its own sw_msi function, this is easy to implement. > > To keep things simple, the parameters are global to the entire iommufd FD, > and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set > the values before creating any hwpt's to have any effect. 
> > Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx> > Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx> > --- > drivers/iommu/iommufd/iommufd_private.h | 4 +++ > include/uapi/linux/iommufd.h | 18 ++++++++++++- > drivers/iommu/iommufd/device.c | 4 +++ > drivers/iommu/iommufd/io_pagetable.c | 4 ++- > drivers/iommu/iommufd/ioas.c | 34 +++++++++++++++++++++++++ > drivers/iommu/iommufd/main.c | 6 +++++ > 6 files changed, 68 insertions(+), 2 deletions(-) > > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h > index 3e83bbb5912c..9f071609f00b 100644 > --- a/drivers/iommu/iommufd/iommufd_private.h > +++ b/drivers/iommu/iommufd/iommufd_private.h > @@ -45,6 +45,9 @@ struct iommufd_ctx { > struct mutex sw_msi_lock; > struct list_head sw_msi_list; > unsigned int sw_msi_id; > + /* User-programmed SW_MSI region, to override igroup->sw_msi_start */ > + phys_addr_t sw_msi_start; > + size_t sw_msi_size; > > u8 account_mode; > /* Compatibility with VFIO no iommu */ > @@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); > int iommufd_ioas_option(struct iommufd_ucmd *ucmd); > int iommufd_option_rlimit_mode(struct iommu_option *cmd, > struct iommufd_ctx *ictx); > +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx); > > int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); > int iommufd_check_iova_range(struct io_pagetable *iopt, > diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h > index 34810f6ae2b5..c864a201e502 100644 > --- a/include/uapi/linux/iommufd.h > +++ b/include/uapi/linux/iommufd.h > @@ -294,7 +294,9 @@ struct iommu_ioas_unmap { > > /** > * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and > - * ioctl(IOMMU_OPTION_HUGE_PAGES) > + * ioctl(IOMMU_OPTION_HUGE_PAGES) and > + * ioctl(IOMMU_OPTION_SW_MSI_START) and > + * ioctl(IOMMU_OPTION_SW_MSI_SIZE) > * @IOMMU_OPTION_RLIMIT_MODE: > * Change how RLIMIT_MEMLOCK accounting works. 
The caller must have privilege > * to invoke this. Value 0 (default) is user based accounting, 1 uses process > @@ -304,10 +306,24 @@ struct iommu_ioas_unmap { > * iommu mappings. Value 0 disables combining, everything is mapped to > * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS > * option, the object_id must be the IOAS ID. > + * @IOMMU_OPTION_SW_MSI_START: > + * Change the base address of the IOMMU mapping region for MSI doorbell(s). > + * It must be set this before attaching a device to an IOAS/HWPT, otherwise > + * this option will be not effective on that IOAS/HWPT. User can choose to > + * let kernel pick a base address, by simply ignoring this option or setting > + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0 I think we should also document that the MSI window cannot be placed at an arbitrary address either. > + * @IOMMU_OPTION_SW_MSI_SIZE: > + * Change the size of the IOMMU mapping region for MSI doorbell(s). It must > + * be set this before attaching a device to an IOAS/HWPT, otherwise it won't > + * be effective on that IOAS/HWPT. The value is in MB, and the minimum value > + * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address > + * value set to IOMMU_OPTION_SW_MSI_START. 
Global option, object_id must be 0 > */ > enum iommufd_option { > IOMMU_OPTION_RLIMIT_MODE = 0, > IOMMU_OPTION_HUGE_PAGES = 1, > + IOMMU_OPTION_SW_MSI_START = 2, > + IOMMU_OPTION_SW_MSI_SIZE = 3, > }; > > /** > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c > index f75b3c23cd41..093a3bd798db 100644 > --- a/drivers/iommu/iommufd/device.c > +++ b/drivers/iommu/iommufd/device.c > @@ -445,10 +445,14 @@ static int > iommufd_device_attach_reserved_iova(struct iommufd_device *idev, > struct iommufd_hwpt_paging *hwpt_paging) > { > + struct iommufd_ctx *ictx = idev->ictx; > int rc; > > lockdep_assert_held(&idev->igroup->lock); > > + /* Override it with a user-programmed SW_MSI region */ > + if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX) > + idev->igroup->sw_msi_start = ictx->sw_msi_start; > rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, > idev->dev, > &idev->igroup->sw_msi_start); > diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c > index 8a790e597e12..5d7f5ca1eecf 100644 > --- a/drivers/iommu/iommufd/io_pagetable.c > +++ b/drivers/iommu/iommufd/io_pagetable.c > @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, > if (sw_msi_start && resv->type == IOMMU_RESV_MSI) > num_hw_msi++; > if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { > - *sw_msi_start = resv->start; > + /* Bypass the driver-defined SW_MSI region, if preset */ > + if (*sw_msi_start == PHYS_ADDR_MAX) > + *sw_msi_start = resv->start; > num_sw_msi++; > } > > diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c > index 1542c5fd10a8..3f4e25b660f9 100644 > --- a/drivers/iommu/iommufd/ioas.c > +++ b/drivers/iommu/iommufd/ioas.c > @@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, > return -EOPNOTSUPP; > } > > +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx) > +{ > + if (cmd->object_id) > + return 
-EOPNOTSUPP; > + > + if (cmd->op == IOMMU_OPTION_OP_GET) { > + switch (cmd->option_id) { > + case IOMMU_OPTION_SW_MSI_START: > + cmd->val64 = (u64)ictx->sw_msi_start; > + break; > + case IOMMU_OPTION_SW_MSI_SIZE: > + cmd->val64 = (u64)ictx->sw_msi_size; > + break; > + default: > + return -EOPNOTSUPP; > + } > + return 0; > + } > + if (cmd->op == IOMMU_OPTION_OP_SET) { > + switch (cmd->option_id) { > + case IOMMU_OPTION_SW_MSI_START: > + ictx->sw_msi_start = (phys_addr_t)cmd->val64; > + break; > + case IOMMU_OPTION_SW_MSI_SIZE: > + ictx->sw_msi_size = (size_t)cmd->val64; > + break; > + default: > + return -EOPNOTSUPP; > + } > + return 0; > + } > + return -EOPNOTSUPP; > +} > + > static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, > struct iommufd_ioas *ioas) > { > diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c > index 7cc9497b7193..026297265c71 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) > init_waitqueue_head(&ictx->destroy_wait); > mutex_init(&ictx->sw_msi_lock); > INIT_LIST_HEAD(&ictx->sw_msi_list); > + ictx->sw_msi_start = PHYS_ADDR_MAX; > + ictx->sw_msi_size = 0; > filp->private_data = ictx; > return 0; > } > @@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) > case IOMMU_OPTION_RLIMIT_MODE: > rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); > break; > + case IOMMU_OPTION_SW_MSI_START: > + case IOMMU_OPTION_SW_MSI_SIZE: > + rc = iommufd_option_sw_msi(cmd, ucmd->ictx); > + break; > case IOMMU_OPTION_HUGE_PAGES: > rc = iommufd_ioas_option(ucmd); > break; Eric