On Thu, Oct 26, 2023 at 10:49:28AM +0800, Lu Baolu wrote: > Add the file interface that provides a simple and efficient way for > userspace to handle page faults. The file interface allows userspace > to read fault messages sequentially, and to respond to the handling > result by writing to the same file. > > Userspace applications are recommended to use io_uring to speed up read > and write efficiency. > > With this done, allow userspace application to allocate a hw page table > with IOMMU_HWPT_ALLOC_IOPF_CAPABLE flag set. > > Signed-off-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx> > --- > drivers/iommu/iommufd/iommufd_private.h | 2 + > drivers/iommu/iommufd/hw_pagetable.c | 204 +++++++++++++++++++++++- > 2 files changed, 205 insertions(+), 1 deletion(-) > > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h > index 0dbaa2dc5b22..ff063bc48150 100644 > --- a/drivers/iommu/iommufd/iommufd_private.h > +++ b/drivers/iommu/iommufd/iommufd_private.h > @@ -237,6 +237,8 @@ struct hw_pgtable_fault { > struct mutex mutex; > struct list_head deliver; > struct list_head response; > + struct file *fault_file; > + int fault_fd; > }; > > /* > diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c > index 9f94c824cf86..f0aac1bb2d2d 100644 > --- a/drivers/iommu/iommufd/hw_pagetable.c > +++ b/drivers/iommu/iommufd/hw_pagetable.c > @@ -3,6 +3,8 @@ > * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES > */ > #include <linux/iommu.h> > +#include <linux/file.h> > +#include <linux/anon_inodes.h> > #include <uapi/linux/iommufd.h> > > #include "../iommu-priv.h" > @@ -38,9 +40,198 @@ static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj) > refcount_dec(&hwpt->ioas->obj.users); > } > > +static int iommufd_compose_fault_message(struct iommu_fault *fault, > + struct iommu_hwpt_pgfault *hwpt_fault, > + struct device *dev) > +{ > + struct iommufd_device *idev = iopf_pasid_cookie_get(dev, 
IOMMU_NO_PASID); > + > + if (!idev) > + return -ENODEV; > + > + if (IS_ERR(idev)) > + return PTR_ERR(idev); > + > + hwpt_fault->size = sizeof(*hwpt_fault); > + hwpt_fault->flags = fault->prm.flags; > + hwpt_fault->dev_id = idev->obj.id; > + hwpt_fault->pasid = fault->prm.pasid; > + hwpt_fault->grpid = fault->prm.grpid; > + hwpt_fault->perm = fault->prm.perm; > + hwpt_fault->addr = fault->prm.addr; > + hwpt_fault->private_data[0] = fault->prm.private_data[0]; > + hwpt_fault->private_data[1] = fault->prm.private_data[1]; > + > + return 0; > +} > + > +static ssize_t hwpt_fault_fops_read(struct file *filep, char __user *buf, > + size_t count, loff_t *ppos) > +{ > + size_t fault_size = sizeof(struct iommu_hwpt_pgfault); > + struct hw_pgtable_fault *fault = filep->private_data; > + struct iommu_hwpt_pgfault data; > + struct iopf_group *group; > + struct iopf_fault *iopf; > + size_t done = 0; > + int rc; > + > + if (*ppos || count % fault_size) > + return -ESPIPE; > + > + mutex_lock(&fault->mutex); > + while (!list_empty(&fault->deliver) && count > done) { > + group = list_first_entry(&fault->deliver, > + struct iopf_group, node); > + > + if (list_count_nodes(&group->faults) * fault_size > count - done) > + break; > + > + list_for_each_entry(iopf, &group->faults, list) { > + rc = iommufd_compose_fault_message(&iopf->fault, > + &data, group->dev); > + if (rc) > + goto err_unlock; > + rc = copy_to_user(buf + done, &data, fault_size); > + if (rc) > + goto err_unlock; > + done += fault_size; > + } > + > + list_move_tail(&group->node, &fault->response); > + } > + mutex_unlock(&fault->mutex); > + > + return done; > +err_unlock: > + mutex_unlock(&fault->mutex); > + return rc; > +} > + > +static ssize_t hwpt_fault_fops_write(struct file *filep, > + const char __user *buf, > + size_t count, loff_t *ppos) > +{ > + size_t response_size = sizeof(struct iommu_hwpt_page_response); > + struct hw_pgtable_fault *fault = filep->private_data; > + struct iommu_hwpt_page_response response; > 
+ struct iommufd_hw_pagetable *hwpt; > + struct iopf_group *iter, *group; > + struct iommufd_device *idev; > + size_t done = 0; > + int rc = 0; > + > + if (*ppos || count % response_size) > + return -ESPIPE; > + > + mutex_lock(&fault->mutex); > + while (!list_empty(&fault->response) && count > done) { > + rc = copy_from_user(&response, buf + done, response_size); > + if (rc) > + break; > + > + /* Get the device that this response targets at. */ > + idev = container_of(iommufd_get_object(fault->ictx, > + response.dev_id, > + IOMMUFD_OBJ_DEVICE), > + struct iommufd_device, obj); > + if (IS_ERR(idev)) { > + rc = PTR_ERR(idev); > + break; > + } > + > + /* > + * Get the hw page table that this response was generated for. > + * It must match the one stored in the fault data. > + */ > + hwpt = container_of(iommufd_get_object(fault->ictx, > + response.hwpt_id, > + IOMMUFD_OBJ_HW_PAGETABLE), > + struct iommufd_hw_pagetable, obj); > + if (IS_ERR(hwpt)) { > + iommufd_put_object(&idev->obj); > + rc = PTR_ERR(hwpt); > + break; > + } > + > + if (hwpt != fault->hwpt) { > + rc = -EINVAL; > + goto put_obj; > + } > + > + group = NULL; > + list_for_each_entry(iter, &fault->response, node) { > + if (response.grpid != iter->last_fault.fault.prm.grpid) > + continue; > + > + if (idev->dev != iter->dev) > + continue; > + > + if ((iter->last_fault.fault.prm.flags & > + IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && > + response.pasid != iter->last_fault.fault.prm.pasid) > + continue; > + > + group = iter; > + break; > + } > + > + if (!group) { > + rc = -ENODEV; > + goto put_obj; > + } > + > + rc = iopf_group_response(group, response.code); > + if (rc) > + goto put_obj; > + > + list_del(&group->node); > + iopf_free_group(group); > + done += response_size; > +put_obj: > + iommufd_put_object(&hwpt->obj); > + iommufd_put_object(&idev->obj); > + if (rc) > + break; > + } > + mutex_unlock(&fault->mutex); > + > + return (rc < 0) ? 
rc : done; > +} > + > +static const struct file_operations hwpt_fault_fops = { > + .owner = THIS_MODULE, > + .read = hwpt_fault_fops_read, > + .write = hwpt_fault_fops_write, > +}; > + > +static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault) > +{ > + struct file *filep; > + int fdno; > + > + fdno = get_unused_fd_flags(O_CLOEXEC); > + if (fdno < 0) > + return fdno; > + > + filep = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops, > + fault, O_RDWR); > + if (IS_ERR(filep)) { > + put_unused_fd(fdno); > + return PTR_ERR(filep); > + } > + > + fd_install(fdno, filep); > + fault->fault_file = filep; > + fault->fault_fd = fdno; > + > + return 0; > +} > + > static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void) > { > struct hw_pgtable_fault *fault; > + int rc; > > fault = kzalloc(sizeof(*fault), GFP_KERNEL); > if (!fault) > @@ -50,6 +241,12 @@ static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void) > INIT_LIST_HEAD(&fault->response); > mutex_init(&fault->mutex); > > + rc = hw_pagetable_get_fault_fd(fault); > + if (rc) { > + kfree(fault); > + return ERR_PTR(rc); > + } > + > return fault; > } > > @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault) > WARN_ON(!list_empty(&fault->deliver)); > WARN_ON(!list_empty(&fault->response)); > > + fput(fault->fault_file); > + put_unused_fd(fault->fault_fd); I have been running your code and have run into an invalid memory access on this line. When `put_unused_fd` is called, the files of the current task are accessed via `current->files`. In my case this is 0x0. The reason for it being 0x0 is that `do_exit` calls `exit_files`, where the task's files get set to NULL; this call is made in `do_exit` before we execute `exit_task_work`. `exit_task_work` is the call that eventually arrives here at `hw_pagetable_fault_free`. The way I arrived at this state is the following: 1.
Version of the Linux kernel that I'm using: commit 357b5abcba0477f7f1391dd0fa3a919a6f06bdf0 (HEAD, lubaolu/iommufd-io-pgfault-delivery-v2) 2. Version of QEMU that I'm using: commit 577ef478780597d3f449feb01e853b93fa5c5530 (HEAD, yiliu/zhenzhong/wip/iommufd_nesting_rfcv1) 3. This error happens when my user space app is exiting (hence the call to `do_exit`). 4. I call the IOMMU_HWPT_ALLOC ioctl with .flags = IOMMU_HWPT_ALLOC_IOPF_CAPABLE, .hwpt_type = IOMMU_HWPT_TYPE_DEFAULT and .pt_id = the default ioas id. I have resolved this in a naive way by just not calling the `put_unused_fd` function. Have you run into this? Is this a path that you were expecting? Also, please get back to me if you need more information about how I got to this place. I have provided what I think is enough info, but I might be missing something obvious. Best > kfree(fault); > } > > @@ -347,7 +546,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) > struct mutex *mutex; > int rc; > > - if (cmd->flags & ~IOMMU_HWPT_ALLOC_NEST_PARENT || cmd->__reserved) > + if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT | > + IOMMU_HWPT_ALLOC_IOPF_CAPABLE)) || > + cmd->__reserved) > return -EOPNOTSUPP; > if (!cmd->data_len && cmd->hwpt_type != IOMMU_HWPT_TYPE_DEFAULT) > return -EINVAL; > @@ -416,6 +617,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) > hwpt->fault->hwpt = hwpt; > hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler; > hwpt->domain->fault_data = hwpt; > + cmd->out_fault_fd = hwpt->fault->fault_fd; > } > > cmd->out_hwpt_id = hwpt->obj.id; > -- > 2.34.1 > -- Joel Granados
Attachment:
signature.asc
Description: PGP signature