Re: [PATCH v2 4/6] iommufd: Deliver fault messages to user space

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 26, 2023 at 10:49:28AM +0800, Lu Baolu wrote:
> Add the file interface that provides a simple and efficient way for
> userspace to handle page faults. The file interface allows userspace
> to read fault messages sequentially, and to respond to the handling
> result by writing to the same file.
> 
> Userspace applications are recommended to use io_uring to speed up read
> and write efficiency.
> 
> With this done, allow userspace application to allocate a hw page table
> with IOMMU_HWPT_ALLOC_IOPF_CAPABLE flag set.
> 
> Signed-off-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
> ---
>  drivers/iommu/iommufd/iommufd_private.h |   2 +
>  drivers/iommu/iommufd/hw_pagetable.c    | 204 +++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 0dbaa2dc5b22..ff063bc48150 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -237,6 +237,8 @@ struct hw_pgtable_fault {
>  	struct mutex mutex;
>  	struct list_head deliver;
>  	struct list_head response;
> +	struct file *fault_file;
> +	int fault_fd;
>  };
>  
>  /*
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 9f94c824cf86..f0aac1bb2d2d 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -3,6 +3,8 @@
>   * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
>   */
>  #include <linux/iommu.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
>  #include <uapi/linux/iommufd.h>
>  
>  #include "../iommu-priv.h"
> @@ -38,9 +40,198 @@ static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj)
>  	refcount_dec(&hwpt->ioas->obj.users);
>  }
>  
> +static int iommufd_compose_fault_message(struct iommu_fault *fault,
> +					 struct iommu_hwpt_pgfault *hwpt_fault,
> +					 struct device *dev)
> +{
> +	struct iommufd_device *idev = iopf_pasid_cookie_get(dev, IOMMU_NO_PASID);
> +
> +	if (!idev)
> +		return -ENODEV;
> +
> +	if (IS_ERR(idev))
> +		return PTR_ERR(idev);
> +
> +	hwpt_fault->size = sizeof(*hwpt_fault);
> +	hwpt_fault->flags = fault->prm.flags;
> +	hwpt_fault->dev_id = idev->obj.id;
> +	hwpt_fault->pasid = fault->prm.pasid;
> +	hwpt_fault->grpid = fault->prm.grpid;
> +	hwpt_fault->perm = fault->prm.perm;
> +	hwpt_fault->addr = fault->prm.addr;
> +	hwpt_fault->private_data[0] = fault->prm.private_data[0];
> +	hwpt_fault->private_data[1] = fault->prm.private_data[1];
> +
> +	return 0;
> +}
> +
> +static ssize_t hwpt_fault_fops_read(struct file *filep, char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_pgfault data;
> +	struct iopf_group *group;
> +	struct iopf_fault *iopf;
> +	size_t done = 0;
> +	int rc;
> +
> +	if (*ppos || count % fault_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->deliver) && count > done) {
> +		group = list_first_entry(&fault->deliver,
> +					 struct iopf_group, node);
> +
> +		if (list_count_nodes(&group->faults) * fault_size > count - done)
> +			break;
> +
> +		list_for_each_entry(iopf, &group->faults, list) {
> +			rc = iommufd_compose_fault_message(&iopf->fault,
> +							   &data, group->dev);
> +			if (rc)
> +				goto err_unlock;
> +			rc = copy_to_user(buf + done, &data, fault_size);
> +			if (rc)
> +				goto err_unlock;
> +			done += fault_size;
> +		}
> +
> +		list_move_tail(&group->node, &fault->response);
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	return done;
> +err_unlock:
> +	mutex_unlock(&fault->mutex);
> +	return rc;
> +}
> +
> +static ssize_t hwpt_fault_fops_write(struct file *filep,
> +				     const char __user *buf,
> +				     size_t count, loff_t *ppos)
> +{
> +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_page_response response;
> +	struct iommufd_hw_pagetable *hwpt;
> +	struct iopf_group *iter, *group;
> +	struct iommufd_device *idev;
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos || count % response_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->response) && count > done) {
> +		rc = copy_from_user(&response, buf + done, response_size);
> +		if (rc)
> +			break;
> +
> +		/* Get the device that this response targets at. */
> +		idev = container_of(iommufd_get_object(fault->ictx,
> +						       response.dev_id,
> +						       IOMMUFD_OBJ_DEVICE),
> +				    struct iommufd_device, obj);
> +		if (IS_ERR(idev)) {
> +			rc = PTR_ERR(idev);
> +			break;
> +		}
> +
> +		/*
> +		 * Get the hw page table that this response was generated for.
> +		 * It must match the one stored in the fault data.
> +		 */
> +		hwpt = container_of(iommufd_get_object(fault->ictx,
> +						       response.hwpt_id,
> +						       IOMMUFD_OBJ_HW_PAGETABLE),
> +				    struct iommufd_hw_pagetable, obj);
> +		if (IS_ERR(hwpt)) {
> +			iommufd_put_object(&idev->obj);
> +			rc = PTR_ERR(hwpt);
> +			break;
> +		}
> +
> +		if (hwpt != fault->hwpt) {
> +			rc = -EINVAL;
> +			goto put_obj;
> +		}
> +
> +		group = NULL;
> +		list_for_each_entry(iter, &fault->response, node) {
> +			if (response.grpid != iter->last_fault.fault.prm.grpid)
> +				continue;
> +
> +			if (idev->dev != iter->dev)
> +				continue;
> +
> +			if ((iter->last_fault.fault.prm.flags &
> +			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
> +			    response.pasid != iter->last_fault.fault.prm.pasid)
> +				continue;
> +
> +			group = iter;
> +			break;
> +		}
> +
> +		if (!group) {
> +			rc = -ENODEV;
> +			goto put_obj;
> +		}
> +
> +		rc = iopf_group_response(group, response.code);
> +		if (rc)
> +			goto put_obj;
> +
> +		list_del(&group->node);
> +		iopf_free_group(group);
> +		done += response_size;
> +put_obj:
> +		iommufd_put_object(&hwpt->obj);
> +		iommufd_put_object(&idev->obj);
> +		if (rc)
> +			break;
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	return (rc < 0) ? rc : done;
> +}
> +
> +static const struct file_operations hwpt_fault_fops = {
> +	.owner		= THIS_MODULE,
> +	.read		= hwpt_fault_fops_read,
> +	.write		= hwpt_fault_fops_write,
> +};
> +
> +static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault)
> +{
> +	struct file *filep;
> +	int fdno;
> +
> +	fdno = get_unused_fd_flags(O_CLOEXEC);
> +	if (fdno < 0)
> +		return fdno;
> +
> +	filep = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops,
> +				   fault, O_RDWR);
> +	if (IS_ERR(filep)) {
> +		put_unused_fd(fdno);
> +		return PTR_ERR(filep);
> +	}
> +
> +	fd_install(fdno, filep);
> +	fault->fault_file = filep;
> +	fault->fault_fd = fdno;
> +
> +	return 0;
> +}
> +
>  static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
>  {
>  	struct hw_pgtable_fault *fault;
> +	int rc;
>  
>  	fault = kzalloc(sizeof(*fault), GFP_KERNEL);
>  	if (!fault)
> @@ -50,6 +241,12 @@ static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
>  	INIT_LIST_HEAD(&fault->response);
>  	mutex_init(&fault->mutex);
>  
> +	rc = hw_pagetable_get_fault_fd(fault);
> +	if (rc) {
> +		kfree(fault);
> +		return ERR_PTR(rc);
> +	}
> +
>  	return fault;
>  }
>  
> @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
>  	WARN_ON(!list_empty(&fault->deliver));
>  	WARN_ON(!list_empty(&fault->response));
>  
> +	fput(fault->fault_file);
> +	put_unused_fd(fault->fault_fd);
I have been running your code and have run into an invalid memory access at
this line. When `put_unused_fd` is called, the files of the current task
are accessed via `current->files`. In my case this is 0x0.

The reason it is 0x0 is that `do_exit` calls `exit_files`, where the
task's files are set to NULL; this call is made in `do_exit` before
`exit_task_work` is executed.

`exit_task_work` is the call that eventually arrives here at `hw_pagetable_fault_free`.

The way I arrived at this state is the following:
1. Version of the Linux kernel that I'm using: commit 357b5abcba0477f7f1391dd0fa3a919a6f06bdf0 (HEAD, lubaolu/iommufd-io-pgfault-delivery-v2)
2. Version of QEMU that I'm using: commit 577ef478780597d3f449feb01e853b93fa5c5530 (HEAD, yiliu/zhenzhong/wip/iommufd_nesting_rfcv1)
3. This error happens when my user-space app is exiting (hence the call
   to `do_exit`).
4. I call the IOMMU_HWPT_ALLOC ioctl with
  .flags = IOMMU_HWPT_ALLOC_IOPF_CAPABLE,
  .hwpt_type = IOMMU_HWPT_TYPE_DEFAULT, and
  .pt_id = the default IOAS id.

I have worked around this in a naive way by simply not calling
`put_unused_fd`.

Have you run into this? Is this a path that you were expecting?
Also, please get back to me if you need more information about how I
arrived at this state. I have provided what I think is enough info, but I
might be missing something obvious.

Best

>  	kfree(fault);
>  }
>  
> @@ -347,7 +546,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  	struct mutex *mutex;
>  	int rc;
>  
> -	if (cmd->flags & ~IOMMU_HWPT_ALLOC_NEST_PARENT || cmd->__reserved)
> +	if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT |
> +			    IOMMU_HWPT_ALLOC_IOPF_CAPABLE)) ||
> +	    cmd->__reserved)
>  		return -EOPNOTSUPP;
>  	if (!cmd->data_len && cmd->hwpt_type != IOMMU_HWPT_TYPE_DEFAULT)
>  		return -EINVAL;
> @@ -416,6 +617,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  		hwpt->fault->hwpt = hwpt;
>  		hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler;
>  		hwpt->domain->fault_data = hwpt;
> +		cmd->out_fault_fd = hwpt->fault->fault_fd;
>  	}
>  
>  	cmd->out_hwpt_id = hwpt->obj.id;
> -- 
> 2.34.1
> 

-- 

Joel Granados

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux