On 3/9/24 2:03 AM, Jason Gunthorpe wrote:
On Mon, Jan 22, 2024 at 03:38:59PM +0800, Lu Baolu wrote:
--- /dev/null
+++ b/drivers/iommu/iommufd/fault.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/iommufd.h>
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+static int device_add_fault(struct iopf_group *group)
+{
+ struct iommufd_device *idev = group->cookie->private;
+ void *curr;
+
+ curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
+ NULL, group, GFP_KERNEL);
+
+ return curr ? xa_err(curr) : 0;
+}
+
+static void device_remove_fault(struct iopf_group *group)
+{
+ struct iommufd_device *idev = group->cookie->private;
+
+ xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
+ NULL, GFP_KERNEL);
xa_erase ?
Yes. Sure.
Is grpid OK to use this way? Doesn't it come from the originating
device?
The group ID is generated by the hardware. Here, we use it as an index
in the fault array to ensure it can be quickly retrieved in the page
fault response path.
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+ struct iopf_group *group, *next;
+
+ mutex_lock(&fault->mutex);
+ list_for_each_entry_safe(group, next, &fault->deliver, node) {
+ list_del(&group->node);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ list_for_each_entry_safe(group, next, &fault->response, node) {
+ list_del(&group->node);
+ device_remove_fault(group);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ mutex_unlock(&fault->mutex);
+
+ mutex_destroy(&fault->mutex);
It is really weird to lock a mutex we are about to destroy? At this
point the refcount on the iommufd_object is zero so there had better
not be any other threads with access to this pointer!
You are right. I will remove the lock/unlock and add a comment instead.
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+ struct iommufd_fault *fault = filep->private_data;
+ struct iommu_hwpt_pgfault data;
+ struct iommufd_device *idev;
+ struct iopf_group *group;
+ struct iopf_fault *iopf;
+ size_t done = 0;
+ int rc;
+
+ if (*ppos || count % fault_size)
+ return -ESPIPE;
+
+ mutex_lock(&fault->mutex);
+ while (!list_empty(&fault->deliver) && count > done) {
+ group = list_first_entry(&fault->deliver,
+ struct iopf_group, node);
+
+ if (list_count_nodes(&group->faults) * fault_size > count - done)
+ break;
+
+ idev = (struct iommufd_device *)group->cookie->private;
+ list_for_each_entry(iopf, &group->faults, list) {
+ iommufd_compose_fault_message(&iopf->fault, &data, idev);
+ rc = copy_to_user(buf + done, &data, fault_size);
+ if (rc)
+ goto err_unlock;
+ done += fault_size;
+ }
+
+ rc = device_add_fault(group);
See I wonder if this should be some xa_alloc or something instead of
trying to use the grpid?
So this magic number will be passed to user space in the fault message.
And the user will then include this number in its response message. The
response message is valid only when the magic number matches. Do I get
you correctly?
+ while (!list_empty(&fault->response) && count > done) {
+ rc = copy_from_user(&response, buf + done, response_size);
+ if (rc)
+ break;
+
+ idev = container_of(iommufd_get_object(fault->ictx,
+ response.dev_id,
+ IOMMUFD_OBJ_DEVICE),
+ struct iommufd_device, obj);
It seems unfortunate we do this lookup for every iteration of the loop,
I would lift it up and cache it..
Okay.
+ if (IS_ERR(idev))
+ break;
+
+ group = device_get_fault(idev, response.grpid);
This looks locked wrong, it should hold the fault mutex here and we
should probably do xchng to zero it at the same time we fetch it.
Okay, will fix it.
+ if (!group ||
+ response.addr != group->last_fault.fault.prm.addr ||
+ ((group->last_fault.fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+ response.pasid != group->last_fault.fault.prm.pasid)) {
Why? If grpid is unique then just trust it.
I was just considering that we should add some sanity check here to
ensure the response message is composed in the right way.
+ iommufd_put_object(fault->ictx, &idev->obj);
+ break;
+ }
+
+ iopf_group_response(group, response.code);
+
+ mutex_lock(&fault->mutex);
+ list_del(&group->node);
+ mutex_unlock(&fault->mutex);
+
+ device_remove_fault(group);
+ iopf_free_group(group);
+ done += response_size;
+
+ iommufd_put_object(fault->ictx, &idev->obj);
+ }
+
+ return done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+ struct poll_table_struct *wait)
+{
+ struct iommufd_fault *fault = filep->private_data;
+ __poll_t pollflags = 0;
+
+ poll_wait(filep, &fault->wait_queue, wait);
+ mutex_lock(&fault->mutex);
+ if (!list_empty(&fault->deliver))
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ mutex_unlock(&fault->mutex);
+
+ return pollflags;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+ .owner = THIS_MODULE,
+ .open = nonseekable_open,
+ .read = iommufd_fault_fops_read,
+ .write = iommufd_fault_fops_write,
+ .poll = iommufd_fault_fops_poll,
+ .llseek = no_llseek,
+};
No release? That seems wrong..
Yes. Will fix it.
+static int get_fault_fd(struct iommufd_fault *fault)
+{
+ struct file *filep;
+ int fdno;
+
+ fdno = get_unused_fd_flags(O_CLOEXEC);
+ if (fdno < 0)
+ return fdno;
+
+ filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+ fault, O_RDWR);
^^^^^^
See here you stick the iommufd_object into the FD but we don't
refcount it...
And iommufd_fault_destroy() doesn't do anything about this, so it just
UAFs the fault memory in the FD.
You need to refcount the iommufd_object here and add a release
function for the fd to put it back
*and* this FD needs to take a reference on the base iommufd_ctx fd too
as we can't tolerate them being destroyed out of sequence.
Good catch. I will fix it.
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_fault_alloc *cmd = ucmd->cmd;
+ struct iommufd_fault *fault;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+ if (IS_ERR(fault))
+ return PTR_ERR(fault);
+
+ fault->ictx = ucmd->ictx;
+ INIT_LIST_HEAD(&fault->deliver);
+ INIT_LIST_HEAD(&fault->response);
+ mutex_init(&fault->mutex);
+ init_waitqueue_head(&fault->wait_queue);
+
+ rc = get_fault_fd(fault);
+ if (rc < 0)
+ goto out_abort;
And this is ordered wrong, fd_install has to happen after return to
user space is guarenteed as it cannot be undone.
+ cmd->out_fault_id = fault->obj.id;
+ cmd->out_fault_fd = rc;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &fault->obj);
fd_install() goes here
Yes. Sure.
+ return 0;
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+ return rc;
+}
Jason
Best regards,
baolu