On Wed, Oct 05, 2022 at 10:57:28AM -0300, Jason Gunthorpe wrote: > On Wed, Oct 05, 2022 at 09:46:45AM -0400, Matthew Rosato wrote: > > > > (again, with the follow-up applied) Besides the panic above I just > > noticed there is also this warning that immediately precedes and is > > perhaps more useful. Re: what triggers the WARN, both group->owner > > and group->owner_cnt are already 0 > > And this is after the 2nd try that fixes the locking? > > This shows that vfio_group_detach_container() is called twice (which > was my guess), hoever this looks to be impossible as both calls are > protected by 'if (group->container)' and the function NULL's > group->container and it is all under the proper lock. > > My guess was that missing locking caused the two cases to race and > trigger WARN, but the locking should fix that. > > So I'm at a loss, can you investigate a bit? Huh, perhaps I'm loosing my mind, but I'm sure I sent this out, but it is not in the archive. This v2 fixes the missing locking and the rest of the remarks. commit f8b993620af72fa5f15bd4c1515868013c1c173d Author: Jason Gunthorpe <jgg@xxxxxxxx> Date: Tue Oct 4 13:14:37 2022 -0300 vfio: Make the group FD disassociate from the iommu_group Allow the vfio_group struct to exist with a NULL iommu_group pointer. When the pointer is NULL the vfio_group users promise not to touch the iommu_group. This allows a driver to be hot unplugged while userspace is keeping the group FD open. SPAPR mode is excluded from this behavior because of how it wrongly hacks part of its iommu interface through KVM. Due to this we loose control over what it is doing and cannot revoke the iommu_group usage in the IOMMU layer via vfio_group_detach_container(). Thus, for SPAPR the group FDs must still be closed before a device can be hot unplugged. This fixes a userspace regression where we learned that virtnodedevd leaves a group FD open even though the /dev/ node for it has been deleted and all the drivers for it unplugged. Fixes: ca5f21b25749 ("vfio: Follow a strict lifetime for struct iommu_group") Reported-by: Christian Borntraeger <borntraeger@xxxxxxxxxxxxx> Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 59a28251bb0b97..badc9d828cac20 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1313,7 +1313,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { + if (!vfio_file_is_group(file)) { fput(file); ret = -EINVAL; break; diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 4d2de02f2ced6e..4e10a281420e66 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -59,6 +59,7 @@ struct vfio_group { struct mutex group_lock; struct kvm *kvm; struct file *opened_file; + bool preserve_iommu_group; struct swait_queue_head opened_file_wait; struct blocking_notifier_head notifier; }; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9b1e5fd5f7b73c..13d22bd84afc47 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -133,6 +133,10 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the vfio.group_lock. + */ list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) { refcount_inc(&group->drivers); @@ -159,7 +163,7 @@ static void vfio_group_release(struct device *dev) mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); - iommu_group_put(group->iommu_group); + WARN_ON(group->iommu_group); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } @@ -248,6 +252,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; + struct iommu_group *iommu_group; if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -265,13 +270,36 @@ static void vfio_device_remove_group(struct vfio_device *device) */ cdev_device_del(&group->cdev, &group->dev); + mutex_lock(&group->group_lock); + /* + * These data structures all have paired operations that can only be + * undone when the caller holds a live reference on the device. Since + * all pairs must be undone these WARN_ON's indicate some caller did not + * properly hold the group reference.l. + */ + WARN_ON(!list_empty(&group->device_list)); + WARN_ON(group->notifier.head); + + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; + group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + /* - * Before we allow the last driver in the group to be unplugged the - * group must be sanitized so nothing else is or can reference it. This - * is because the group->iommu_group pointer should only be used so long - * as a device driver is attached to a device in the group. + * Normally we can set the iommu_group to NULL above and that will + * prevent any users from touching it. However, the SPAPR kvm path takes + * a reference to the iommu_group and keeps using it in arch code out + * side our control. So if this path is triggred we have no choice but + * to wait for the group FD to be closed to be sure everyone has stopped + * touching the group. */ - while (group->opened_file) { + while (group->preserve_iommu_group && group->opened_file) { mutex_unlock(&vfio.group_lock); swait_event_idle_exclusive(group->opened_file_wait, !group->opened_file); @@ -279,17 +307,7 @@ static void vfio_device_remove_group(struct vfio_device *device) } mutex_unlock(&vfio.group_lock); - /* - * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the group. Since all - * pairs must be undone these WARN_ON's indicate some caller did not - * properly hold the group reference. - */ - WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->container || group->container_users); - WARN_ON(group->notifier.head); - group->iommu_group = NULL; - + iommu_group_put(iommu_group); put_device(&group->dev); } @@ -531,6 +549,10 @@ static int __vfio_register_dev(struct vfio_device *device, existing_device = vfio_group_get_device(group, device->dev); if (existing_device) { + /* + * group->iommu_group is non-NULL because we hold the drivers + * refcount. + */ dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put_registration(existing_device); @@ -702,6 +724,11 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = -EINVAL; goto out_unlock; } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + container = vfio_container_from_file(f.file); ret = -EINVAL; if (container) { @@ -862,6 +889,11 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, status.flags = 0; mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; @@ -938,13 +970,6 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; mutex_lock(&group->group_lock); - /* - * Device FDs hold a group file reference, therefore the group release - * is only called when there are no open devices. - */ - WARN_ON(group->notifier.head); - if (group->container) - vfio_group_detach_container(group); group->opened_file = NULL; mutex_unlock(&group->group_lock); swake_up_one(&group->opened_file_wait); @@ -1553,17 +1578,41 @@ static const struct file_operations vfio_device_fops = { * @file: VFIO group file * * The returned iommu_group is valid as long as a ref is held on the file. + * This function is deprecated, only the SPAPR path in kvm should call it. */ struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; + + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; if (file->f_op != &vfio_group_fops) return NULL; - return group->iommu_group; + + mutex_lock(&vfio.group_lock); + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + group->preserve_iommu_group = true; + } + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + return iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); +/** + * vfio_file_is_group - True if the file is usable with VFIO aPIS + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + /** * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file * is always CPU cache coherent diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 73bcb92179a224..bd9faaab85de18 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -199,6 +199,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, * External user API */ struct iommu_group *vfio_file_iommu_group(struct file *file); +bool vfio_file_is_group(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ce1b01d02c5197..54aec3b0559c70 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -61,6 +61,23 @@ static bool kvm_vfio_file_enforced_coherent(struct file *file) return ret; } +static bool kvm_vfio_file_is_group(struct file *file) +{ + bool (*fn)(struct file *file); + bool ret; + + fn = symbol_get(vfio_file_is_group); + if (!fn) + return false; + + ret = fn(file); + + symbol_put(vfio_file_is_group); + + return ret; +} + +#ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); @@ -77,7 +94,6 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) return ret; } -#ifdef CONFIG_SPAPR_TCE_IOMMU static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { @@ -136,7 +152,7 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) return -EBADF; /* Ensure the FD is a vfio group FD.*/ - if (!kvm_vfio_file_iommu_group(filp)) { + if (!kvm_vfio_file_is_group(filp)) { ret = -EINVAL; goto err_fput; }