On 9/1/22 4:37 PM, Jason Gunthorpe wrote: > On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: >> On 9/1/22 6:25 AM, Robin Murphy wrote: >>> On 2022-08-31 21:12, Matthew Rosato wrote: >>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>>> domains and the DMA API handling. However, this commit does not >>>> sufficiently handle the case where the device is released via a call >>>> to the release_device op as it may occur at the same time as an opposing >>>> attach_dev or detach_dev since the group mutex is not held over >>>> release_device. This was observed if the device is deconfigured during a >>>> small window during vfio-pci initialization and can result in WARNs and >>>> potential kernel panics. >>> >>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? >>> >>> Robin. >> >> So, I generally have seen the issue manifest as one of the calls >> into the iommu core from __vfio_group_unset_container >> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. >> This happens when the vfio group fd is released, which could be >> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. >> AFAICT there's nothing serializing the notion of calling into the >> iommu core here against a device that is simultaneously going >> through release_device (because we don't enter release_device with >> the group mutex held), resulting in unpredictable behavior between >> the dueling attach_dev/detach_dev and the release_device for >> s390-iommu at least. > > Oh, this is a vfio bug. I've been running with your diff applied today on s390 and this indeed fixes the issue by preventing the detach-after-release coming out of vfio. Can you send as a patch for review? > > dev->iommu_group is only a valid pointer as long as a driver is attach > to the device. > > vfio copies the dev->iommu_group into struct vfio_group during probe() > but then lets vfio_group live independently. Particularly the driver > can be removed()'d and the vfio_group keeps on going. > > Thus it is possible to UAF the iommu_group by referencing it through > the vfio_group. > > We must wait during remove for all the vfio_groups to stop > referencing iommu_group. > > Something like this or so: > > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c > index eb714a484662fc..d8f40b22c98ddb 100644 > --- a/drivers/vfio/vfio_main.c > +++ b/drivers/vfio/vfio_main.c > @@ -65,7 +65,15 @@ struct vfio_container { > struct vfio_group { > struct device dev; > struct cdev cdev; > + /* > + * When drivers is non-zero a driver is attached to the struct device > + * that provided the iommu_group and thus the iommu_group is a valid > + * pointer. When drivers is 0 the driver is being detached. Once users > + * reaches 0 then the iommu_group is invalid. > + */ > + refcount_t drivers; > refcount_t users; > + struct completion comp; > unsigned int container_users; > struct iommu_group *iommu_group; > struct vfio_container *container; > @@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) > } > EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); > > -static void vfio_group_get(struct vfio_group *group); > - > /* > * Container objects - containers are created when /dev/vfio/vfio is > * opened, but their lifecycle extends until the last user is done, so > @@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container) > /* > * Group objects - create, release, get, put, search > */ > + > + /* > + * This returns a driver reference. It can only be used in the probe function > + * of a device_driver, eg as part of the internal implementation of > + * __vfio_register_dev(). > + */ > static struct vfio_group * > __vfio_group_get_from_iommu(struct iommu_group *iommu_group) > { > struct vfio_group *group; > > list_for_each_entry(group, &vfio.group_list, vfio_next) { > - if (group->iommu_group == iommu_group) { > - vfio_group_get(group); > + if (group->iommu_group == iommu_group && > + refcount_inc_not_zero(&group->drivers)) > return group; > - } > } > return NULL; > } > @@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, > group->cdev.owner = THIS_MODULE; > > refcount_set(&group->users, 1); > + refcount_set(&group->drivers, 1); > + init_completion(&group->comp); > init_rwsem(&group->group_rwsem); > INIT_LIST_HEAD(&group->device_list); > mutex_init(&group->device_lock); > @@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, > > static void vfio_group_put(struct vfio_group *group) > { > - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) > - return; > + if (refcount_dec_and_test(&group->users)) > + complete(&group->comp); > +} > + > +/* > + * When the drivers count reaches 0 then the group must be destroyed > + * immediately. A zero driver group is a zombie awaiting destruction. > + */ > +static void vfio_group_remove(struct vfio_group *group) > +{ > + /* Matches the get from vfio_group_alloc() */ > + vfio_group_put(group); > + > + cdev_device_del(&group->cdev, &group->dev); > + > + /* > + * Before we allow the last driver in the group to be unplugged the > + * group must be sanitized so nothing else is or can reference it. This > + * is because the group->iommu_group pointer is only valid so long as a > + * VFIO device driver is attached to a device belonging to the group. > + */ > + wait_for_completion(&group->comp); > > /* > * These data structures all have paired operations that can only be > @@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group) > WARN_ON(!list_empty(&group->device_list)); > WARN_ON(group->container || group->container_users); > WARN_ON(group->notifier.head); > + group->iommu_group = NULL; > > + mutex_lock(&vfio.group_lock); > list_del(&group->vfio_next); > - cdev_device_del(&group->cdev, &group->dev); > mutex_unlock(&vfio.group_lock); > > put_device(&group->dev); > } > > -static void vfio_group_get(struct vfio_group *group) > -{ > - refcount_inc(&group->users); > -} > - > /* > * Device objects - create, release, get, put, search > */ > @@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device) > return refcount_inc_not_zero(&device->refcount); > } > > -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, > - struct device *dev) > -{ > - struct vfio_device *device; > - > - mutex_lock(&group->device_lock); > - list_for_each_entry(device, &group->device_list, group_next) { > - if (device->dev == dev && vfio_device_try_get(device)) { > - mutex_unlock(&group->device_lock); > - return device; > - } > - } > - mutex_unlock(&group->device_lock); > - return NULL; > -} > - > /* > * VFIO driver API > */ > @@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) > static int __vfio_register_dev(struct vfio_device *device, > struct vfio_group *group) > { > - struct vfio_device *existing_device; > - > + /* > + * In all cases group is the output of one of the group allocation functions > + * and we have group->drivers incremetned for us > + */ > if (IS_ERR(group)) > return PTR_ERR(group); > > @@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device, > if (!device->dev_set) > vfio_assign_device_set(device, device); > > - existing_device = vfio_group_get_device(group, device->dev); > - if (existing_device) { > - dev_WARN(device->dev, "Device already exists on group %d\n", > - iommu_group_id(group->iommu_group)); > - vfio_device_put(existing_device); > - if (group->type == VFIO_NO_IOMMU || > - group->type == VFIO_EMULATED_IOMMU) > - iommu_group_remove_device(device->dev); > - vfio_group_put(group); > - return -EBUSY; > - } > - > /* Our reference on group is moved to the device */ > device->group = group; > > @@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) > if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) > iommu_group_remove_device(device->dev); > > - /* Matches the get in vfio_register_group_dev() */ > - vfio_group_put(group); > + /* Matches the alloc get in vfio_register_group_dev() */ > + if (refcount_dec_and_test(&group->drivers)) > + vfio_group_remove(group); > } > EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); >