> From: Jason Gunthorpe <jgg@xxxxxxxxxx> > Sent: Friday, May 6, 2022 8:25 AM > > This is necessary to avoid various user triggerable races, for instance > racing SET_CONTAINER/UNSET_CONTAINER: > > ioctl(VFIO_GROUP_SET_CONTAINER) > ioctl(VFIO_GROUP_UNSET_CONTAINER) > vfio_group_unset_container > int users = atomic_cmpxchg(&group->container_users, 1, 0); > // users == 1 container_users == 0 > __vfio_group_unset_container(group); > > vfio_group_set_container() > if (atomic_read(&group->container_users)) if (!atomic_read(...)) > down_write(&container->group_lock); > group->container = container; > up_write(&container->group_lock); > It'd be clearer to add below step here: container = group->container; otherwise if this assignment is done before above race then the original container is not leaked. > down_write(&container->group_lock); > group->container = NULL; > up_write(&container->group_lock); > vfio_container_put(container); > /* woops we leaked the original container */ > > This can then go on to NULL pointer deref since container == 0 and > container_users == 1. > > Wrap all touches of container, except those on a performance path with a > known open device, with the group_rwsem. > > The only user of vfio_group_add_container_user() holds the user count for > a simple operation, change it to just hold the group_lock over the > operation and delete vfio_group_add_container_user(). Containers now only > gain a user when a device FD is opened. > > Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx> Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx> > --- > drivers/vfio/vfio.c | 66 +++++++++++++++++++++++++++------------------ > 1 file changed, 40 insertions(+), 26 deletions(-) > > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c > index d8d14e528ab795..63f7fa872eae60 100644 > --- a/drivers/vfio/vfio.c > +++ b/drivers/vfio/vfio.c > @@ -937,6 +937,8 @@ static void __vfio_group_unset_container(struct > vfio_group *group) > struct vfio_container *container = group->container; > struct vfio_iommu_driver *driver; > > + lockdep_assert_held_write(&group->group_rwsem); > + > down_write(&container->group_lock); > > driver = container->iommu_driver; > @@ -973,6 +975,8 @@ static int vfio_group_unset_container(struct > vfio_group *group) > { > int users = atomic_cmpxchg(&group->container_users, 1, 0); > > + lockdep_assert_held_write(&group->group_rwsem); > + > if (!users) > return -EINVAL; > if (users != 1) > @@ -991,8 +995,10 @@ static int vfio_group_unset_container(struct > vfio_group *group) > */ > static void vfio_group_try_dissolve_container(struct vfio_group *group) > { > + down_write(&group->group_rwsem); > if (0 == atomic_dec_if_positive(&group->container_users)) > __vfio_group_unset_container(group); > + up_write(&group->group_rwsem); > } > > static int vfio_group_set_container(struct vfio_group *group, int > container_fd) > @@ -1002,6 +1008,8 @@ static int vfio_group_set_container(struct > vfio_group *group, int container_fd) > struct vfio_iommu_driver *driver; > int ret = 0; > > + lockdep_assert_held_write(&group->group_rwsem); > + > if (atomic_read(&group->container_users)) > return -EINVAL; > > @@ -1059,23 +1067,6 @@ static int vfio_group_set_container(struct > vfio_group *group, int container_fd) > return ret; > } > > -static int vfio_group_add_container_user(struct vfio_group *group) > -{ > - if (!atomic_inc_not_zero(&group->container_users)) > - return -EINVAL; > - > - if (group->type == VFIO_NO_IOMMU) { > - atomic_dec(&group->container_users); > - return -EPERM; > - } > - if (!group->container->iommu_driver) { > - atomic_dec(&group->container_users); > - return -EINVAL; > - } > - > - return 0; > -} > - > static const struct file_operations vfio_device_fops; > > /* true if the vfio_device has open_device() called but not close_device() */ > @@ -1088,6 +1079,8 @@ static int vfio_device_assign_container(struct > vfio_device *device) > { > struct vfio_group *group = device->group; > > + lockdep_assert_held_write(&group->group_rwsem); > + > if (0 == atomic_read(&group->container_users) || > !group->container->iommu_driver) > return -EINVAL; > @@ -1109,7 +1102,9 @@ static struct file *vfio_device_open(struct > vfio_device *device) > struct file *filep; > int ret; > > + down_write(&device->group->group_rwsem); > ret = vfio_device_assign_container(device); > + up_write(&device->group->group_rwsem); > if (ret) > return ERR_PTR(ret); > > @@ -1219,11 +1214,13 @@ static long vfio_group_fops_unl_ioctl(struct file > *filep, > > status.flags = 0; > > + down_read(&group->group_rwsem); > if (group->container) > status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET > | > VFIO_GROUP_FLAGS_VIABLE; > else if (!iommu_group_dma_owner_claimed(group- > >iommu_group)) > status.flags |= VFIO_GROUP_FLAGS_VIABLE; > + up_read(&group->group_rwsem); > > if (copy_to_user((void __user *)arg, &status, minsz)) > return -EFAULT; > @@ -1241,11 +1238,15 @@ static long vfio_group_fops_unl_ioctl(struct file > *filep, > if (fd < 0) > return -EINVAL; > > + down_write(&group->group_rwsem); > ret = vfio_group_set_container(group, fd); > + up_write(&group->group_rwsem); > break; > } > case VFIO_GROUP_UNSET_CONTAINER: > + down_write(&group->group_rwsem); > ret = vfio_group_unset_container(group); > + up_write(&group->group_rwsem); > break; > case VFIO_GROUP_GET_DEVICE_FD: > { > @@ -1731,15 +1732,19 @@ bool vfio_file_enforced_coherent(struct file *file) > if (file->f_op != &vfio_group_fops) > return true; > > - /* > - * Since the coherency state is determined only once a container is > - * attached the user must do so before they can prove they have > - * permission. > - */ > - if (vfio_group_add_container_user(group)) > - return true; > - ret = vfio_ioctl_check_extension(group->container, > VFIO_DMA_CC_IOMMU); > - vfio_group_try_dissolve_container(group); > + down_read(&group->group_rwsem); > + if (group->container) { > + ret = vfio_ioctl_check_extension(group->container, > + VFIO_DMA_CC_IOMMU); > + } else { > + /* > + * Since the coherency state is determined only once a > container > + * is attached the user must do so before they can prove they > + * have permission. > + */ > + ret = true; > + } > + up_read(&group->group_rwsem); > return ret; > } > EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); > @@ -1932,6 +1937,7 @@ int vfio_pin_pages(struct vfio_device *device, > unsigned long *user_pfn, > if (group->dev_counter > 1) > return -EINVAL; > > + /* group->container cannot change while a vfio device is open */ > container = group->container; > driver = container->iommu_driver; > if (likely(driver && driver->ops->pin_pages)) > @@ -1967,6 +1973,7 @@ int vfio_unpin_pages(struct vfio_device *device, > unsigned long *user_pfn, > if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) > return -E2BIG; > > + /* group->container cannot change while a vfio device is open */ > container = device->group->container; > driver = container->iommu_driver; > if (likely(driver && driver->ops->unpin_pages)) > @@ -2006,6 +2013,7 @@ int vfio_dma_rw(struct vfio_device *device, > dma_addr_t user_iova, void *data, > if (!data || len <= 0 || !vfio_assert_device_open(device)) > return -EINVAL; > > + /* group->container cannot change while a vfio device is open */ > container = device->group->container; > driver = container->iommu_driver; > > @@ -2026,6 +2034,7 @@ static int vfio_register_iommu_notifier(struct > vfio_group *group, > struct vfio_iommu_driver *driver; > int ret; > > + down_read(&group->group_rwsem); > container = group->container; > driver = container->iommu_driver; > if (likely(driver && driver->ops->register_notifier)) > @@ -2033,6 +2042,8 @@ static int vfio_register_iommu_notifier(struct > vfio_group *group, > events, nb); > else > ret = -ENOTTY; > + up_read(&group->group_rwsem); > + > return ret; > } > > @@ -2043,6 +2054,7 @@ static int vfio_unregister_iommu_notifier(struct > vfio_group *group, > struct vfio_iommu_driver *driver; > int ret; > > + down_read(&group->group_rwsem); > container = group->container; > driver = container->iommu_driver; > if (likely(driver && driver->ops->unregister_notifier)) > @@ -2050,6 +2062,8 @@ static int vfio_unregister_iommu_notifier(struct > vfio_group *group, > nb); > else > ret = -ENOTTY; > + up_read(&group->group_rwsem); > + > return ret; > } > > -- > 2.36.0