> From: Alex Williamson <alex.williamson@xxxxxxxxxx> > Sent: Tuesday, March 28, 2023 4:41 AM > > On Mon, 27 Mar 2023 13:26:19 -0600 > Alex Williamson <alex.williamson@xxxxxxxxxx> wrote: > [...] > > > diff --git a/drivers/vfio/pci/vfio_pci_core.c > b/drivers/vfio/pci/vfio_pci_core.c > > > index 19f5b075d70a..45edf4e9b98b 100644 > > > --- a/drivers/vfio/pci/vfio_pci_core.c > > > +++ b/drivers/vfio/pci/vfio_pci_core.c > > > @@ -1181,6 +1181,102 @@ static int vfio_pci_ioctl_reset(struct > vfio_pci_core_device *vdev, > > > return ret; > > > } > > > > > > +static struct pci_dev * > > > +vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set); > > > + > > > +static int vfio_pci_ioctl_get_pci_hot_reset_group_info( > > > + struct vfio_pci_core_device *vdev, > > > + struct vfio_pci_hot_reset_group_info __user *arg) > > > +{ > > > + unsigned long minsz = > > > + offsetofend(struct vfio_pci_hot_reset_group_info, count); > > > + struct vfio_pci_hot_reset_group_info hdr; > > > + struct iommufd_ctx *iommufd, *cur_iommufd; > > > + u32 count = 0, index = 0, *devices = NULL; > > > + struct vfio_pci_core_device *cur; > > > + bool slot = false; > > > + int ret = 0; > > > + > > > + if (copy_from_user(&hdr, arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (hdr.argsz < minsz) > > > + return -EINVAL; > > > + > > > + hdr.flags = 0; > > > + > > > + /* Can we do a slot or bus reset or neither? */ > > > + if (!pci_probe_reset_slot(vdev->pdev->slot)) > > > + slot = true; > > > + else if (pci_probe_reset_bus(vdev->pdev->bus)) > > > + return -ENODEV; > > > + > > > + mutex_lock(&vdev->vdev.dev_set->lock); > > > + if (!vfio_pci_dev_set_resettable(vdev->vdev.dev_set)) { [1] > > > + ret = -EPERM; > > > + goto out_unlock; > > > + } > > > + > > > + iommufd = vfio_iommufd_physical_ictx(&vdev->vdev); > > > + if (!iommufd) { > > > + ret = -EPERM; > > > + goto out_unlock; > > > + } > > > + > > > + /* How many devices are affected? */ > > > + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, > vfio_pci_count_devs, > > > + &count, slot); > > > + if (ret) > > > + goto out_unlock; > > > + > > > + WARN_ON(!count); /* Should always be at least one */ > > > + > > > + /* > > > + * If there's enough space, fill it now, otherwise return -ENOSPC and > > > + * the number of devices affected. > > > + */ > > > + if (hdr.argsz < sizeof(hdr) + (count * sizeof(*devices))) { > > > + ret = -ENOSPC; > > > + hdr.count = count; > > > + goto reset_info_exit; > > > + } > > > + > > > + devices = kcalloc(count, sizeof(*devices), GFP_KERNEL); > > > + if (!devices) { > > > + ret = -ENOMEM; > > > + goto reset_info_exit; > > > + } > > > + > > > + list_for_each_entry(cur, &vdev->vdev.dev_set->device_list, > vdev.dev_set_list) { > > > + cur_iommufd = vfio_iommufd_physical_ictx(&cur->vdev); > > > + if (cur->vdev.open_count) { > > > + if (cur_iommufd != iommufd) { > > > + ret = -EPERM; > > > + break; > > > + } > > > + ret = vfio_iommufd_physical_devid(&cur->vdev, > &devices[index]); > > > + if (ret) > > > + break; > > > + index++; > > > + } > > > + } > > This also makes use of vfio_pci_for_each_slot_or_bus() to iterate > affected devices, but then still assumes those affected devices are > necessarily represented in the dev_set. For example, a two-function > device with ACS isolation can have function 0 bound to vfio and > function 1 bound to a native host driver. The above code requires the > user to pass a buffer large enough for both functions, but only > function 0 is part of the dev_set, so function 1 is not reported as > affected, nor does it generate an error. The vfio_pci_dev_set_resettable() is used at [1] to check if all the affected devices are in the dev_set. If not, then it fails at the first place. So in the following code, looping the devices in the dev_set is equivalent to looping devices with vfio_pci_for_each_slot_or_bus(). I need to loop dev_set as dev_set has the vfio_device which is more convenient to get dev_id. > > It looks like we also fail to set hdr.count except in the error path > above, so the below copy_to_user() copies a user specified range beyond > the end the allocated devices buffer out to userspace! Thanks, Oops, yes, it is. My test only has one device affected, so it does not hit the problem. ☹ Thanks, Yi Liu > Alex > > > > + > > > +reset_info_exit: > > > + if (copy_to_user(arg, &hdr, minsz)) > > > + ret = -EFAULT; > > > + > > > + if (!ret) { > > > + if (copy_to_user(&arg->devices, devices, > > > + hdr.count * sizeof(*devices))) > > > + ret = -EFAULT; > > > + } > > > + > > > + kfree(devices); > > > +out_unlock: > > > + mutex_unlock(&vdev->vdev.dev_set->lock); > > > + return ret; > > > +} > > > + > > > static int vfio_pci_ioctl_get_pci_hot_reset_info( > > > struct vfio_pci_core_device *vdev, > > > struct vfio_pci_hot_reset_info __user *arg) > > > @@ -1404,6 +1500,8 @@ long vfio_pci_core_ioctl(struct vfio_device > *core_vdev, unsigned int cmd, > > > return vfio_pci_ioctl_get_irq_info(vdev, uarg); > > > case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: > > > return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); > > > + case VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO: > > > + return vfio_pci_ioctl_get_pci_hot_reset_group_info(vdev, > uarg); > > > case VFIO_DEVICE_GET_REGION_INFO: > > > return vfio_pci_ioctl_get_region_info(vdev, uarg); > > > case VFIO_DEVICE_IOEVENTFD: > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > index 25432ef213ee..61b801dfd40b 100644 > > > --- a/include/uapi/linux/vfio.h > > > +++ b/include/uapi/linux/vfio.h > > > @@ -669,6 +669,53 @@ struct vfio_pci_hot_reset_info { > > > > > > #define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, > VFIO_BASE + 12) > > > > > > +/** > > > + * VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO - > _IOWR(VFIO_TYPE, VFIO_BASE + 12, > > > + * struct > vfio_pci_hot_reset_group_info) > > > + * > > > + * This is used in the vfio device cdev mode. It returns the list of > > > + * affected devices (represented by iommufd dev_id) when hot reset is > > > + * issued on the current device with which this ioctl is invoked. It > > > + * only includes the devices that are opened by the current user in the > > > + * time of this command is invoked. This list may change when user > opens > > > + * new device or close opened device, hence user should re-invoke it > > > + * after open/close devices. This command has no guarantee on the > result > > > + * of VFIO_DEVICE_PCI_HOT_RESET since the not-opened affected > device can > > > + * be by other users in the window between the two ioctls. If the > affected > > > + * devices are opened by multiple users, the > VFIO_DEVICE_PCI_HOT_RESET > > > + * shall fail, detail can check the description of > VFIO_DEVICE_PCI_HOT_RESET. > > > + * > > > + * For the users that open vfio group/container, this ioctl is useless as > > > + * they have no idea about the iommufd dev_id returned by this ioctl. > For > > > + * the users of the traditional mode vfio group/container, this ioctl will > > > + * fail as this mode does not use iommufd hence no dev_id to report > back. > > > + * For the users of the iommufd compat mode vfio group/container, > this ioctl > > > + * would succeed as this mode uses iommufd as container fd. But such > users > > > + * still have no idea about the iommufd dev_id as the dev_id is only > stored > > > + * in kernel in this mode. For the users of the vfio group/container, the > > > + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO should be used to know > the hot reset > > > + * affected devices. > > > + * > > > + * Return: 0 on success, -errno on failure: > > > + * -enospc = insufficient buffer; > > > + * -enodev = unsupported for device; > > > + * -eperm = no permission for device, this error comes: > > > + * - when there are affected devices that are opened but > > > + * not bound to the same iommufd with the current device > > > + * with which this ioctl is invoked, > > > + * - there are affected devices that are not bound to vfio > > > + * driver yet. > > > + * - no valid iommufd is bound (e.g. noiommu mode) > > > + */ > > > +struct vfio_pci_hot_reset_group_info { > > > + __u32 argsz; > > > + __u32 flags; > > > + __u32 count; > > > + __u32 devices[]; > > > +}; > > > + > > > +#define VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO > _IO(VFIO_TYPE, VFIO_BASE + 18) > > > + > > > /** > > > * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, > > > * struct vfio_pci_hot_reset) > >