On 4/5/23 18:25, Alex Williamson wrote: > On Wed, 5 Apr 2023 14:04:51 +0000 > "Liu, Yi L" <yi.l.liu@xxxxxxxxx> wrote: > >> Hi Eric, >> >>> From: Eric Auger <eric.auger@xxxxxxxxxx> >>> Sent: Wednesday, April 5, 2023 8:20 PM >>> >>> Hi Yi, >>> On 4/1/23 16:44, Yi Liu wrote: >>>> for the users that accept device fds passed from management stacks to be >>>> able to figure out the host reset affected devices among the devices >>>> opened by the user. This is needed as such users do not have BDF (bus, >>>> devfn) knowledge about the devices it has opened, hence unable to use >>>> the information reported by existing VFIO_DEVICE_GET_PCI_HOT_RESET_INFO >>>> to figure out the affected devices. >>>> >>>> Signed-off-by: Yi Liu <yi.l.liu@xxxxxxxxx> >>>> --- >>>> drivers/vfio/pci/vfio_pci_core.c | 58 ++++++++++++++++++++++++++++---- >>>> include/uapi/linux/vfio.h | 24 ++++++++++++- >>>> 2 files changed, 74 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c >>>> index 19f5b075d70a..a5a7e148dce1 100644 >>>> --- a/drivers/vfio/pci/vfio_pci_core.c >>>> +++ b/drivers/vfio/pci/vfio_pci_core.c >>>> @@ -30,6 +30,7 @@ >>>> #if IS_ENABLED(CONFIG_EEH) >>>> #include <asm/eeh.h> >>>> #endif >>>> +#include <uapi/linux/iommufd.h> >>>> >>>> #include "vfio_pci_priv.h" >>>> >>>> @@ -767,6 +768,20 @@ static int vfio_pci_get_irq_count(struct >>> vfio_pci_core_device *vdev, int irq_typ >>>> return 0; >>>> } >>>> >>>> +static struct vfio_device * >>>> +vfio_pci_find_device_in_devset(struct vfio_device_set *dev_set, >>>> + struct pci_dev *pdev) >>>> +{ >>>> + struct vfio_device *cur; >>>> + >>>> + lockdep_assert_held(&dev_set->lock); >>>> + >>>> + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) >>>> + if (cur->dev == &pdev->dev) >>>> + return cur; >>>> + return NULL; >>>> +} >>>> + >>>> static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) >>>> { >>>> (*(int *)data)++; >>>> @@ -776,13 +791,20 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void >>> *data) >>>> struct vfio_pci_fill_info { >>>> int max; >>>> int cur; >>>> + bool require_devid; >>>> + struct iommufd_ctx *iommufd; >>>> + struct vfio_device_set *dev_set; >>>> struct vfio_pci_dependent_device *devices; >>>> }; >>>> >>>> static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) >>>> { >>>> struct vfio_pci_fill_info *fill = data; >>>> + struct vfio_device_set *dev_set = fill->dev_set; >>>> struct iommu_group *iommu_group; >>>> + struct vfio_device *vdev; >>>> + >>>> + lockdep_assert_held(&dev_set->lock); >>>> >>>> if (fill->cur == fill->max) >>>> return -EAGAIN; /* Something changed, try again */ >>>> @@ -791,7 +813,21 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void >>> *data) >>>> if (!iommu_group) >>>> return -EPERM; /* Cannot reset non-isolated devices */ >>>> >>>> - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); >>>> + if (fill->require_devid) { >>>> + /* >>>> + * Report dev_id of the devices that are opened as cdev >>>> + * and have the same iommufd with the fill->iommufd. >>>> + * Otherwise, just fill IOMMUFD_INVALID_ID. >>>> + */ >>>> + vdev = vfio_pci_find_device_in_devset(dev_set, pdev); >>>> + if (vdev && vfio_device_cdev_opened(vdev) && >>>> + fill->iommufd == vfio_iommufd_physical_ictx(vdev)) >>>> + vfio_iommufd_physical_devid(vdev, &fill->devices[fill- >>>> cur].dev_id); >>>> + else >>>> + fill->devices[fill->cur].dev_id = IOMMUFD_INVALID_ID; >>>> + } else { >>>> + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); >>>> + } >>>> fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); >>>> fill->devices[fill->cur].bus = pdev->bus->number; >>>> fill->devices[fill->cur].devfn = pdev->devfn; >>>> @@ -1230,17 +1266,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( >>>> return -ENOMEM; >>>> >>>> fill.devices = devices; >>>> + fill.dev_set = vdev->vdev.dev_set; >>>> >>>> + mutex_lock(&vdev->vdev.dev_set->lock); >>>> + if (vfio_device_cdev_opened(&vdev->vdev)) { >>>> + fill.require_devid = true; >>>> + fill.iommufd = vfio_iommufd_physical_ictx(&vdev->vdev); >>>> + } >>>> ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, >>>> &fill, slot); >>>> + mutex_unlock(&vdev->vdev.dev_set->lock); >>>> >>>> /* >>>> * If a device was removed between counting and filling, we may come up >>>> * short of fill.max. If a device was added, we'll have a return of >>>> * -EAGAIN above. >>>> */ >>>> - if (!ret) >>>> + if (!ret) { >>>> hdr.count = fill.cur; >>>> + if (fill.require_devid) >>>> + hdr.flags = VFIO_PCI_HOT_RESET_FLAG_IOMMUFD_DEV_ID; >>>> + } >>>> >>>> reset_info_exit: >>>> if (copy_to_user(arg, &hdr, minsz)) >>>> @@ -2346,12 +2392,10 @@ static bool vfio_dev_in_files(struct >>> vfio_pci_core_device *vdev, >>>> static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) >>>> { >>>> struct vfio_device_set *dev_set = data; >>>> - struct vfio_device *cur; >>>> >>>> - list_for_each_entry(cur, &dev_set->device_list, dev_set_list) >>>> - if (cur->dev == &pdev->dev) >>>> - return 0; >>>> - return -EBUSY; >>>> + lockdep_assert_held(&dev_set->lock); >>>> + >>>> + return vfio_pci_find_device_in_devset(dev_set, pdev) ? 0 : -EBUSY; >>>> } >>>> >>>> /* >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h >>>> index 25432ef213ee..5a34364e3b94 100644 >>>> --- a/include/uapi/linux/vfio.h >>>> +++ b/include/uapi/linux/vfio.h >>>> @@ -650,11 +650,32 @@ enum { >>>> * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, >>>> * struct vfio_pci_hot_reset_info) >>>> * >>>> + * This command is used to query the affected devices in the hot reset for >>>> + * a given device. User could use the information reported by this command >>>> + * to figure out the affected devices among the devices it has opened. the 'opened' terminology does not look sufficient here because it is not only a matter of the device being opened using cdev but it also needs to have been bound to an iommufd, dev_id being the output of the dev-iommufd binding. By the way I am now confused. What does happen if the reset impact some devices which are not bound to an iommu ctx. Previously we returned the iommu group which always pre-exists but now you will report invalid id? >>>> + * This command always reports the segment, bus and devfn information for >>>> + * each affected device, and selectively report the group_id or the dev_id >>>> + * per the way how the device being queried is opened. >>>> + * - If the device is opened via the traditional group/container manner, >>>> + * this command reports the group_id for each affected device. >>>> + * >>>> + * - If the device is opened as a cdev, this command needs to report >>> s/needs to report/reports >> got it. >> >>>> + * dev_id for each affected device and set the >>>> + * VFIO_PCI_HOT_RESET_FLAG_IOMMUFD_DEV_ID flag. For the affected >>>> + * devices that are not opened as cdev or bound to different iommufds >>>> + * with the device that is queried, report an invalid dev_id to avoid or not bound at all >>> s/bound to different iommufds with the device that is queried/bound to >>> iommufds different from the reset device one? >> hmmm, I'm not a native speaker here. This _INFO is to query if want >> hot reset a given device, what devices would be affected. So it appears >> the queried device is better. But I'd admit "the queried device" is also >> "the reset device". may Alex help pick one. 😊 > - If the calling device is opened directly via cdev rather than > accessed through the vfio group, the returned > vfio_pci_depdendent_device structure reports the dev_id > rather than the group_id, which is indicated by the > VFIO_PCI_HOT_RESET_FLAG_IOMMUFD_DEV_ID flag in > vfio_pci_hot_reset_info. If the reset affects devices that > are not opened within the same iommufd context as the calling > device, IOMMUFD_INVALID_ID will be provided as the dev_id. > > But that kind of brings to light the question of what does the user do > when they encounter this situation. If the device is not opened, the > reset can complete. If the device is opened by a different user, the > reset is blocked. The only logical conclusion is that the user should > try the reset regardless of the result of the info ioctl, which the > null-array approach further solidifies as the direction of the API. > I'm not liking this. Thanks, > > Alex Thanks Eric > > >>>> + * potential dev_id conflict as dev_id is local to iommufd. For such >>>> + * affected devices, user shall fall back to use the segment, bus and >>>> + * devfn info to map it to opened device. >>>> + * >>>> * Return: 0 on success, -errno on failure: >>>> * -enospc = insufficient buffer, -enodev = unsupported for device. >>>> */ >>>> struct vfio_pci_dependent_device { >>>> - __u32 group_id; >>>> + union { >>>> + __u32 group_id; >>>> + __u32 dev_id; >>>> + }; >>>> __u16 segment; >>>> __u8 bus; >>>> __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ >>>> @@ -663,6 +684,7 @@ struct vfio_pci_dependent_device { >>>> struct vfio_pci_hot_reset_info { >>>> __u32 argsz; >>>> __u32 flags; >>>> +#define VFIO_PCI_HOT_RESET_FLAG_IOMMUFD_DEV_ID (1 << 0) >>>> __u32 count; >>>> struct vfio_pci_dependent_device devices[]; >>>> }; >>> Eric