On Thu, 5 Aug 2021 08:47:01 -0300 Jason Gunthorpe <jgg@xxxxxxxxxx> wrote: > On Tue, Aug 03, 2021 at 10:52:25AM -0600, Alex Williamson wrote: > > On Tue, 3 Aug 2021 13:41:52 -0300 > > Jason Gunthorpe <jgg@xxxxxxxxxx> wrote: > > > On Tue, Aug 03, 2021 at 10:34:06AM -0600, Alex Williamson wrote: > > > > I think the vfio_pci_find_reset_target() function needs to be re-worked > > > > to just tell us true/false that it's ok to reset the provided device, > > > > not to anoint an arbitrary target device. Thanks, > > > > > > Yes, though this logic is confusing, why do we need to check if any > > > device needs a reset at this point? If we are being asked to reset > > > vdev shouldn't vdev needs_reset? > > > > > > Or is the function more of a 'synchronize pending reset' kind of > > > thing? > > > > Yes, the latter. For instance think about a multi-function PCI device > > such as a GPU. The functions have dramatically different capabilities, > > some might have function level reset abilities and others not. We want > > to be able to trigger a bus reset as the last device of the set is > > released, no matter the order they're released and no matter the > > capabilities of the device we're currently processing. Thanks, > > I worked on this for awhile, I think this is much clearer about what > this algorithm is trying to do: > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 5d6db93d6c680f..e418bcbb68facc 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -223,7 +223,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) > } > } > > -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); > +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); > static void vfio_pci_disable(struct vfio_pci_device *vdev); > static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); > > @@ -404,6 +404,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; > int i, bar; > > + /* For needs_reset */ > + lockdep_assert_held(&vdev->vdev.dev_set->lock); > + > /* Stop the device from further DMA */ > pci_clear_master(pdev); > > @@ -487,9 +490,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > out: > pci_disable_device(pdev); > > - vfio_pci_try_bus_reset(vdev); > - > - if (!disable_idle_d3) > + if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) > vfio_pci_set_power_state(vdev, PCI_D3hot); > } > > @@ -2145,36 +2146,6 @@ static struct pci_driver vfio_pci_driver = { > .err_handler = &vfio_err_handlers, > }; > > -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) > -{ > - struct vfio_devices *devs = data; > - struct vfio_device *device; > - struct vfio_pci_device *vdev; > - > - if (devs->cur_index == devs->max_index) > - return -ENOSPC; > - > - device = vfio_device_get_from_dev(&pdev->dev); > - if (!device) > - return -EINVAL; > - > - if (pci_dev_driver(pdev) != &vfio_pci_driver) { > - vfio_device_put(device); > - return -EBUSY; > - } > - > - vdev = container_of(device, struct vfio_pci_device, vdev); > - > - /* Fault if the device is not unused */ > - if (device->open_count) { > - vfio_device_put(device); > - return -EBUSY; > - } > - > - devs->devices[devs->cur_index++] = vdev; > - return 0; > -} > - > static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) > { > struct vfio_devices *devs = data; > @@ -2208,79 +2179,86 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) > return 0; > } > > +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) > +{ > + struct vfio_device_set *dev_set = data; > + struct vfio_device *cur; > + > + lockdep_assert_held(&dev_set->lock); > + > + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) > + if (cur->dev == &pdev->dev) > + return 0; > + return -EBUSY; > +} > + > +static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) Slight nit on the name here since we're essentially combining needs_reset along with the notion of the device being unused. I'm not sure, maybe "should_reset"? Otherwise it looks ok. Thanks, Alex > +{ > + struct vfio_pci_device *cur; > + bool needs_reset = false; > + > + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { > + /* No VFIO device in the set can have an open device FD */ > + if (cur->vdev.open_count) > + return false; > + needs_reset |= cur->needs_reset; > + } > + return needs_reset; > +} > + > /* > - * If a bus or slot reset is available for the provided device and: > + * If a bus or slot reset is available for the provided dev_set and: > * - All of the devices affected by that bus or slot reset are unused > - * (!refcnt) > * - At least one of the affected devices is marked dirty via > * needs_reset (such as by lack of FLR support) > - * Then attempt to perform that bus or slot reset. Callers are required > - * to hold vdev->dev_set->lock, protecting the bus/slot reset group from > - * concurrent opens. A vfio_device reference is acquired for each device > - * to prevent unbinds during the reset operation. > - * > - * NB: vfio-core considers a group to be viable even if some devices are > - * bound to drivers like pci-stub or pcieport. Here we require all devices > - * to be bound to vfio_pci since that's the only way we can be sure they > - * stay put. > + * Then attempt to perform that bus or slot reset. > + * Returns true if the dev_set was reset. > */ > -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) > +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) > { > - struct vfio_devices devs = { .cur_index = 0 }; > - int i = 0, ret = -EINVAL; > - bool slot = false; > - struct vfio_pci_device *tmp; > + struct vfio_pci_device *cur; > + struct pci_dev *pdev; > + int ret; > > - if (!pci_probe_reset_slot(vdev->pdev->slot)) > - slot = true; > - else if (pci_probe_reset_bus(vdev->pdev->bus)) > - return; > + lockdep_assert_held(&dev_set->lock); > > - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, > - &i, slot) || !i) > - return; > + /* > + * By definition all PCI devices in the dev_set share the same PCI > + * reset, so any pci_dev will have the same outcomes for > + * pci_probe_reset_*() and pci_reset_bus(). > + */ > + pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device, > + vdev.dev_set_list)->pdev; > > - devs.max_index = i; > - devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); > - if (!devs.devices) > - return; > + /* Reset of the dev_set is possible */ > + if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) > + return false; > > - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, > - vfio_pci_get_unused_devs, > - &devs, slot)) > - goto put_devs; > + if (!vfio_pci_dev_set_needs_reset(dev_set)) > + return false; > > - /* Does at least one need a reset? */ > - for (i = 0; i < devs.cur_index; i++) { > - tmp = devs.devices[i]; > - if (tmp->needs_reset) { > - ret = pci_reset_bus(vdev->pdev); > - break; > - } > + /* > + * vfio-core considers a group to be viable and will create a > + * vfio_device even if some devices are bound to drivers like pci-stub > + * or pcieport. Here we require all PCI devices to be inside our dev_set > + * since that ensures they stay put and that every driver controlling > + * the device can co-ordinate with the device reset. > + */ > + if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, > + dev_set, > + !pci_probe_reset_slot(pdev->slot))) > + return false; > + > + ret = pci_reset_bus(pdev); > + if (ret) > + return false; > + > + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { > + cur->needs_reset = false; > + if (!disable_idle_d3) > + vfio_pci_set_power_state(cur, PCI_D3hot); > } > - > -put_devs: > - for (i = 0; i < devs.cur_index; i++) { > - tmp = devs.devices[i]; > - > - /* > - * If reset was successful, affected devices no longer need > - * a reset and we should return all the collateral devices > - * to low power. If not successful, we either didn't reset > - * the bus or timed out waiting for it, so let's not touch > - * the power state. > - */ > - if (!ret) { > - tmp->needs_reset = false; > - > - if (tmp != vdev && !disable_idle_d3) > - vfio_pci_set_power_state(tmp, PCI_D3hot); > - } > - > - vfio_device_put(&tmp->vdev); > - } > - > - kfree(devs.devices); > + return true; > } > > static void __exit vfio_pci_cleanup(void) >