Hi Alex, On 11/9/18 11:09 PM, Alex Williamson wrote: > In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and > remove") a mutex was added to freeze the refcnt for a device so that > we can handle errors and perform bus resets on final close. However, > bus resets can be rather slow and a global mutex here is undesirable. > A per-device mutex provides the best granularity, but then our chances > of triggering a bus/slot reset with multiple affected devices is slim > when devices are released in parallel. Sorry I don't get the above sentence. Instead create a reflck object > shared among all devices under the same bus or slot, allowing devices > on independent buses to be released in parallel while serializing per > bus/slot.> > Reported-by: Christian Ehrhardt <christian.ehrhardt@xxxxxxxxxxxxx> > Tested-by: Christian Ehrhardt <christian.ehrhardt@xxxxxxxxxxxxx> > Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> > --- > drivers/vfio/pci/vfio_pci.c | 157 ++++++++++++++++++++++++++++++----- > drivers/vfio/pci/vfio_pci_private.h | 6 + > 2 files changed, 139 insertions(+), 24 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 50cdedfca9fe..d443fb7a4e75 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); > MODULE_PARM_DESC(disable_idle_d3, > "Disable using the PCI D3 low power state for idle, unused devices"); > > -static DEFINE_MUTEX(driver_lock); > - > static inline bool vfio_vga_disabled(void) > { > #ifdef CONFIG_VFIO_PCI_VGA > @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data) > { > struct vfio_pci_device *vdev = device_data; > > - mutex_lock(&driver_lock); > + mutex_lock(&vdev->reflck->lock); > > if (!(--vdev->refcnt)) { > vfio_spapr_pci_eeh_release(vdev->pdev); > vfio_pci_disable(vdev); > } > > - mutex_unlock(&driver_lock); > + mutex_unlock(&vdev->reflck->lock); > > module_put(THIS_MODULE); > } > @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data) > if (!try_module_get(THIS_MODULE)) > return -ENODEV; > > - mutex_lock(&driver_lock); > + mutex_lock(&vdev->reflck->lock); > > if (!vdev->refcnt) { > ret = vfio_pci_enable(vdev); > @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data) > } > vdev->refcnt++; > error: > - mutex_unlock(&driver_lock); > + mutex_unlock(&vdev->reflck->lock); > if (ret) > module_put(THIS_MODULE); > return ret; > @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = { > .request = vfio_pci_request, > }; > > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); > + > static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > { > struct vfio_pci_device *vdev; > @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > return ret; > } > > + ret = vfio_pci_reflck_attach(vdev); > + if (ret) { > + vfio_del_group_dev(&pdev->dev); > + vfio_iommu_group_put(group, &pdev->dev); > + kfree(vdev); > + return ret; > + } > + > if (vfio_pci_is_vga(pdev)) { > vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); > vga_set_legacy_decoding(pdev, > @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev) > if (!vdev) > return; > > + vfio_pci_reflck_put(vdev->reflck); > + > vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); > kfree(vdev->region); > mutex_destroy(&vdev->ioeventfds_lock); > @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = { > .err_handler = &vfio_err_handlers, > }; > > +static DEFINE_MUTEX(reflck_lock); > + > +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void) > +{ > + struct vfio_pci_reflck *reflck; > + > + reflck = kzalloc(sizeof(*reflck), GFP_KERNEL); > + if (!reflck) > + return ERR_PTR(-ENOMEM); > + > + kref_init(&reflck->kref); > + mutex_init(&reflck->lock); > + > + return reflck; > +} > + > +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck) > +{ > + kref_get(&reflck->kref); > +} > + > +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) > +{ > + struct vfio_pci_reflck **preflck = data; > + struct vfio_device *device; > + struct vfio_pci_device *vdev; > + > + device = vfio_device_get_from_dev(&pdev->dev); > + if (!device) > + return 0; > + > + if (pci_dev_driver(pdev) != &vfio_pci_driver) { > + vfio_device_put(device); > + return 0; > + } > + > + vdev = vfio_device_data(device); > + > + if (vdev->reflck) { > + vfio_pci_reflck_get(vdev->reflck); > + *preflck = vdev->reflck; > + vfio_device_put(device); > + return 1; > + } > + > + vfio_device_put(device); > + return 0; > +} > + > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev) > +{ > + bool slot = !pci_probe_reset_slot(vdev->pdev->slot); > + > + mutex_lock(&reflck_lock); > + > + if (pci_is_root_bus(vdev->pdev->bus) || > + vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find, > + &vdev->reflck, slot) <= 0) !vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find, &vdev->reflck, slot)? I don't think we can have negative returned value. > + vdev->reflck = vfio_pci_reflck_alloc(); > + > + mutex_unlock(&reflck_lock); > + > + return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0; > +} > + > +static void vfio_pci_reflck_release(struct kref *kref) > +{ > + struct vfio_pci_reflck *reflck = container_of(kref, > + struct vfio_pci_reflck, > + kref); > + > + kfree(reflck); > + mutex_unlock(&reflck_lock); > +} > + > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck) > +{ > + kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock); > +} > + > struct vfio_devices { > struct vfio_device **devices; > int cur_index; > int max_index; > }; > > -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) > +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) > { > struct vfio_devices *devs = data; > struct vfio_device *device; > + struct vfio_pci_device *vdev; > > if (devs->cur_index == devs->max_index) > return -ENOSPC; > @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) > return -EBUSY; > } > > + vdev = vfio_device_data(device); > + > + /* Only collect unused devices */ abort if the slot/bus features a used device? > + if (vdev->refcnt) { > + vfio_device_put(device); > + return -EBUSY; > + } > + > devs->devices[devs->cur_index++] = device; > return 0; > } > > /* > - * Attempt to do a bus/slot reset if there are devices affected by a reset for > - * this device that are needs_reset and all of the affected devices are unused > - * (!refcnt). Callers are required to hold driver_lock when calling this to > - * prevent device opens and concurrent bus reset attempts. We prevent device > - * unbinds by acquiring and holding a reference to the vfio_device. > + * Attempt to do a bus/slot reset if there are devices affected by a reset > + * for this device that are "needs_reset" and all of the affected devices > + * are unused (!refcnt). This comment still sounds a bit cryptic to me. Assuming I got the point, may I suggest something like: If one of the device of the slot/bus needs a reset (but cannot be reset at function level) and all the devices of the slot/bus are unused (!refcount), we attempt to do a bus/slot reset. Callers are required to hold vdev->reflck->lock > + * to prevent concurrent device opens, which is expected to protect all > + * affected devices. A vfio_device reference is also acquired for each > + * affected device to prevent unbinds. > * > * NB: vfio-core considers a group to be viable even if some devices are > * bound to drivers like pci-stub or pcieport. Here we require all devices > @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) > { > struct vfio_devices devs = { .cur_index = 0 }; > int i = 0, ret = -EINVAL; > - bool needs_reset = false, slot = false; > + bool slot = false; > struct vfio_pci_device *tmp; > > if (!pci_probe_reset_slot(vdev->pdev->slot)) > @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) > return; > > if (vfio_pci_for_each_slot_or_bus(vdev->pdev, > - vfio_pci_get_devs, &devs, slot)) > + vfio_pci_get_unused_devs, > + &devs, slot)) > goto put_devs; > > + /* Does at least one need a reset? */ > for (i = 0; i < devs.cur_index; i++) { > tmp = vfio_device_data(devs.devices[i]); > - if (tmp->needs_reset) > - needs_reset = true; > - if (tmp->refcnt) > - goto put_devs; > + if (tmp->needs_reset) { > + ret = pci_reset_bus(vdev->pdev); > + break; > + } > } > > - if (needs_reset) > - ret = pci_reset_bus(vdev->pdev); > - > put_devs: > for (i = 0; i < devs.cur_index; i++) { > tmp = vfio_device_data(devs.devices[i]); > - if (!ret) > + > + /* > + * If reset was successful, affected devices no longer need > + * a reset and we should return all the collateral devices > + * to low power. If not successful, we either didn't reset > + * the bus or timed out waiting for it, so let's not touch > + * the power state. > + */ > + if (!ret) { > tmp->needs_reset = false; > > - if (!tmp->refcnt && !disable_idle_d3) > - pci_set_power_state(tmp->pdev, PCI_D3hot); > + if (tmp != vdev && !disable_idle_d3) > + pci_set_power_state(tmp->pdev, PCI_D3hot); > + } > > vfio_device_put(devs.devices[i]); > } > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index cde3b5d3441a..aa2355e67340 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource { > struct list_head res_next; > }; > > +struct vfio_pci_reflck { > + struct kref kref; > + struct mutex lock; > +}; > + > struct vfio_pci_device { > struct pci_dev *pdev; > void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; > @@ -104,6 +109,7 @@ struct vfio_pci_device { > bool needs_reset; > bool nointx; > struct pci_saved_state *pci_saved_state; > + struct vfio_pci_reflck *reflck; > int refcnt; > int ioeventfds_nr; > struct eventfd_ctx *err_trigger; > Reviewed-by: Eric Auger <eric.auger@xxxxxxxxxx> Thanks Eric