On 8/23/22 22:30, Matthew Rosato wrote:
With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
calls") s390-iommu is supposed to handle dynamic switching between IOMMU
domains and the DMA API handling. However, this commit does not
sufficiently handle the case where the device is released via a call
to the release_device op as it may occur at the same time as an opposing
attach_dev or detach_dev since the group mutex is not held over
release_device. This was observed if the device is deconfigured during a
small window during vfio-pci initialization and can result in WARNs and
potential kernel panics.
Handle this by tracking when the device is probed/released via
dev_iommu_priv_set/get(). Ensure that once the device is released only
release_device handles the re-init of the device DMA.
Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
Signed-off-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
---
arch/s390/include/asm/pci.h | 1 +
arch/s390/pci/pci.c | 1 +
drivers/iommu/s390-iommu.c | 68 ++++++++++++++++++++++++++++---------
3 files changed, 54 insertions(+), 16 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 7b4cdadbc023..1295b6900e4b 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -157,6 +157,7 @@ struct zpci_dev {
/* DMA stuff */
unsigned long *dma_table;
spinlock_t dma_table_lock;
+ spinlock_t dma_domain_lock;
int tlb_refresh;
spinlock_t iommu_bitmap_lock;
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 73cdc5539384..61901c1be3cc 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
kref_init(&zdev->kref);
mutex_init(&zdev->lock);
mutex_init(&zdev->kzdev_lock);
+ spin_lock_init(&zdev->dma_domain_lock);
rc = zpci_init_iommu(zdev);
if (rc)
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce11..513a7ebd23b3 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -90,15 +90,39 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain_device *domain_device;
unsigned long flags;
- int cc, rc;
+ int cc, rc = 0;
if (!zdev)
return -ENODEV;
+ /* First check compatibility */
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ /* First device defines the DMA range limits */
+ if (list_empty(&s390_domain->devices)) {
+ domain->geometry.aperture_start = zdev->start_dma;
+ domain->geometry.aperture_end = zdev->end_dma;
+ domain->geometry.force_aperture = true;
+ /* Allow only devices with identical DMA range limits */
+ } else if (domain->geometry.aperture_start != zdev->start_dma ||
+ domain->geometry.aperture_end != zdev->end_dma) {
+ rc = -EINVAL;
+ }
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+ if (rc)
+ return rc;
+
domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL);
if (!domain_device)
return -ENOMEM;
+ /* Leave now if the device has already been released */
+ spin_lock_irqsave(&zdev->dma_domain_lock, flags);
+ if (!dev_iommu_priv_get(dev)) {
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
+ kfree(domain_device);
+ return 0;
+ }
+
if (zdev->dma_table && !zdev->s390_domain) {
cc = zpci_dma_exit_device(zdev);
if (cc) {
Am I wrong? It seems to me that zpci_dma_exit_device here is called with
the spin_lock locked but this function zpci_dma_exit_device calls vfree
which may sleep.
@@ -117,22 +141,11 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
rc = -EIO;
goto out_restore;
}
+ zdev->s390_domain = s390_domain;
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
spin_lock_irqsave(&s390_domain->list_lock, flags);
- /* First device defines the DMA range limits */
- if (list_empty(&s390_domain->devices)) {
- domain->geometry.aperture_start = zdev->start_dma;
- domain->geometry.aperture_end = zdev->end_dma;
- domain->geometry.force_aperture = true;
- /* Allow only devices with identical DMA range limits */
- } else if (domain->geometry.aperture_start != zdev->start_dma ||
- domain->geometry.aperture_end != zdev->end_dma) {
- rc = -EINVAL;
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- goto out_restore;
- }
domain_device->zdev = zdev;
- zdev->s390_domain = s390_domain;
list_add(&domain_device->list, &s390_domain->devices);
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
@@ -147,6 +160,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
virt_to_phys(zdev->dma_table));
}
out_free:
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
kfree(domain_device);
return rc;
@@ -176,17 +190,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
}
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- if (found && (zdev->s390_domain == s390_domain)) {
+ spin_lock_irqsave(&zdev->dma_domain_lock, flags);
+ if (found && (zdev->s390_domain == s390_domain) &&
+ dev_iommu_priv_get(dev)) {
zdev->s390_domain = NULL;
zpci_unregister_ioat(zdev, 0);
zpci_dma_init_device(zdev);
}
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
}
static struct iommu_device *s390_iommu_probe_device(struct device *dev)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
+ dev_iommu_priv_set(dev, zdev);
+
return &zdev->iommu_dev;
}
@@ -194,6 +213,7 @@ static void s390_iommu_release_device(struct device *dev)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct iommu_domain *domain;
+ unsigned long flags;
/*
* This is a workaround for a scenario where the IOMMU API common code
@@ -206,10 +226,26 @@ static void s390_iommu_release_device(struct device *dev)
*
* So let's call detach_dev from here if it hasn't been called before.
*/
- if (zdev && zdev->s390_domain) {
+ if (zdev) {
+ /*
+ * Clear priv to block further attaches for this device,
+ * ensure detaches don't init DMA
+ */
+ spin_lock_irqsave(&zdev->dma_domain_lock, flags);
+ dev_iommu_priv_set(dev, NULL);
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
+ /* Make sure this device is removed from the domain list */
domain = iommu_get_domain_for_dev(dev);
if (domain)
s390_iommu_detach_device(domain, dev);
+ /* Now ensure DMA is initialized from here */
+ spin_lock_irqsave(&zdev->dma_domain_lock, flags);
+ if (zdev->s390_domain) {
+ zdev->s390_domain = NULL;
+ zpci_unregister_ioat(zdev, 0);
+ zpci_dma_init_device(zdev);
If I do not make a mistake, zpci_dma_init_device() calls vzalloc() and
dma_alloc_cpu_table() which both could sleep.
+ }
+ spin_unlock_irqrestore(&zdev->dma_domain_lock, flags);
}
}
--
Pierre Morel
IBM Lab Boeblingen