Re: [PATCH v5 19/19] iommu/dma: Add support for mapping MSIs

Robin Murphy <robin.murphy@xxxxxxx> · Fri, 26 Aug 2016 02:17:36 +0100

Hi Eric,

On Fri, 26 Aug 2016 00:25:34 +0200
Auger Eric <eric.auger@xxxxxxxxxx> wrote:

> Hi Robin,
> 
> On 23/08/2016 21:05, Robin Murphy wrote:
> > When an MSI doorbell is located downstream of an IOMMU, attaching
> > devices to a DMA ops domain and switching on translation leads to a
> > rude shock when their attempt to write to the physical address
> > returned by the irqchip driver faults (or worse, writes into some
> > already-mapped buffer) and no interrupt is forthcoming.
> > 
> > Address this by adding a hook for relevant irqchip drivers to call
> > from their compose_msi_msg() callback, to swizzle the physical
> > address with an appropriatly-mapped IOVA for any device attached to
> > one of our DMA ops domains.
> > 
> > CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > CC: Jason Cooper <jason@xxxxxxxxxxxxxx>
> > CC: Marc Zyngier <marc.zyngier@xxxxxxx>
> > CC: linux-kernel@xxxxxxxxxxxxxxx
> > Signed-off-by: Robin Murphy <robin.murphy@xxxxxxx>
> > ---
> >  drivers/iommu/dma-iommu.c        | 141
> > ++++++++++++++++++++++++++++++++++-----
> > drivers/irqchip/irq-gic-v2m.c    |   3 +
> > drivers/irqchip/irq-gic-v3-its.c |   3 +
> > include/linux/dma-iommu.h        |   9 +++ 4 files changed, 141
> > insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> > index 00c8a08d56e7..330cce60cad9 100644
> > --- a/drivers/iommu/dma-iommu.c
> > +++ b/drivers/iommu/dma-iommu.c
> > @@ -25,10 +25,29 @@
> >  #include <linux/huge_mm.h>
> >  #include <linux/iommu.h>
> >  #include <linux/iova.h>
> > +#include <linux/irq.h>
> >  #include <linux/mm.h>
> >  #include <linux/scatterlist.h>
> >  #include <linux/vmalloc.h>
> >  
> > +struct iommu_dma_msi_page {
> > +	struct list_head	list;
> > +	dma_addr_t		iova;
> > +	u32			phys_lo;
> > +	u32			phys_hi;
> > +};
> > +
> > +struct iommu_dma_cookie {
> > +	struct iova_domain	iovad;
> > +	struct list_head	msi_page_list;
> > +	spinlock_t		msi_lock;
> > +};
> > +
> > +static inline struct iova_domain *cookie_iovad(struct iommu_domain
> > *domain) +{
> > +	return &((struct iommu_dma_cookie
> > *)domain->iova_cookie)->iovad; +}
> > +
> >  int iommu_dma_init(void)
> >  {
> >  	return iova_cache_get();
> > @@ -43,15 +62,19 @@ int iommu_dma_init(void)
> >   */
> >  int iommu_get_dma_cookie(struct iommu_domain *domain)
> >  {
> > -	struct iova_domain *iovad;
> > +	struct iommu_dma_cookie *cookie;
> >  
> >  	if (domain->iova_cookie)
> >  		return -EEXIST;
> >  
> > -	iovad = kzalloc(sizeof(*iovad), GFP_KERNEL);
> > -	domain->iova_cookie = iovad;
> > +	cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
> > +	if (!cookie)
> > +		return -ENOMEM;
> >  
> > -	return iovad ? 0 : -ENOMEM;
> > +	spin_lock_init(&cookie->msi_lock);
> > +	INIT_LIST_HEAD(&cookie->msi_page_list);
> > +	domain->iova_cookie = cookie;
> > +	return 0;
> >  }
> >  EXPORT_SYMBOL(iommu_get_dma_cookie);
> >  
> > @@ -63,14 +86,20 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
> >   */
> >  void iommu_put_dma_cookie(struct iommu_domain *domain)
> >  {
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iommu_dma_cookie *cookie = domain->iova_cookie;
> > +	struct iommu_dma_msi_page *msi, *tmp;
> >  
> > -	if (!iovad)
> > +	if (!cookie)
> >  		return;
> >  
> > -	if (iovad->granule)
> > -		put_iova_domain(iovad);
> > -	kfree(iovad);
> > +	if (cookie->iovad.granule)
> > +		put_iova_domain(&cookie->iovad);
> > +
> > +	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list,
> > list) {
> > +		list_del(&msi->list);
> > +		kfree(msi);
> > +	}
> > +	kfree(cookie);
> >  	domain->iova_cookie = NULL;
> >  }
> >  EXPORT_SYMBOL(iommu_put_dma_cookie);
> > @@ -88,7 +117,7 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
> >   */
> >  int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t
> > base, u64 size) {
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	unsigned long order, base_pfn, end_pfn;
> >  
> >  	if (!iovad)
> > @@ -155,7 +184,7 @@ int dma_direction_to_prot(enum
> > dma_data_direction dir, bool coherent) static struct iova
> > *__alloc_iova(struct iommu_domain *domain, size_t size, dma_addr_t
> > dma_limit) {
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	unsigned long shift = iova_shift(iovad);
> >  	unsigned long length = iova_align(iovad, size) >> shift;
> >  
> > @@ -171,7 +200,7 @@ static struct iova *__alloc_iova(struct
> > iommu_domain *domain, size_t size, /* The IOVA allocator knows what
> > we mapped, so just unmap whatever that was */ static void
> > __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr)
> > {
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	unsigned long shift = iova_shift(iovad);
> >  	unsigned long pfn = dma_addr >> shift;
> >  	struct iova *iova = find_iova(iovad, pfn);
> > @@ -294,7 +323,7 @@ struct page **iommu_dma_alloc(struct device
> > *dev, size_t size, gfp_t gfp, void (*flush_page)(struct device *,
> > const void *, phys_addr_t)) {
> >  	struct iommu_domain *domain =
> > iommu_get_domain_for_dev(dev);
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	struct iova *iova;
> >  	struct page **pages;
> >  	struct sg_table sgt;
> > @@ -386,7 +415,7 @@ dma_addr_t iommu_dma_map_page(struct device
> > *dev, struct page *page, {
> >  	dma_addr_t dma_addr;
> >  	struct iommu_domain *domain =
> > iommu_get_domain_for_dev(dev);
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	phys_addr_t phys = page_to_phys(page) + offset;
> >  	size_t iova_off = iova_offset(iovad, phys);
> >  	size_t len = iova_align(iovad, size + iova_off);
> > @@ -495,7 +524,7 @@ int iommu_dma_map_sg(struct device *dev, struct
> > scatterlist *sg, int nents, int prot)
> >  {
> >  	struct iommu_domain *domain =
> > iommu_get_domain_for_dev(dev);
> > -	struct iova_domain *iovad = domain->iova_cookie;
> > +	struct iova_domain *iovad = cookie_iovad(domain);
> >  	struct iova *iova;
> >  	struct scatterlist *s, *prev = NULL;
> >  	dma_addr_t dma_addr;
> > @@ -587,3 +616,85 @@ int iommu_dma_mapping_error(struct device
> > *dev, dma_addr_t dma_addr) {
> >  	return dma_addr == DMA_ERROR_CODE;
> >  }
> > +
> > +static int __iommu_dma_map_msi_page(struct device *dev, struct
> > msi_msg *msg,
> > +		struct iommu_domain *domain, struct
> > iommu_dma_msi_page **ppage) +{
> > +	struct iommu_dma_cookie *cookie = domain->iova_cookie;
> > +	struct iommu_dma_msi_page *msi_page;
> > +	struct iova_domain *iovad = &cookie->iovad;
> > +	struct iova *iova;
> > +	phys_addr_t msi_addr = (u64)msg->address_hi << 32 |
> > msg->address_lo;
> > +	int ret, prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;  
> In my series I ended up putting the memory attributes as a property of
> the doorbell, advised to do so by Marc. Here we hard freeze them. Do
> you foresee all the doorbells ill have the same attributes?

I can't think of any good reason any device would want to read from an
ITS or GICv2m doorbell, execute it, or allow the interconnect to reorder
or cache its MSI writes. If some crazy hardware comes along to prove my
assumption wrong then we'll revisit it, but right now I just want the
simplest possible solution for PCI DMA ops to not break MSIs.

> > +
> > +	msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
> > +	if (!msi_page)
> > +		return -ENOMEM;
> > +
> > +	iova = __alloc_iova(domain, iovad->granule,
> > dma_get_mask(dev));
> > +	if (!iova) {
> > +		ret = -ENOSPC;
> > +		goto out_free_page;
> > +	}
> > +
> > +	msi_page->iova = iova_dma_addr(iovad, iova);
> > +	ret = iommu_map(domain, msi_page->iova, msi_addr &
> > ~iova_mask(iovad),
> > +			iovad->granule, prot);
> > +	if (ret)
> > +		goto out_free_iova;
> > +
> > +	msi_page->phys_hi = msg->address_hi;
> > +	msi_page->phys_lo = msg->address_lo;
> > +	INIT_LIST_HEAD(&msi_page->list);
> > +	list_add(&msi_page->list, &cookie->msi_page_list);
> > +	*ppage = msi_page;
> > +	return 0;
> > +
> > +out_free_iova:
> > +	__free_iova(iovad, iova);
> > +out_free_page:
> > +	kfree(msi_page);
> > +	return ret;
> > +}
> > +
> > +void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)  
> Marc told in the past it was reasonable to consider adding a size
> parameter to the allocate function. Obviously you don't have the same
> concern as I had in the passthrough series where the window aperture
> is set by the userspace but well that is just for checking.

The beauty is that at this level we really don't care. It's simply
"here's an address from the irqchip driver, is there a mapping in this
domain which covers it?" The only possible use of knowing a size
here would be if it happens to correspond to a larger page size we
could use for the mapping, but that would entail a bunch of complication
for what seems like a highly tenuous benefit, and right now I just want
the simplest possible solution for PCI DMA ops to not break MSIs.

> 
> > +{
> > +	struct device *dev =
> > msi_desc_to_dev(irq_get_msi_desc(irq));
> > +	struct iommu_domain *domain =
> > iommu_get_domain_for_dev(dev);
> > +	struct iova_domain *iovad;
> > +	struct iommu_dma_cookie *cookie;
> > +	struct iommu_dma_msi_page *msi_page;
> > +	int ret = 0;
> > +
> > +	if (!domain || !domain->iova_cookie)
> > +		return;
> > +
> > +	cookie = domain->iova_cookie;
> > +	iovad = &cookie->iovad;
> > +
> > +	spin_lock(&cookie->msi_lock);
> > +	list_for_each_entry(msi_page, &cookie->msi_page_list, list)
> > +		if (msi_page->phys_hi == msg->address_hi &&
> > +		    msi_page->phys_lo - msg->address_lo <
> > iovad->granule)
> > +			goto unlock;
> > +
> > +	ret = __iommu_dma_map_msi_page(dev, msg, domain,
> > &msi_page); +unlock:
> > +	spin_unlock(&cookie->msi_lock);
> > +
> > +	if (!ret) {
> > +		msg->address_hi = upper_32_bits(msi_page->iova);
> > +		msg->address_lo &= iova_mask(iovad);
> > +		msg->address_lo += lower_32_bits(msi_page->iova);
> > +	} else {
> > +		/*
> > +		 * We're called from a void callback, so the best
> > we can do is
> > +		 * 'fail' by filling the message with obviously
> > bogus values.
> > +		 * Since we got this far due to an IOMMU being
> > present, it's
> > +		 * not like the existing address would have worked
> > anyway...
> > +		 */
> > +		msg->address_hi = ~0U;
> > +		msg->address_lo = ~0U;
> > +		msg->data = ~0U;
> > +	}
> > +}
> > diff --git a/drivers/irqchip/irq-gic-v2m.c
> > b/drivers/irqchip/irq-gic-v2m.c index 35eb7ac5d21f..863e073c6f7f
> > 100644 --- a/drivers/irqchip/irq-gic-v2m.c
> > +++ b/drivers/irqchip/irq-gic-v2m.c
> > @@ -16,6 +16,7 @@
> >  #define pr_fmt(fmt) "GICv2m: " fmt
> >  
> >  #include <linux/acpi.h>
> > +#include <linux/dma-iommu.h>
> >  #include <linux/irq.h>
> >  #include <linux/irqdomain.h>
> >  #include <linux/kernel.h>
> > @@ -108,6 +109,8 @@ static void gicv2m_compose_msi_msg(struct
> > irq_data *data, struct msi_msg *msg) 
> >  	if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET)
> >  		msg->data -= v2m->spi_offset;
> > +
> > +	iommu_dma_map_msi_msg(data->irq, msg);  
> In the past we identified that msi_compose was not authorized to sleep
> (https://lkml.org/lkml/2016/3/10/216) since potentialy called in
> atomic context.
> 
> This is why in my passthrough series I was forced to move the mapping
> in msi_domain_alloc, which also has the benefit to happen earlier and
> is able to fail whereas the compose cannot due, to the subsequent
> BUG_ON. Has things changed since this notice which now allow to do
> the mapping here?

We've got a non-sleeping spinlock covering the lookup, and new pages are
allocated with GFP_ATOMIC; what have I missed? Any IOMMU driver whose
iommu_map() implementation might sleep already can't use this layer, as
DMA API calls may come from atomic context as well.

The "oh well, at least we tried" failure case (I've added a WARN_ON
now for good measure) is largely mitigated by the fact that 99% of the
time in practice we'll just be mapping one page once per domain and
hitting it on subsequent lookups. If someone's already consumed all the
available IOVA space before that page is mapped, or the IOMMU page
tables are corrupted, then the device won't be able to do DMA anyway, so
the fact that it can't raise its MSI is likely moot.

> 
> >  }
> >  
> >  static struct irq_chip gicv2m_irq_chip = {
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c
> > b/drivers/irqchip/irq-gic-v3-its.c index 7ceaba81efb4..73f4f10dc204
> > 100644 --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -18,6 +18,7 @@
> >  #include <linux/bitmap.h>
> >  #include <linux/cpu.h>
> >  #include <linux/delay.h>
> > +#include <linux/dma-iommu.h>
> >  #include <linux/interrupt.h>
> >  #include <linux/log2.h>
> >  #include <linux/mm.h>
> > @@ -655,6 +656,8 @@ static void its_irq_compose_msi_msg(struct
> > irq_data *d, struct msi_msg *msg) msg->address_lo		=
> > addr & ((1UL << 32) - 1); msg->address_hi		= addr >>
> > 32; msg->data		= its_get_event_id(d);
> > +
> > +	iommu_dma_map_msi_msg(d->irq, msg);
> >  }
> >  
> >  static struct irq_chip its_irq_chip = {
> > diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
> > index 81c5c8d167ad..5ee806e41b5c 100644
> > --- a/include/linux/dma-iommu.h
> > +++ b/include/linux/dma-iommu.h
> > @@ -21,6 +21,7 @@
> >  
> >  #ifdef CONFIG_IOMMU_DMA
> >  #include <linux/iommu.h>
> > +#include <linux/msi.h>
> >  
> >  int iommu_dma_init(void);
> >  
> > @@ -62,9 +63,13 @@ void iommu_dma_unmap_sg(struct device *dev,
> > struct scatterlist *sg, int nents, int iommu_dma_supported(struct
> > device *dev, u64 mask); int iommu_dma_mapping_error(struct device
> > *dev, dma_addr_t dma_addr); 
> > +/* The DMA API isn't _quite_ the whole story, though... */
> > +void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg);  
> So I understand the patch currently addresses dma-mapping use case.
> What about the passthrough use case? Here you obviously propose a
> simpler version but it also looks to me it skips some comments we
> collected in the past which resulted in the direction taken before:

This is only intended to be the lowest-level remapping machinery -
clearly the guest MSI case has a lot more complexity on top, but that
doesn't apply on the host side, and right now I just want the simplest
possible solution for PCI DMA ops to not break MSIs.

> - generic API to allocate msi iovas
> - msi_geometry semantic recommended by Alex
> - the handling of the size parameter as recommended by Marc
> - separation of allocation/enumeration for msi_domain_allocate_irqs
> /msi_compose separation.

I have also thrown together an illustrative patch for plugging
this into VFIO domains [1] following some internal discussion, but
that's about as far as I was planning to go myself - AFAICS all your
MSI geometry and VFIO bits remain valid, I just looked at the
msi_cookie stuff and found it really didn't tie in with DMA ops at all
well.

> 
> For passthrough we also have to care about the safety issue, the
> window size computation. Please can we collaborate to converge on a
> unified solution?

I remain adamant that the safety thing is a property of the irqchip and
the irqchip alone (I've also come to realise that iommu_capable() is
fundamentally unworkable altogether, but that's another story). Thus
as I see it, getting the low-level remapping aspect out of the way in a
common manner leaves the rest of the guest MSI problem firmly between
VFIO and the MSI layer, now that we've got a much clearer view of
it thanks to your efforts. What do you think?

After all, right now... well, y'know ;)

Robin.

[1]:http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=991effe0712750ce24f6a0b2e2e3f8f57d4a9910

> 
> Best Regards
> 
> Eric
> 
> > +
> >  #else
> >  
> >  struct iommu_domain;
> > +struct msi_msg;
> >  
> >  static inline int iommu_dma_init(void)
> >  {
> > @@ -80,6 +85,10 @@ static inline void iommu_put_dma_cookie(struct
> > iommu_domain *domain) {
> >  }
> >  
> > +static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg
> > *msg) +{
> > +}
> > +
> >  #endif	/* CONFIG_IOMMU_DMA */
> >  #endif	/* __KERNEL__ */
> >  #endif	/* __DMA_IOMMU_H */
> >   
> 

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html