Hi Tomi, Thank you for the patch. On Friday 19 February 2016 11:47:40 Tomi Valkeinen wrote: > Errata i878 says that MPU should not be used to access RAM and DMM at > the same time. As it's not possible to prevent MPU accessing RAM, we > need to access DMM via a proxy. > > This patch changes DMM driver to access DMM registers via sDMA. Instead > of doing a normal readl/writel call to read/write a register, we use > sDMA to copy 4 bytes from/to the DMM registers. > > This patch provides only a partial workaround for i878, as not only DMM > register reads/writes are affected, but also accesses to the DMM mapped > buffers (framebuffers, usually). Will this patch really improve the situation if the DMM mapping is accessed anyway ? > Signed-off-by: Tomi Valkeinen <tomi.valkeinen@xxxxxx> > --- > Documentation/devicetree/bindings/arm/omap/dmm.txt | 3 +- > drivers/gpu/drm/omapdrm/omap_dmm_priv.h | 8 ++ > drivers/gpu/drm/omapdrm/omap_dmm_tiler.c | 141 +++++++++++++++++- > 3 files changed, 149 insertions(+), 3 deletions(-) > > diff --git a/Documentation/devicetree/bindings/arm/omap/dmm.txt > b/Documentation/devicetree/bindings/arm/omap/dmm.txt index > 8bd6d0a238a8..e5fc2eb7f4da 100644 > --- a/Documentation/devicetree/bindings/arm/omap/dmm.txt > +++ b/Documentation/devicetree/bindings/arm/omap/dmm.txt > @@ -8,7 +8,8 @@ translation for initiators which need contiguous dma bus > addresses. > > Required properties: > - compatible: Should contain "ti,omap4-dmm" for OMAP4 family > - Should contain "ti,omap5-dmm" for OMAP5 and DRA7x family > + Should contain "ti,omap5-dmm" for OMAP5 family > + Should contain "ti,dra7-dmm" for DRA7x family Isn't it DRA7xx instead of DRA7x ? > - reg: Contains DMM register address range (base address and length) > - interrupts: Should contain an interrupt-specifier for DMM_IRQ. > - ti,hwmods: Name of the hwmod associated to DMM, which is typically "dmm" > diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h > b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h index 9f32a83ca507..4d292ba5bcca > 100644 > --- a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h > +++ b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h > @@ -155,10 +155,12 @@ struct refill_engine { > > struct dmm_platform_data { > uint32_t cpu_cache_flags; > + bool errata_i878_wa; > }; > > struct dmm { > struct device *dev; > + u32 phys_base; > void __iomem *base; > int irq; > > @@ -189,6 +191,12 @@ struct dmm { > struct list_head alloc_head; > > const struct dmm_platform_data *plat_data; > + > + bool dmm_workaround; > + spinlock_t wa_lock; > + u32 *wa_dma_data; > + dma_addr_t wa_dma_handle; > + struct dma_chan *wa_dma_chan; > }; > > #endif > diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c > b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index fe5260477b52..3ec5c6633585 > 100644 > --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c > +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c > @@ -19,6 +19,7 @@ > #include <linux/completion.h> > #include <linux/delay.h> > #include <linux/dma-mapping.h> > +#include <linux/dmaengine.h> > #include <linux/errno.h> > #include <linux/init.h> > #include <linux/interrupt.h> > @@ -79,14 +80,124 @@ static const uint32_t reg[][4] = { > DMM_PAT_DESCR__2, DMM_PAT_DESCR__3}, > }; > > +static int dmm_dma_copy(struct dmm *dmm, u32 src, u32 dst) Anything wrong with using dma_addr_t ? > +{ > + struct dma_device *dma_dev = dmm->wa_dma_chan->device; > + struct dma_async_tx_descriptor *tx = NULL; No need to initialize tx to NULL. > + enum dma_status status; > + dma_cookie_t cookie; > + > + tx = dma_dev->device_prep_dma_memcpy(dmm->wa_dma_chan, dst, src, 4, 0); Given that you're transferring between an I/O mapped register and system memory, I believe you should use dmaengine_prep_slave_single() instead, with a call to dmaengine_slave_config() to set the I/O mapped register physical address. You will also need to request a slave DMA channel instead of a memcpy DMA channel. > + if (!tx) { > + dev_err(dmm->dev, "Failed to prepare DMA memcpy\n"); > + return -EIO; > + } > + > + cookie = tx->tx_submit(tx); > + if (dma_submit_error(cookie)) { > + dev_err(dmm->dev, "Failed to do DMA tx_submit\n"); > + return -EIO; > + } > + > + dma_async_issue_pending(dmm->wa_dma_chan); > + status = dma_sync_wait(dmm->wa_dma_chan, cookie); > + if (status != DMA_COMPLETE) > + dev_err(dmm->dev, "i878 wa DMA copy failure\n"); > + > + dmaengine_terminate_all(dmm->wa_dma_chan); > + return 0; > +} > + > +static u32 dmm_read_wa(struct dmm *dmm, u32 reg) > +{ > + u32 src, dst; > + int r; > + > + src = (u32)(dmm->phys_base + reg); > + dst = (u32)dmm->wa_dma_handle; > + > + r = dmm_dma_copy(dmm, src, dst); > + if (r) { > + dev_err(dmm->dev, "sDMA read transfer timeout\n"); > + return readl(dmm->base + reg); > + } > + > + return readl(dmm->wa_dma_data); readl() has a memory barrier *after* the read, not before. Similarly writel() has its memory barrier *before* the write. I don't think that's what you want, shouldn't you use readl_relaxed() and writel_relaxed() with explicit barriers ? I'm also unsure whether readl and writel are the right primitives, as they target access to I/O memory, not system memory. Can't you use explicit barriers and direct access (return *dmm->wa_dma_data) ? > +} > + > +static void dmm_write_wa(struct dmm *dmm, u32 val, u32 reg) > +{ > + u32 src, dst; > + int r; > + > + writel(val, dmm->wa_dma_data); Same comment here. > + src = (u32)dmm->wa_dma_handle; > + dst = (u32)(dmm->phys_base + reg); > + > + r = dmm_dma_copy(dmm, src, dst); > + if (r) { > + dev_err(dmm->dev, "sDMA write transfer timeout\n"); > + writel(val, dmm->base + reg); > + } > +} > + > static u32 dmm_read(struct dmm *dmm, u32 reg) > { > - return readl(dmm->base + reg); > + if (dmm->dmm_workaround) { > + u32 v; > + unsigned long flags; > + > + spin_lock_irqsave(&dmm->wa_lock, flags); > + v = dmm_read_wa(dmm, reg); dma_sync_wait() with a spinlock held doesn't seem like the best idea to me. > + spin_unlock_irqrestore(&dmm->wa_lock, flags); > + > + return v; > + } else { > + return readl(dmm->base + reg); > + } > } > > static void dmm_write(struct dmm *dmm, u32 val, u32 reg) > { > - writel(val, dmm->base + reg); > + if (dmm->dmm_workaround) { > + unsigned long flags; > + > + spin_lock_irqsave(&dmm->wa_lock, flags); > + dmm_write_wa(dmm, val, reg); Ditto. > + spin_unlock_irqrestore(&dmm->wa_lock, flags); > + } else { > + writel(val, dmm->base + reg); > + } > +} > + > +static int dmm_workaround_init(struct dmm *dmm) > +{ > + dma_cap_mask_t mask; > + > + dma_cap_zero(mask); > + dma_cap_set(DMA_MEMCPY, mask); Nitpicking, I'd move those two lines down right before the dma_request_channel() call, as they are part of the same logical block of code. > + spin_lock_init(&dmm->wa_lock); > + > + dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, 4, &dmm->wa_dma_handle, > GFP_KERNEL); Maybe a macro instead of 4 ? > + if (!dmm->wa_dma_data) > + return -ENOMEM; > + > + dmm->wa_dma_chan = dma_request_channel(mask, NULL, NULL); > + if (!dmm->wa_dma_chan) { > + dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle); > + return -ENODEV; > + } > + > + return 0; > +} > + > +static void dmm_workaround_uninit(struct dmm *dmm) > +{ > + dma_release_channel(dmm->wa_dma_chan); > + > + dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle); > } > > /* simple allocator to grab next 16 byte aligned memory from txn */ > @@ -606,6 +717,9 @@ static int omap_dmm_remove(struct platform_device *dev) > if (omap_dmm->dummy_page) > __free_page(omap_dmm->dummy_page); > > + if (omap_dmm->dmm_workaround) > + dmm_workaround_uninit(omap_dmm); > + > if (omap_dmm->irq > 0) > free_irq(omap_dmm->irq, omap_dmm); > > @@ -653,6 +767,7 @@ static int omap_dmm_probe(struct platform_device *dev) > goto fail; > } > > + omap_dmm->phys_base = mem->start; > omap_dmm->base = ioremap(mem->start, SZ_2K); > > if (!omap_dmm->base) { > @@ -668,6 +783,19 @@ static int omap_dmm_probe(struct platform_device *dev) > > omap_dmm->dev = &dev->dev; > > + omap_dmm->dmm_workaround = omap_dmm->plat_data->errata_i878_wa; > + > + if (omap_dmm->dmm_workaround) { I'd test omap_dmm->plat_data->errata_i878_wa here, omap_dmm->dmm_workaround is initialized to false by kzalloc(). > + int r; > + r = dmm_workaround_init(omap_dmm); Can't you use the ret variable ? I know it's pre-initialized to -EFAULT, but it seems better to me to set it to explicit values in the three goto fail; error cases that would be impacted, especially given that -EFAULT is the wrong error code to return (and as it would touch that code, the platform_get_resource() + ioremap() calls can be optimized in a devm_ioremap_resource() in another cleanup patch). > + if (r) { > + omap_dmm->dmm_workaround = false; > + dev_err(&dev->dev, "failed to initialize work-around, WA not used\n"); As this is a non-fatal error, maybe dev_warn() ? > + } else { > + dev_info(&dev->dev, "workaround for errata i878 in use\n"); > + } > + } > + > hwinfo = dmm_read(omap_dmm, DMM_PAT_HWINFO); > omap_dmm->num_engines = (hwinfo >> 24) & 0x1F; > omap_dmm->num_lut = (hwinfo >> 16) & 0x1F; > @@ -1031,6 +1159,11 @@ static const struct dmm_platform_data > dmm_omap5_platform_data = { .cpu_cache_flags = OMAP_BO_UNCACHED, > }; > > +static const struct dmm_platform_data dmm_dra7_platform_data = { > + .cpu_cache_flags = OMAP_BO_UNCACHED, > + .errata_i878_wa = true, > +}; > + > static const struct of_device_id dmm_of_match[] = { > { > .compatible = "ti,omap4-dmm", > @@ -1040,6 +1173,10 @@ static const struct of_device_id dmm_of_match[] = { > .compatible = "ti,omap5-dmm", > .data = &dmm_omap5_platform_data, > }, > + { > + .compatible = "ti,dra7-dmm", > + .data = &dmm_dra7_platform_data, > + }, > {}, > }; > #endif -- Regards, Laurent Pinchart _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel