On Monday, 13 August 2018 17:50:22 MSK Thierry Reding wrote: > From: Thierry Reding <treding@xxxxxxxxxx> > > Implement support for using an IOMMU to map physically discontiguous > buffers into contiguous I/O virtual mappings that the VDE can use. This > allows importing arbitrary DMA-BUFs for use by the VDE. > > While at it, make sure that the device is detached from any DMA/IOMMU > mapping that it might have automatically been attached to at boot. If > using the IOMMU API explicitly, detaching from any existing mapping is > required to avoid double mapping of buffers. > > Signed-off-by: Thierry Reding <treding@xxxxxxxxxx> > --- > drivers/staging/media/tegra-vde/tegra-vde.c | 171 +++++++++++++++++--- > 1 file changed, 153 insertions(+), 18 deletions(-) > > diff --git a/drivers/staging/media/tegra-vde/tegra-vde.c > b/drivers/staging/media/tegra-vde/tegra-vde.c index > 2496a03fd158..3bc0bfcfe34e 100644 > --- a/drivers/staging/media/tegra-vde/tegra-vde.c > +++ b/drivers/staging/media/tegra-vde/tegra-vde.c > @@ -13,7 +13,9 @@ > #include <linux/dma-buf.h> > #include <linux/genalloc.h> > #include <linux/interrupt.h> > +#include <linux/iommu.h> > #include <linux/iopoll.h> > +#include <linux/iova.h> > #include <linux/miscdevice.h> > #include <linux/module.h> > #include <linux/of_device.h> > @@ -22,6 +24,10 @@ > #include <linux/slab.h> > #include <linux/uaccess.h> > > +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) > +#include <asm/dma-iommu.h> > +#endif > + > #include <soc/tegra/pmc.h> > > #include <drm/drm_fourcc.h> > @@ -61,6 +67,11 @@ struct video_frame { > u32 frame_num; > u32 flags; > u64 modifier; > + > + struct iova *y_iova; > + struct iova *cb_iova; > + struct iova *cr_iova; > + struct iova *aux_iova; > }; > > struct tegra_vde_soc { > @@ -93,6 +104,12 @@ struct tegra_vde { > struct clk *clk_bsev; > dma_addr_t iram_lists_addr; > u32 *iram; > + > + struct iommu_domain *domain; > + struct iommu_group *group; > + struct iova_domain iova; > + unsigned long limit; > + unsigned int shift; > }; > > static void tegra_vde_set_bits(struct tegra_vde *vde, > @@ -634,12 +651,22 @@ static void tegra_vde_decode_frame(struct tegra_vde > *vde, VDE_WR(0x20000000 | (macroblocks_nb - 1), vde->sxe + 0x00); > } > > -static void tegra_vde_detach_and_put_dmabuf(struct dma_buf_attachment *a, > +static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde, > + struct dma_buf_attachment *a, > struct sg_table *sgt, > + struct iova *iova, > enum dma_data_direction dma_dir) > { > struct dma_buf *dmabuf = a->dmabuf; > > + if (vde->domain) { > + unsigned long size = iova_size(iova) << vde->shift; Let's make it "size = iova_align(&vde->iova, dmabuf->size)" for better readability. > + dma_addr_t addr = iova_dma_addr(&vde->iova, iova); > + > + iommu_unmap(vde->domain, addr, size); > + __free_iova(&vde->iova, iova); > + } > + > dma_buf_unmap_attachment(a, sgt, dma_dir); > dma_buf_detach(dmabuf, a); > dma_buf_put(dmabuf); > @@ -651,14 +678,16 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde > *vde, size_t min_size, > size_t align_size, > struct dma_buf_attachment **a, > - dma_addr_t *addr, > + dma_addr_t *addrp, > struct sg_table **s, > - size_t *size, > + struct iova **iovap, > + size_t *sizep, > enum dma_data_direction dma_dir) > { > struct dma_buf_attachment *attachment; > struct dma_buf *dmabuf; > struct sg_table *sgt; > + size_t size; > int err; > > dmabuf = dma_buf_get(fd); > @@ -695,18 +724,47 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde > *vde, goto err_detach; > } > > - if (sgt->nents != 1) { > + if (sgt->nents > 1 && !vde->domain) { > dev_err(vde->dev, "Sparse DMA region is unsupported\n"); > err = -EINVAL; > goto err_unmap; > } > > - *addr = sg_dma_address(sgt->sgl) + offset; > + if (vde->domain) { > + int prot = IOMMU_READ | IOMMU_WRITE; > + struct iova *iova; > + dma_addr_t addr; > + > + size = (dmabuf->size - offset) >> vde->shift; Offset shall not be subtracted and dmabuf size shall be rounded to IOVA granule. Also, let's not carry shift within the vde structure as it doesn't really worth it. shift = iova_shift(&vde->iova); size = iova_align(&vde->iova, dmabuf->size) >> shift; > + > + iova = alloc_iova(&vde->iova, size, vde->limit - 1, true); > + if (!iova) { > + err = -ENOMEM; > + goto err_unmap; > + } > + > + addr = iova_dma_addr(&vde->iova, iova); > + > + size = iommu_map_sg(vde->domain, addr, sgt->sgl, sgt->nents, > + prot); > + if (!size) { > + __free_iova(&vde->iova, iova); > + err = -ENXIO; > + goto err_unmap; > + } > + > + *addrp = addr; Returned address shall point at the beginning of the buffer + offset. *addrp = addr + offset; Returned size shall represent the leftover size after the offset subtraction, like for example it is used for the bitstream_data address counter limit. size = dmabuf->size - offset; > + *iovap = iova; > + } else { > + *addrp = sg_dma_address(sgt->sgl) + offset; > + size = dmabuf->size - offset; > + } > + > *a = attachment; > *s = sgt; > > - if (size) > - *size = dmabuf->size - offset; > + if (sizep) > + *sizep = size; > > return 0; > > @@ -734,6 +792,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct > tegra_vde *vde, &frame->y_dmabuf_attachment, > &frame->y_addr, > &frame->y_sgt, > + &frame->y_iova, > NULL, dma_dir); > if (err) > return err; > @@ -743,6 +802,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct > tegra_vde *vde, &frame->cb_dmabuf_attachment, > &frame->cb_addr, > &frame->cb_sgt, > + &frame->cb_iova, > NULL, dma_dir); > if (err) > goto err_release_y; > @@ -752,6 +812,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct > tegra_vde *vde, &frame->cr_dmabuf_attachment, > &frame->cr_addr, > &frame->cr_sgt, > + &frame->cr_iova, > NULL, dma_dir); > if (err) > goto err_release_cb; > @@ -766,6 +827,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct > tegra_vde *vde, &frame->aux_dmabuf_attachment, > &frame->aux_addr, > &frame->aux_sgt, > + &frame->aux_iova, > NULL, dma_dir); > if (err) > goto err_release_cr; > @@ -774,13 +836,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct > tegra_vde *vde, > > err_release_cr: > tegra_vde_detach_and_put_dmabuf(vde, frame->cr_dmabuf_attachment, > - frame->cr_sgt, dma_dir); > + frame->cr_sgt, frame->cr_iova, > + dma_dir); > err_release_cb: > tegra_vde_detach_and_put_dmabuf(vde, frame->cb_dmabuf_attachment, > - frame->cb_sgt, dma_dir); > + frame->cb_sgt, frame->cb_iova, > + dma_dir); > err_release_y: > tegra_vde_detach_and_put_dmabuf(vde, frame->y_dmabuf_attachment, > - frame->y_sgt, dma_dir); > + frame->y_sgt, frame->y_iova, > + dma_dir); > > return err; > } > @@ -793,16 +858,20 @@ static void tegra_vde_release_frame_dmabufs(struct > tegra_vde *vde, if (!baseline_profile) > tegra_vde_detach_and_put_dmabuf(vde, > frame->aux_dmabuf_attachment, > - frame->aux_sgt, dma_dir); > + frame->aux_sgt, > + frame->aux_iova, dma_dir); > > tegra_vde_detach_and_put_dmabuf(vde, frame->cr_dmabuf_attachment, > - frame->cr_sgt, dma_dir); > + frame->cr_sgt, frame->cr_iova, > + dma_dir); > > tegra_vde_detach_and_put_dmabuf(vde, frame->cb_dmabuf_attachment, > - frame->cb_sgt, dma_dir); > + frame->cb_sgt, frame->cb_iova, > + dma_dir); > > tegra_vde_detach_and_put_dmabuf(vde, frame->y_dmabuf_attachment, > - frame->y_sgt, dma_dir); > + frame->y_sgt, frame->y_iova, > + dma_dir); > } > > static int tegra_vde_validate_frame(struct device *dev, > @@ -923,6 +992,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde > *vde, struct sg_table *bitstream_sgt, *secure_sgt; > enum dma_data_direction dma_dir; > dma_addr_t bitstream_data_addr; > + struct iova *bitstream_iova; > + struct iova *secure_iova; > dma_addr_t secure_addr; > dma_addr_t bsev_ptr; > size_t lsize, csize; > @@ -948,6 +1019,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde > *vde, &bitstream_data_dmabuf_attachment, > &bitstream_data_addr, > &bitstream_sgt, > + &bitstream_iova, > &bitstream_data_size, > DMA_TO_DEVICE); > if (ret) > @@ -959,6 +1031,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde > *vde, &secure_attachment, > &secure_addr, > &secure_sgt, > + &secure_iova, > &secure_size, > DMA_TO_DEVICE); > if (ret) > @@ -1095,12 +1168,13 @@ static int tegra_vde_ioctl_decode_h264(struct > tegra_vde *vde, release_bitstream_dmabuf: > if (secure_attachment) > tegra_vde_detach_and_put_dmabuf(vde, secure_attachment, > - secure_sgt, > + secure_sgt, secure_iova, > DMA_TO_DEVICE); > > tegra_vde_detach_and_put_dmabuf(vde, > bitstream_data_dmabuf_attachment, > - bitstream_sgt, DMA_TO_DEVICE); > + bitstream_sgt, bitstream_iova, > + DMA_TO_DEVICE); > > return ret; > } > @@ -1193,6 +1267,15 @@ static int tegra_vde_probe(struct platform_device > *pdev) struct tegra_vde *vde; > int irq, err; > > +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) > + if (dev->archdata.mapping) { > + struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev); > + > + arm_iommu_detach_device(dev); > + arm_iommu_release_mapping(mapping); > + } > +#endif > + > vde = devm_kzalloc(dev, sizeof(*vde), GFP_KERNEL); > if (!vde) > return -ENOMEM; > @@ -1335,6 +1418,37 @@ static int tegra_vde_probe(struct platform_device > *pdev) return -ENOMEM; > } > > + vde->group = iommu_group_get(dev); > + if (vde->group) { > + unsigned long order; > + > + vde->domain = iommu_domain_alloc(&platform_bus_type); > + if (!vde->domain) { > + iommu_group_put(vde->group); > + vde->group = NULL; > + } else { > + err = iova_cache_get(); > + if (err < 0) > + goto free_domain; > + > + order = __ffs(vde->domain->pgsize_bitmap); > + > + init_iova_domain(&vde->iova, 1UL << order, 0); The minimum address of IOVA allocations shall be determined by the domains aperture start address. > + vde->shift = iova_shift(&vde->iova); > + vde->limit = 1 << (32 - vde->shift); IOVA limit shall be determined by the domains aperture size. struct iommu_domain_geometry *geometry; order = __ffs(vde->domain->pgsize_bitmap); geometry = &vde->domain->geometry; init_iova_domain(&vde->iova, 1UL << order, geometry->aperture_start >> order); vde->iova_end = geometry->aperture_end; Hence let's replace 'limit' with 'iova_end'. > + > + /* > + * VDE doesn't seem to like accessing the last page of > + * its 32-bit address space. > + */ > + vde->limit -= 1; That's probably because some VDE HW address counter is getting wrapped around and it can't cope with that due to a HW bug / optimization. Since this only affects the end of AS, let's check if that adjustment is needed: if (vde->iova_end == 0xffffffff) vde->iova_end -= 1UL << order; > + > + err = iommu_attach_group(vde->domain, vde->group); > + if (err < 0) goto put_iova; > + goto put_cache; > + } > + } > + > mutex_init(&vde->lock); > init_completion(&vde->decode_completion); > > @@ -1346,7 +1460,7 @@ static int tegra_vde_probe(struct platform_device > *pdev) err = misc_register(&vde->miscdev); > if (err) { > dev_err(dev, "Failed to register misc device: %d\n", err); > - goto err_gen_free; > + goto detach; > } > > pm_runtime_enable(dev); > @@ -1364,7 +1478,21 @@ static int tegra_vde_probe(struct platform_device > *pdev) err_misc_unreg: > misc_deregister(&vde->miscdev); > > -err_gen_free: put_iova: put_iova_domain(&vde->iova); > +detach: > + if (vde->domain) > + iommu_detach_group(vde->domain, vde->group); > + > +put_cache: > + if (vde->domain) > + iova_cache_put(); > + > +free_domain: > + if (vde->domain) > + iommu_domain_free(vde->domain); > + > + if (vde->group) > + iommu_group_put(vde->group); > + > gen_pool_free(vde->iram_pool, (unsigned long)vde->iram, > gen_pool_size(vde->iram_pool)); > > @@ -1388,6 +1516,13 @@ static int tegra_vde_remove(struct platform_device > *pdev) > > misc_deregister(&vde->miscdev); > > + if (vde->domain) { > + iommu_detach_group(vde->domain, vde->group); put_iova_domain(&vde->iova); > + iova_cache_put(); > + iommu_domain_free(vde->domain); > + iommu_group_put(vde->group); > + } > + > gen_pool_free(vde->iram_pool, (unsigned long)vde->iram, > gen_pool_size(vde->iram_pool));