Frequent IOMMU remappings take about 50% of CPU usage because there is quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to mitigate the mapping overhead which goes away completely and driver works as fast as in a case of a disabled IOMMU. The case of a disabled IOMMU should also benefit a tad from the caching since CPU cache maintenance that happens on dmabuf's attaching takes some resources. Signed-off-by: Dmitry Osipenko <digetx@xxxxxxxxx> --- drivers/staging/media/tegra-vde/Makefile | 2 +- .../staging/media/tegra-vde/dmabuf-cache.c | 223 ++++++++++++++++++ drivers/staging/media/tegra-vde/iommu.c | 2 - drivers/staging/media/tegra-vde/vde.c | 143 +++-------- drivers/staging/media/tegra-vde/vde.h | 18 +- 5 files changed, 273 insertions(+), 115 deletions(-) create mode 100644 drivers/staging/media/tegra-vde/dmabuf-cache.c diff --git a/drivers/staging/media/tegra-vde/Makefile b/drivers/staging/media/tegra-vde/Makefile index c11867e28233..2827f7601de8 100644 --- a/drivers/staging/media/tegra-vde/Makefile +++ b/drivers/staging/media/tegra-vde/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -tegra-vde-y := vde.o iommu.o +tegra-vde-y := vde.o iommu.o dmabuf-cache.o obj-$(CONFIG_TEGRA_VDE) += tegra-vde.o diff --git a/drivers/staging/media/tegra-vde/dmabuf-cache.c b/drivers/staging/media/tegra-vde/dmabuf-cache.c new file mode 100644 index 000000000000..fcde8d1c37e7 --- /dev/null +++ b/drivers/staging/media/tegra-vde/dmabuf-cache.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * NVIDIA Tegra Video decoder driver + * + * Copyright (C) 2016-2019 GRATE-DRIVER project + */ + +#include <linux/dma-buf.h> +#include <linux/iova.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#include "vde.h" + +struct tegra_vde_cache_entry { + enum dma_data_direction dma_dir; + struct dma_buf_attachment *a; + struct delayed_work dwork; + struct tegra_vde *vde; + struct list_head list; + struct sg_table *sgt; + struct iova *iova; + unsigned int refcnt; +}; + +static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry) +{ + struct dma_buf *dmabuf = entry->a->dmabuf; + + WARN_ON_ONCE(entry->refcnt); + + if (entry->vde->domain) + tegra_vde_iommu_unmap(entry->vde, entry->iova); + + dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir); + dma_buf_detach(dmabuf, entry->a); + dma_buf_put(dmabuf); + + list_del(&entry->list); + kfree(entry); +} + +static void tegra_vde_delayed_unmap(struct work_struct *work) +{ + struct tegra_vde_cache_entry *entry; + + entry = container_of(work, struct tegra_vde_cache_entry, + dwork.work); + + mutex_lock(&entry->vde->map_lock); + tegra_vde_release_entry(entry); + mutex_unlock(&entry->vde->map_lock); +} + +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde, + struct dma_buf *dmabuf, + enum dma_data_direction dma_dir, + struct dma_buf_attachment **ap, + dma_addr_t *addrp) +{ + struct device *dev = vde->miscdev.parent; + struct tegra_vde_cache_entry *entry; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct iova *iova; + int err; + + mutex_lock(&vde->map_lock); + + list_for_each_entry(entry, &vde->map_list, list) { + if (entry->a->dmabuf != dmabuf) + continue; + + if (!cancel_delayed_work(&entry->dwork)) + continue; + + if (entry->dma_dir != dma_dir) + entry->dma_dir = DMA_BIDIRECTIONAL; + + dma_buf_put(dmabuf); + + if (vde->domain) + *addrp = iova_dma_addr(&vde->iova, entry->iova); + else + *addrp = sg_dma_address(entry->sgt->sgl); + + goto ref; + } + + attachment = dma_buf_attach(dmabuf, dev); + if (IS_ERR(attachment)) { + dev_err(dev, "Failed to attach dmabuf\n"); + err = PTR_ERR(attachment); + goto err_unlock; + } + + sgt = dma_buf_map_attachment(attachment, dma_dir); + if (IS_ERR(sgt)) { + dev_err(dev, "Failed to get dmabufs sg_table\n"); + err = PTR_ERR(sgt); + goto err_detach; + } + + if (!vde->domain && sgt->nents > 1) { + dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n"); + err = -EINVAL; + goto err_unmap; + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + err = -ENOMEM; + goto err_unmap; + } + + if (vde->domain) { + err = tegra_vde_iommu_map(vde, sgt, &iova, dmabuf->size); + if (err) + goto err_free; + + *addrp = iova_dma_addr(&vde->iova, iova); + } else { + *addrp = sg_dma_address(sgt->sgl); + } + + INIT_DELAYED_WORK(&entry->dwork, tegra_vde_delayed_unmap); + list_add(&entry->list, &vde->map_list); + + entry->dma_dir = dma_dir; + entry->iova = iova; + entry->vde = vde; + entry->sgt = sgt; + entry->a = attachment; +ref: + entry->refcnt++; + + *ap = entry->a; + + mutex_unlock(&vde->map_lock); + + return 0; + +err_free: + kfree(entry); +err_unmap: + dma_buf_unmap_attachment(attachment, sgt, dma_dir); +err_detach: + dma_buf_detach(dmabuf, attachment); +err_unlock: + mutex_unlock(&vde->map_lock); + + return err; +} + +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde, + struct dma_buf_attachment *a, + bool release) +{ + struct tegra_vde_cache_entry *entry; + + mutex_lock(&vde->map_lock); + + list_for_each_entry(entry, &vde->map_list, list) { + if (entry->a != a) + continue; + + WARN_ON_ONCE(!entry->refcnt); + + if (--entry->refcnt == 0) { + if (release) + tegra_vde_release_entry(entry); + else + schedule_delayed_work(&entry->dwork, 5 * HZ); + } + break; + } + + mutex_unlock(&vde->map_lock); +} + +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde) +{ + struct tegra_vde_cache_entry *entry, *tmp; + + mutex_lock(&vde->map_lock); + + list_for_each_entry_safe(entry, tmp, &vde->map_list, list) { + if (entry->refcnt) + continue; + + if (!cancel_delayed_work(&entry->dwork)) + continue; + + tegra_vde_release_entry(entry); + } + + mutex_unlock(&vde->map_lock); +} + +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde) +{ + struct tegra_vde_cache_entry *entry, *tmp; + + mutex_lock(&vde->map_lock); + + do { + list_for_each_entry_safe(entry, tmp, &vde->map_list, list) { + if (!cancel_delayed_work(&entry->dwork)) + continue; + + tegra_vde_release_entry(entry); + } + + mutex_unlock(&vde->map_lock); + schedule(); + mutex_lock(&vde->map_lock); + } while (!list_empty(&vde->map_list)); + + mutex_unlock(&vde->map_lock); +} diff --git a/drivers/staging/media/tegra-vde/iommu.c b/drivers/staging/media/tegra-vde/iommu.c index 295c3d7cccd3..6d635332e0ec 100644 --- a/drivers/staging/media/tegra-vde/iommu.c +++ b/drivers/staging/media/tegra-vde/iommu.c @@ -19,7 +19,6 @@ int tegra_vde_iommu_map(struct tegra_vde *vde, struct sg_table *sgt, struct iova **iovap, - dma_addr_t *addrp, size_t size) { struct iova *iova; @@ -45,7 +44,6 @@ int tegra_vde_iommu_map(struct tegra_vde *vde, } *iovap = iova; - *addrp = addr; return 0; } diff --git a/drivers/staging/media/tegra-vde/vde.c b/drivers/staging/media/tegra-vde/vde.c index cbcdbfef072d..3466daddf663 100644 --- a/drivers/staging/media/tegra-vde/vde.c +++ b/drivers/staging/media/tegra-vde/vde.c @@ -11,6 +11,7 @@ #include <linux/genalloc.h> #include <linux/interrupt.h> #include <linux/iopoll.h> +#include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/of_device.h> @@ -37,18 +38,10 @@ #define BSE_DMA_BUSY BIT(23) struct video_frame { - struct iova *y_iova; - struct iova *cb_iova; - struct iova *cr_iova; - struct iova *aux_iova; struct dma_buf_attachment *y_dmabuf_attachment; struct dma_buf_attachment *cb_dmabuf_attachment; struct dma_buf_attachment *cr_dmabuf_attachment; struct dma_buf_attachment *aux_dmabuf_attachment; - struct sg_table *y_sgt; - struct sg_table *cb_sgt; - struct sg_table *cr_sgt; - struct sg_table *aux_sgt; dma_addr_t y_addr; dma_addr_t cb_addr; dma_addr_t cr_addr; @@ -494,22 +487,6 @@ static void tegra_vde_decode_frame(struct tegra_vde *vde, vde->sxe, 0x00); } -static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde, - enum dma_data_direction dma_dir, - struct dma_buf_attachment *a, - struct sg_table *sgt, - struct iova *iova) -{ - struct dma_buf *dmabuf = a->dmabuf; - - if (vde->domain) - tegra_vde_iommu_unmap(vde, iova); - - dma_buf_unmap_attachment(a, sgt, dma_dir); - dma_buf_detach(dmabuf, a); - dma_buf_put(dmabuf); -} - static int tegra_vde_attach_dmabuf(struct tegra_vde *vde, int fd, unsigned long offset, @@ -517,15 +494,11 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde, size_t align_size, struct dma_buf_attachment **a, dma_addr_t *addrp, - struct sg_table **s, - struct iova **iovap, size_t *size, enum dma_data_direction dma_dir) { struct device *dev = vde->miscdev.parent; - struct dma_buf_attachment *attachment; struct dma_buf *dmabuf; - struct sg_table *sgt; int err; dmabuf = dma_buf_get(fd); @@ -546,49 +519,17 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde, return -EINVAL; } - attachment = dma_buf_attach(dmabuf, dev); - if (IS_ERR(attachment)) { - dev_err(dev, "Failed to attach dmabuf\n"); - err = PTR_ERR(attachment); + err = tegra_vde_dmabuf_cache_map(vde, dmabuf, dma_dir, a, addrp); + if (err) goto err_put; - } - - sgt = dma_buf_map_attachment(attachment, dma_dir); - if (IS_ERR(sgt)) { - dev_err(dev, "Failed to get dmabufs sg_table\n"); - err = PTR_ERR(sgt); - goto err_detach; - } - - if (!vde->domain && sgt->nents > 1) { - dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n"); - err = -EINVAL; - goto err_unmap; - } - - if (vde->domain) { - err = tegra_vde_iommu_map(vde, sgt, iovap, addrp, dmabuf->size); - if (err) { - dev_err(dev, "IOMMU mapping failed: %d\n", err); - goto err_unmap; - } - } else { - *addrp = sg_dma_address(sgt->sgl); - } *addrp = *addrp + offset; - *a = attachment; - *s = sgt; if (size) *size = dmabuf->size - offset; return 0; -err_unmap: - dma_buf_unmap_attachment(attachment, sgt, dma_dir); -err_detach: - dma_buf_detach(dmabuf, attachment); err_put: dma_buf_put(dmabuf); @@ -608,8 +549,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, src->y_offset, lsize, SZ_256, &frame->y_dmabuf_attachment, &frame->y_addr, - &frame->y_sgt, - &frame->y_iova, NULL, dma_dir); if (err) return err; @@ -618,8 +557,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, src->cb_offset, csize, SZ_256, &frame->cb_dmabuf_attachment, &frame->cb_addr, - &frame->cb_sgt, - &frame->cb_iova, NULL, dma_dir); if (err) goto err_release_y; @@ -628,8 +565,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, src->cr_offset, csize, SZ_256, &frame->cr_dmabuf_attachment, &frame->cr_addr, - &frame->cr_sgt, - &frame->cr_iova, NULL, dma_dir); if (err) goto err_release_cb; @@ -643,8 +578,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, src->aux_offset, csize, SZ_256, &frame->aux_dmabuf_attachment, &frame->aux_addr, - &frame->aux_sgt, - &frame->aux_iova, NULL, dma_dir); if (err) goto err_release_cr; @@ -652,20 +585,11 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, return 0; err_release_cr: - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->cr_dmabuf_attachment, - frame->cr_sgt, - frame->cr_iova); + tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, true); err_release_cb: - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->cb_dmabuf_attachment, - frame->cb_sgt, - frame->cb_iova); + tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, true); err_release_y: - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->y_dmabuf_attachment, - frame->y_sgt, - frame->y_iova); + tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, true); return err; } @@ -673,28 +597,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde, static void tegra_vde_release_frame_dmabufs(struct tegra_vde *vde, struct video_frame *frame, enum dma_data_direction dma_dir, - bool baseline_profile) + bool baseline_profile, + bool release) { if (!baseline_profile) - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->aux_dmabuf_attachment, - frame->aux_sgt, - frame->aux_iova); - - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->cr_dmabuf_attachment, - frame->cr_sgt, - frame->cr_iova); - - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->cb_dmabuf_attachment, - frame->cb_sgt, - frame->cb_iova); - - tegra_vde_detach_and_put_dmabuf(vde, dma_dir, - frame->y_dmabuf_attachment, - frame->y_sgt, - frame->y_iova); + tegra_vde_dmabuf_cache_unmap(vde, frame->aux_dmabuf_attachment, + release); + + tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, release); + tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, release); + tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, release); } static int tegra_vde_validate_frame(struct device *dev, @@ -786,8 +698,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde, struct tegra_vde_h264_frame __user *frames_user; struct video_frame *dpb_frames; struct dma_buf_attachment *bitstream_data_dmabuf_attachment; - struct sg_table *bitstream_sgt; - struct iova *bitstream_iova; enum dma_data_direction dma_dir; dma_addr_t bitstream_data_addr; dma_addr_t bsev_ptr; @@ -812,8 +722,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde, SZ_16K, SZ_16K, &bitstream_data_dmabuf_attachment, &bitstream_data_addr, - &bitstream_sgt, - &bitstream_iova, &bitstream_data_size, DMA_TO_DEVICE); if (ret) @@ -944,7 +852,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde, dma_dir = (i == 0) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; tegra_vde_release_frame_dmabufs(vde, &dpb_frames[i], dma_dir, - ctx.baseline_profile); + ctx.baseline_profile, ret != 0); } free_dpb_frames: @@ -954,10 +862,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde, kfree(frames); release_bitstream_dmabuf: - tegra_vde_detach_and_put_dmabuf(vde, DMA_TO_DEVICE, - bitstream_data_dmabuf_attachment, - bitstream_sgt, - bitstream_iova); + tegra_vde_dmabuf_cache_unmap(vde, bitstream_data_dmabuf_attachment, + ret != 0); return ret; } @@ -979,9 +885,21 @@ static long tegra_vde_unlocked_ioctl(struct file *filp, return -ENOTTY; } +static int tegra_vde_release_file(struct inode *inode, struct file *filp) +{ + struct miscdevice *miscdev = filp->private_data; + struct tegra_vde *vde = container_of(miscdev, struct tegra_vde, + miscdev); + + tegra_vde_dmabuf_cache_unmap_sync(vde); + + return 0; +} + static const struct file_operations tegra_vde_fops = { .owner = THIS_MODULE, .unlocked_ioctl = tegra_vde_unlocked_ioctl, + .release = tegra_vde_release_file, }; static irqreturn_t tegra_vde_isr(int irq, void *data) @@ -1159,6 +1077,8 @@ static int tegra_vde_probe(struct platform_device *pdev) return -ENOMEM; } + INIT_LIST_HEAD(&vde->map_list); + mutex_init(&vde->map_lock); mutex_init(&vde->lock); init_completion(&vde->decode_completion); @@ -1221,6 +1141,7 @@ static int tegra_vde_remove(struct platform_device *pdev) misc_deregister(&vde->miscdev); + tegra_vde_dmabuf_cache_unmap_all(vde); tegra_vde_iommu_deinit(vde); gen_pool_free(vde->iram_pool, (unsigned long)vde->iram, diff --git a/drivers/staging/media/tegra-vde/vde.h b/drivers/staging/media/tegra-vde/vde.h index 37414b7fdee1..e138878e8e14 100644 --- a/drivers/staging/media/tegra-vde/vde.h +++ b/drivers/staging/media/tegra-vde/vde.h @@ -9,16 +9,20 @@ #define TEGRA_VDE_H #include <linux/completion.h> +#include <linux/dma-direction.h> +#include <linux/list.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/types.h> #include <linux/iova.h> struct clk; +struct dma_buf; struct gen_pool; struct iommu_group; struct iommu_domain; struct reset_control; +struct dma_buf_attachment; struct tegra_vde { void __iomem *sxe; @@ -31,6 +35,8 @@ struct tegra_vde { void __iomem *vdma; void __iomem *frameid; struct mutex lock; + struct mutex map_lock; + struct list_head map_list; struct miscdevice miscdev; struct reset_control *rst; struct reset_control *rst_mc; @@ -49,10 +55,20 @@ void tegra_vde_iommu_deinit(struct tegra_vde *vde); int tegra_vde_iommu_map(struct tegra_vde *vde, struct sg_table *sgt, struct iova **iovap, - dma_addr_t *addrp, size_t size); void tegra_vde_iommu_unmap(struct tegra_vde *vde, struct iova *iova); +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde, + struct dma_buf *dmabuf, + enum dma_data_direction dma_dir, + struct dma_buf_attachment **ap, + dma_addr_t *addrp); +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde, + struct dma_buf_attachment *a, + bool release); +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde); +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde); + static __maybe_unused char const * tegra_vde_reg_base_name(struct tegra_vde *vde, void __iomem *base) { -- 2.21.0