For 4K video buffers, allocating from reserved memory takes a long time (~500 ms). The root cause is the memset() performed on the allocated memory, which consumes a significant number of CPU cycles. Because of this delay, the initial frames are dropped. To fix this, the memset() that is done by default when allocating coherent memory is now performed only when the __GFP_ZERO flag is set, so the allocated memory is cleared only if the caller passes __GFP_ZERO. We believe this is safe because the video decoder always writes to the buffer before reading it. This speeds up decoder initialization, since we do not set __GFP_ZERO when allocating memory for the decoder. With this optimization, the initial frame drops no longer occur and decoder initialization takes ~100 ms. This patch adds plumbing through the dma_alloc functions to pass the gfp flags set by the caller down to __dma_alloc_from_coherent(), where they are checked for __GFP_ZERO: if it is set, the buffer is memset to 0; otherwise the memset is skipped. Signed-off-by: Dylan Yip <dylan.yip@xxxxxxxxxx> --- arch/arm/mm/dma-mapping-nommu.c | 2 +- include/linux/dma-mapping.h | 11 +++++++---- kernel/dma/coherent.c | 15 +++++++++------ kernel/dma/mapping.c | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 52b8255..242b2c3 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -35,7 +35,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size, unsigned long attrs) { - void *ret = dma_alloc_from_global_coherent(size, dma_handle); + void *ret = dma_alloc_from_global_coherent(size, dma_handle, gfp); /* * dma_alloc_from_global_coherent() may fail because: diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index f7d1eea..b715c9f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -160,24 +160,27 @@ static inline int is_device_dma_capable(struct device *dev) * Don't use them in device drivers. 
*/ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret); + dma_addr_t *dma_handle, void **ret, + gfp_t flag); int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle, + gfp_t flag); int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); #else -#define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) +#define dma_alloc_from_dev_coherent(dev, size, handle, ret, flag) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) static inline void *dma_alloc_from_global_coherent(ssize_t size, - dma_addr_t *dma_handle) + dma_addr_t *dma_handle, + gfp_t flag) { return NULL; } diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 29fd659..d85fab5 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -136,7 +136,7 @@ void dma_release_declared_memory(struct device *dev) EXPORT_SYMBOL(dma_release_declared_memory); static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) + ssize_t size, dma_addr_t *dma_handle, gfp_t gfp_flag) { int order = get_order(size); unsigned long flags; @@ -158,7 +158,8 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); - memset(ret, 0, size); + if (gfp_flag & __GFP_ZERO) + memset(ret, 0, size); return ret; err: spin_unlock_irqrestore(&mem->spinlock, flags); @@ -172,6 +173,7 @@ static 
void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, * @dma_handle: This will be filled with the correct dma handle * @ret: This pointer will be filled with the virtual address * to allocated area. + * @flag: gfp flag set by user * * This function should be only called from per-arch dma_alloc_coherent() * to support allocation from per-device coherent memory pools. @@ -180,24 +182,25 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, * generic memory areas, or !0 if dma_alloc_coherent should return @ret. */ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) + dma_addr_t *dma_handle, void **ret, gfp_t flag) { struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); if (!mem) return 0; - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + *ret = __dma_alloc_from_coherent(mem, size, dma_handle, flag); return 1; } -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle, + gfp_t flag) { if (!dma_coherent_default_memory) return NULL; return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); + dma_handle, flag); } static int __dma_release_from_coherent(struct dma_coherent_mem *mem, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b0038ca..bfea1d2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -272,7 +272,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, WARN_ON_ONCE(!dev->coherent_dma_mask); - if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr, flag)) return cpu_addr; /* let the implementation decide on the zone to allocate from: */ -- 2.7.4