On Sat, Oct 03, 2020 at 04:02:57AM +0000, John Stultz wrote:
> This adds a heap that allocates non-contiguous buffers that are
> marked as writecombined, so they are not cached by the CPU.
> 
> This is useful, as most graphics buffers are usually not touched
> by the CPU or only written into once by the CPU. So when mapping
> the buffer over and over between devices, we can skip the CPU
> syncing, which saves a lot of cache management overhead, greatly
> improving performance.
> 
> For folks using ION, there was an ION_FLAG_CACHED flag, which
> signaled if the returned buffer should be CPU cacheable or not.
> With DMA-BUF heaps, we do not yet have such a flag, and by default
> the current heaps (system and cma) produce CPU cacheable buffers.
> So for folks transitioning from ION to DMA-BUF Heaps, this fills
> in some of that missing functionality.
> 
> There has been a suggestion to make this functionality a flag
> (DMAHEAP_FLAG_UNCACHED?) on the system heap, similar to how
> ION used the ION_FLAG_CACHED flag. But I want to make sure an
> _UNCACHED flag would truly be a generic attribute across all
> heaps. So far that has been unclear, so having it as a separate
> heap seems better for now. (But I'm open to discussion on this
> point!)
> 
> This is a rework of earlier efforts to add an uncached system heap,
> done utilizing the existing system heap, adding just a bit of
> logic to handle the uncached case.
> 
> Feedback would be very welcome!
> 
> Many thanks to Liam Mark for his help to get this working.
> 
> Pending open-source users of this code include:
> * AOSP HiKey960 gralloc:
>   - https://android-review.googlesource.com/c/device/linaro/hikey/+/1399519
>   - Visibly improves performance over the system heap
> * AOSP Codec2 (possibly, needs more review):
>   - https://android-review.googlesource.com/c/platform/frameworks/av/+/1360640/17/media/codec2/vndk/C2DmaBufAllocator.cpp#325
> 
> Cc: Sumit Semwal <sumit.semwal@xxxxxxxxxx>
> Cc: Liam Mark <lmark@xxxxxxxxxxxxxx>
> Cc: Laura Abbott <labbott@xxxxxxxxxx>
> Cc: Brian Starkey <Brian.Starkey@xxxxxxx>
> Cc: Hridya Valsaraju <hridya@xxxxxxxxxx>
> Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Cc: Sandeep Patil <sspatil@xxxxxxxxxx>
> Cc: Daniel Mentz <danielmentz@xxxxxxxxxx>
> Cc: Chris Goldsworthy <cgoldswo@xxxxxxxxxxxxxx>
> Cc: Ørjan Eide <orjan.eide@xxxxxxx>
> Cc: Robin Murphy <robin.murphy@xxxxxxx>
> Cc: Ezequiel Garcia <ezequiel@xxxxxxxxxxxxx>
> Cc: Simon Ser <contact@xxxxxxxxxxx>
> Cc: James Jones <jajones@xxxxxxxxxx>
> Cc: linux-media@xxxxxxxxxxxxxxx
> Cc: dri-devel@xxxxxxxxxxxxxxxxxxxxx
> Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
> ---
>  drivers/dma-buf/heaps/system_heap.c | 87 ++++++++++++++++++++++++++---
>  1 file changed, 79 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 2b8d4b6abacb..952f1fd9dacf 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -22,6 +22,7 @@
>  #include <linux/vmalloc.h>
>  
>  struct dma_heap *sys_heap;
> +struct dma_heap *sys_uncached_heap;
>  
>  struct system_heap_buffer {
>         struct dma_heap *heap;
> @@ -31,6 +32,8 @@ struct system_heap_buffer {
>         struct sg_table sg_table;
>         int vmap_cnt;
>         void *vaddr;
> +
> +       bool uncached;
>  };
>  
>  struct dma_heap_attachment {
> @@ -38,6 +41,8 @@ struct dma_heap_attachment {
>         struct sg_table *table;
>         struct list_head list;
>         bool mapped;
> +
> +       bool uncached;
>  };
>  
>  #define HIGH_ORDER_GFP  (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
> @@ -94,7 +99,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
>         a->dev = attachment->dev;
>         INIT_LIST_HEAD(&a->list);
>         a->mapped = false;
> -
> +       a->uncached = buffer->uncached;
>         attachment->priv = a;
>  
>         mutex_lock(&buffer->lock);
> @@ -124,9 +129,13 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac
>  {
>         struct dma_heap_attachment *a = attachment->priv;
>         struct sg_table *table = a->table;
> +       int attr = 0;
>         int ret;
>  
> -       ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> +       if (a->uncached)
> +               attr = DMA_ATTR_SKIP_CPU_SYNC;
> +
> +       ret = dma_map_sgtable(attachment->dev, table, direction, attr);
>         if (ret)
>                 return ERR_PTR(ret);
>  
> @@ -139,9 +148,12 @@ static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment,
>                                       enum dma_data_direction direction)
>  {
>         struct dma_heap_attachment *a = attachment->priv;
> +       int attr = 0;
>  
> +       if (a->uncached)
> +               attr = DMA_ATTR_SKIP_CPU_SYNC;
>         a->mapped = false;
> -       dma_unmap_sgtable(attachment->dev, table, direction, 0);
> +       dma_unmap_sgtable(attachment->dev, table, direction, attr);
>  }
>  
>  static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> @@ -150,6 +162,9 @@ static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
>         struct system_heap_buffer *buffer = dmabuf->priv;
>         struct dma_heap_attachment *a;
>  
> +       if (buffer->uncached)
> +               return 0;
> +
>         mutex_lock(&buffer->lock);
>  
>         if (buffer->vmap_cnt)
> @@ -171,6 +186,9 @@ static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
>         struct system_heap_buffer *buffer = dmabuf->priv;
>         struct dma_heap_attachment *a;
>  
> +       if (buffer->uncached)
> +               return 0;
> +
>         mutex_lock(&buffer->lock);
>  
>         if (buffer->vmap_cnt)
> @@ -194,6 +212,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
>         struct sg_page_iter piter;
>         int ret;
>  
> +       if (buffer->uncached)
> +               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +
>         for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
>                 struct page *page = sg_page_iter_page(&piter);
>  
> @@ -215,8 +236,12 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
>         struct page **pages = vmalloc(sizeof(struct page *) * npages);
>         struct page **tmp = pages;
>         struct sg_page_iter piter;
> +       pgprot_t pgprot = PAGE_KERNEL;
>         void *vaddr;
>  
> +       if (buffer->uncached)
> +               pgprot = pgprot_writecombine(PAGE_KERNEL);

I think this should go after the 'if (!pages)' check. Best to get the
allocation failure check as close to the allocation as possible IMO.
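Something like this, purely to illustrate the reordering (untested):

        struct page **pages = vmalloc(sizeof(struct page *) * npages);
        struct page **tmp = pages;
        struct sg_page_iter piter;
        pgprot_t pgprot = PAGE_KERNEL;
        void *vaddr;

        if (!pages)
                return ERR_PTR(-ENOMEM);

        /* Only pick the vmap pgprot once we know the allocation succeeded */
        if (buffer->uncached)
                pgprot = pgprot_writecombine(PAGE_KERNEL);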
> +
>         if (!pages)
>                 return ERR_PTR(-ENOMEM);
>  
> @@ -225,7 +250,7 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
>                 *tmp++ = sg_page_iter_page(&piter);
>         }
>  
> -       vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> +       vaddr = vmap(pages, npages, VM_MAP, pgprot);
>         vfree(pages);
>  
>         if (!vaddr)
> @@ -278,6 +303,10 @@ static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
>         int i;
>  
>         table = &buffer->sg_table;
> +       /* Unmap the uncached buffers from the heap device (pairs with the map on allocate) */
> +       if (buffer->uncached)
> +               dma_unmap_sgtable(dma_heap_get_dev(buffer->heap), table, DMA_BIDIRECTIONAL, 0);
> +
>         for_each_sg(table->sgl, sg, table->nents, i) {
>                 struct page *page = sg_page(sg);
>  
> @@ -320,10 +349,11 @@ static struct page *alloc_largest_available(unsigned long size,
>         return NULL;
>  }
>  
> -static int system_heap_allocate(struct dma_heap *heap,
> -                               unsigned long len,
> -                               unsigned long fd_flags,
> -                               unsigned long heap_flags)
> +static int system_heap_do_allocate(struct dma_heap *heap,
> +                                  unsigned long len,
> +                                  unsigned long fd_flags,
> +                                  unsigned long heap_flags,
> +                                  bool uncached)
>  {
>         struct system_heap_buffer *buffer;
>         DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> @@ -344,6 +374,7 @@ static int system_heap_allocate(struct dma_heap *heap,
>         mutex_init(&buffer->lock);
>         buffer->heap = heap;
>         buffer->len = len;
> +       buffer->uncached = uncached;
>  
>         INIT_LIST_HEAD(&pages);
>         i = 0;
> @@ -393,6 +424,16 @@ static int system_heap_allocate(struct dma_heap *heap,
>                 /* just return, as put will call release and that will free */
>                 return ret;
>         }
> +
> +       /*
> +        * For uncached buffers, we need to initially flush cpu cache, since
> +        * the __GFP_ZERO on the allocation means the zeroing was done by the
> +        * cpu and thus it is likely cached. Map (and implicitly flush) it out
> +        * now so we don't get corruption later on.
> +        */
> +       if (buffer->uncached)
> +               dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);

Do we have to keep this mapping around for the entire lifetime of the
buffer?

Also, this problem (and solution) keeps lingering around. It really
feels like there should be a better way to solve "clean the linear
mapping all the way to DRAM", but I don't know what that should be.
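If the mapping is only there to get the zeroed pages cleaned out of the
CPU caches, one alternative might be to tear it down again immediately
and skip the CPU sync on the way back, so nothing has to stay mapped for
the lifetime of the buffer (and the matching unmap in the release path
could go away). Untested sketch, and I haven't thought through how well
this behaves behind an IOMMU or with swiotlb:

        if (buffer->uncached) {
                /* Map once so the zeroed pages get flushed out to DRAM... */
                dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);
                /* ...then drop the mapping without touching the caches again */
                dma_unmap_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL,
                                  DMA_ATTR_SKIP_CPU_SYNC);
        }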
> +
>         return ret;
>  
>  free_pages:
> @@ -410,10 +451,30 @@ static int system_heap_allocate(struct dma_heap *heap,
>         return ret;
>  }
>  
> +static int system_heap_allocate(struct dma_heap *heap,
> +                               unsigned long len,
> +                               unsigned long fd_flags,
> +                               unsigned long heap_flags)
> +{
> +       return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false);
> +}
> +
>  static const struct dma_heap_ops system_heap_ops = {
>         .allocate = system_heap_allocate,
>  };
>  
> +static int system_uncached_heap_allocate(struct dma_heap *heap,
> +                                        unsigned long len,
> +                                        unsigned long fd_flags,
> +                                        unsigned long heap_flags)
> +{
> +       return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true);
> +}
> +
> +static const struct dma_heap_ops system_uncached_heap_ops = {
> +       .allocate = system_uncached_heap_allocate,
> +};
> +
>  static int system_heap_create(void)
>  {
>         struct dma_heap_export_info exp_info;
> @@ -426,6 +487,16 @@ static int system_heap_create(void)
>         if (IS_ERR(sys_heap))
>                 return PTR_ERR(sys_heap);
>  
> +       exp_info.name = "system-uncached";
> +       exp_info.ops = &system_uncached_heap_ops;
> +       exp_info.priv = NULL;
> +
> +       sys_uncached_heap = dma_heap_add(&exp_info);
> +       if (IS_ERR(sys_uncached_heap))
> +               return PTR_ERR(sys_heap);
> +

In principle, there's a race here between the heap getting registered
to sysfs and the dma_mask getting updated. I don't think it would cause
a problem in practice, but with the API as it is, there's no way to
avoid it - so I wonder if the dma_heap_get_dev() accessor isn't the
right API design (a rough sketch of one alternative is at the end of
this mail).

Cheers,
-Brian

> +       dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), DMA_BIT_MASK(64));
> +
>         return 0;
>  }
>  module_init(system_heap_create);
> -- 
> 2.17.1
> 
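P.S. To sketch the kind of ordering I mean above: this is purely
hypothetical - dma_heap_export_info has no dma_mask field today - but if
the heap could describe its DMA capabilities up front, dma_heap_add()
could configure the device before it ever becomes visible in sysfs:

        exp_info.name = "system-uncached";
        exp_info.ops = &system_uncached_heap_ops;
        exp_info.priv = NULL;
        /* hypothetical field: applied to the heap device before registration */
        exp_info.dma_mask = DMA_BIT_MASK(64);

        sys_uncached_heap = dma_heap_add(&exp_info);
        if (IS_ERR(sys_uncached_heap))
                return PTR_ERR(sys_uncached_heap);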