On Mon, 15 Nov 2021 14:19:21 +0000
Paul Cercueil <paul@xxxxxxxxxxxxxxx> wrote:

> We can be certain that the input buffers will only be accessed by
> userspace for reading, and output buffers will mostly be accessed by
> userspace for writing.

Mostly? Perhaps a little more info on why that's not 'only'.

> 
> Therefore, it makes more sense to use only fully cached input buffers,
> and to use the write-combine cache coherency setting for output buffers.
> 
> This boosts performance, as the data written to the output buffers does
> not have to be sync'd for coherency. It will halve performance if the
> userspace application tries to read from the output buffer, but this
> should never happen.
> 
> Since we don't need to sync the cache when disabling CPU access either
> for input buffers or output buffers, the .end_cpu_access() callback can
> be dropped completely.

We have an odd mix of coherent and non-coherent DMA in here, as you noted,
but are you sure this is safe on all platforms?

> 
> Signed-off-by: Paul Cercueil <paul@xxxxxxxxxxxxxxx>

Any numbers to support this patch? The mapping types are performance
optimisations, so it would be nice to know how much of a difference they
make.

> ---
>  drivers/iio/buffer/industrialio-buffer-dma.c | 82 +++++++++++++-------
>  1 file changed, 54 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
> index 92356ee02f30..fb39054d8c15 100644
> --- a/drivers/iio/buffer/industrialio-buffer-dma.c
> +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
> @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct dma_buf *dbuf,
>  	if (vma->vm_ops->open)
>  		vma->vm_ops->open(vma);
>  
> -	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> -			      virt_to_page(block->vaddr));
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		/*
> +		 * With an input buffer, userspace will only read the data and
> +		 * never write. We can mmap the buffer fully cached.
> +		 */
> +		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> +				      virt_to_page(block->vaddr));
> +	} else {
> +		/*
> +		 * With an output buffer, userspace will only write the data
> +		 * and should rarely (if ever) read from it. It is better to
> +		 * use write-combine in this case.
> +		 */
> +		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
> +				   vma->vm_end - vma->vm_start);
> +	}
> +}
> +
> +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
> +		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
> +	else
> +		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
>  }
>  
>  static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
> @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
>  
>  	mutex_lock(&queue->lock);
>  
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
> -			  block->vaddr, block->phys_addr);
> -
> +	iio_dma_buffer_free_dmamem(block);
>  	kfree(block);
>  
>  	queue->num_blocks--;
> @@ -268,19 +291,6 @@ static int iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
>  	return 0;
>  }
>  
> -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
> -					     enum dma_data_direction dma_dir)
> -{
> -	struct iio_dma_buffer_block *block = dbuf->priv;
> -	struct device *dev = block->queue->dev;
> -
> -	/* We only need to sync the cache for output buffers */
> -	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
> -		dma_sync_single_for_device(dev, block->phys_addr, block->size, dma_dir);
> -
> -	return 0;
> -}
> -
>  static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>  	.attach = iio_buffer_dma_buf_attach,
>  	.map_dma_buf = iio_buffer_dma_buf_map,
> @@ -288,9 +298,28 @@ static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>  	.mmap = iio_buffer_dma_buf_mmap,
>  	.release = iio_buffer_dma_buf_release,
>  	.begin_cpu_access = iio_buffer_dma_buf_begin_cpu_access,
> -	.end_cpu_access = iio_buffer_dma_buf_end_cpu_access,
>  };
>  
> +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		block->vaddr = dma_alloc_coherent(dev, size,
> +						  &block->phys_addr,
> +						  GFP_KERNEL);
> +	} else {
> +		block->vaddr = dma_alloc_wc(dev, size,
> +					    &block->phys_addr,
> +					    GFP_KERNEL);
> +	}
> +	if (!block->vaddr)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
>  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
>  {
> @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	if (!block)
>  		return ERR_PTR(-ENOMEM);
>  
> -	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
> -					  &block->phys_addr, GFP_KERNEL);
> -	if (!block->vaddr) {
> -		err = -ENOMEM;
> +	block->size = size;
> +	block->queue = queue;
> +
> +	err = iio_dma_buffer_alloc_dmamem(block);
> +	if (err)
>  		goto err_free_block;
> -	}
>  
>  	einfo.ops = &iio_dma_buffer_dmabuf_ops;
>  	einfo.size = PAGE_ALIGN(size);
> @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	}
>  
>  	block->dmabuf = dmabuf;
> -	block->size = size;
>  	block->bytes_used = size;
>  	block->state = IIO_BLOCK_STATE_DONE;
> -	block->queue = queue;
>  	block->fileio = fileio;
>  	INIT_LIST_HEAD(&block->head);
>  
> @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	return block;
>  
>  err_free_dma:
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
> -			  block->vaddr, block->phys_addr);
> +	iio_dma_buffer_free_dmamem(block);
>  err_free_block:
>  	kfree(block);
>  	return ERR_PTR(err);
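
On the usage model the commit message describes: the sketch below is roughly
what I understand the userspace side of an output buffer to look like, i.e.
sequential writes into the mapping and no read-back. To be clear, this is
only an illustration and not code from this series; the fill_output_block()
helper and the assumption that userspace already holds a dmabuf fd for the
block are made up for the example.

#include <string.h>
#include <sys/mman.h>

/*
 * Hypothetical userspace helper, not part of this patch: fill one output
 * block through its dmabuf fd. With the proposed write-combine mapping the
 * sequential memcpy() is cheap, since the stores get merged in the write
 * buffer, but any read from 'dst' would bypass the cache and be very slow,
 * which is why the commit message says userspace should never read an
 * output buffer.
 */
static int fill_output_block(int dmabuf_fd, const void *src, size_t len)
{
	void *dst;

	/* Write-only mapping: userspace has no reason to read this back. */
	dst = mmap(NULL, len, PROT_WRITE, MAP_SHARED, dmabuf_fd, 0);
	if (dst == MAP_FAILED)
		return -1;

	memcpy(dst, src, len);	/* sequential writes only */

	return munmap(dst, len);
}

Asking for PROT_WRITE only at least documents the intent, even on
architectures where write permission implies read permission.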