On Tue, 11 May 2021, Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> wrote: > The internal ttm_bo_util memcpy uses vmap functionality, and while it > probably might be possible to use it for copying in- and out of > sglist represented io memory, using io_mem_reserve() / io_mem_free() > callbacks, that would cause problems with fault(). > Instead, implement a method mapping page-by-page using kmap_local() > semantics. As an additional benefit we then avoid the occasional global > TLB flushes of vmap() and consuming vmap space, elimination of a critical > point of failure and with a slight change of semantics we could also push > the memcpy out async for testing and async driver develpment purposes. > Pushing out async can be done since there is no memory allocation going on > that could violate the dma_fence lockdep rules. > > Note that drivers that don't want to use struct io_mapping but relies on > memremap functionality, and that don't want to use scatterlists for > VRAM may well define specialized (hopefully reusable) iterators for their > particular environment. > > Cc: Christian König <christian.koenig@xxxxxxx> > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/Makefile | 1 + > .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c | 155 ++++++++++++++++++ > .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h | 141 ++++++++++++++++ > drivers/gpu/drm/ttm/ttm_bo.c | 1 + > 4 files changed, 298 insertions(+) > create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c > create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h > > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile > index cb8823570996..958ccc1edfed 100644 > --- a/drivers/gpu/drm/i915/Makefile > +++ b/drivers/gpu/drm/i915/Makefile > @@ -155,6 +155,7 @@ gem-y += \ > gem/i915_gem_stolen.o \ > gem/i915_gem_throttle.o \ > gem/i915_gem_tiling.o \ > + gem/i915_gem_ttm_bo_util.o \ > gem/i915_gem_userptr.o \ > gem/i915_gem_wait.o \ > gem/i915_gemfs.o > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c > new file mode 100644 > index 000000000000..1116d7df1461 > --- /dev/null > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c > @@ -0,0 +1,155 @@ > +// SPDX-License-Identifier: MIT > +/* > + * Copyright © 2021 Intel Corporation > + */ > + > +/** > + * DOC: Usage and intentions. > + * > + * This file contains functionality that we might want to move into > + * ttm_bo_util.c if there is a common interest. > + * Currently a kmap_local only memcpy with support for page-based iomem regions, > + * and fast memcpy from write-combined memory. > + */ > + > +#include <linux/dma-buf-map.h> > +#include <linux/highmem.h> > +#include <linux/io-mapping.h> > +#include <linux/scatterlist.h> > + > +#include "i915_memcpy.h" > + > +#include "gem/i915_gem_ttm_bo_util.h" > + > +static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter, > + struct dma_buf_map *dmap, > + pgoff_t i) > +{ > + struct i915_ttm_kmap_iter_tt *iter_tt = > + container_of(iter, typeof(*iter_tt), base); > + > + dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i])); > +} > + > +static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter *iter, > + struct dma_buf_map *dmap, > + pgoff_t i) > +{ > + struct i915_ttm_kmap_iter_iomap *iter_io = > + container_of(iter, typeof(*iter_io), base); > + void __iomem *addr; > + > +retry: > + while (i >= iter_io->cache.end) { > + iter_io->cache.sg = iter_io->cache.sg ? > + sg_next(iter_io->cache.sg) : iter_io->st->sgl; > + iter_io->cache.i = iter_io->cache.end; > + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >> > + PAGE_SHIFT; > + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) - > + iter_io->start; > + } > + > + if (i < iter_io->cache.i) { > + iter_io->cache.end = 0; > + iter_io->cache.sg = NULL; > + goto retry; > + } > + > + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs + > + (((resource_size_t)i - iter_io->cache.i) > + << PAGE_SHIFT)); > + dma_buf_map_set_vaddr_iomem(dmap, addr); > +} > + > +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = { > + .kmap_local = i915_ttm_kmap_iter_tt_kmap_local > +}; > + > +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = { > + .kmap_local = i915_ttm_kmap_iter_iomap_kmap_local > +}; > + > +static void kunmap_local_dma_buf_map(struct dma_buf_map *map) > +{ > + if (map->is_iomem) > + io_mapping_unmap_local(map->vaddr_iomem); > + else > + kunmap_local(map->vaddr); > +} > + > +/** > + * i915_ttm_move_memcpy - Helper to perform a memcpy ttm move operation. > + * @bo: The struct ttm_buffer_object. > + * @new_mem: The struct ttm_resource we're moving to (copy destination). > + * @new_kmap: A struct i915_ttm_kmap_iter representing the destination resource. > + * @old_kmap: A struct i915_ttm_kmap_iter representing the source resource. > + */ > +void i915_ttm_move_memcpy(struct ttm_buffer_object *bo, > + struct ttm_resource *new_mem, > + struct i915_ttm_kmap_iter *new_kmap, > + struct i915_ttm_kmap_iter *old_kmap) > +{ > + struct ttm_device *bdev = bo->bdev; > + struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type); > + struct ttm_tt *ttm = bo->ttm; > + struct ttm_resource *old_mem = &bo->mem; > + struct ttm_resource old_copy = *old_mem; > + struct ttm_resource_manager *old_man = ttm_manager_type(bdev, old_mem->mem_type); > + struct dma_buf_map old_map, new_map; > + pgoff_t i; > + > + /* For the page-based allocator we need sgtable iterators as well.*/ > + > + /* Single TTM move. NOP */ > + if (old_man->use_tt && man->use_tt) > + goto done; > + > + /* Don't move nonexistent data. Clear destination instead. */ > + if (old_man->use_tt && !man->use_tt && > + (ttm == NULL || !ttm_tt_is_populated(ttm))) { > + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)) > + goto done; > + > + for (i = 0; i < new_mem->num_pages; ++i) { > + new_kmap->ops->kmap_local(new_kmap, &new_map, i); > + memset_io(new_map.vaddr_iomem, 0, PAGE_SIZE); > + kunmap_local_dma_buf_map(&new_map); > + } > + goto done; > + } > + > + for (i = 0; i < new_mem->num_pages; ++i) { > + new_kmap->ops->kmap_local(new_kmap, &new_map, i); > + old_kmap->ops->kmap_local(old_kmap, &old_map, i); > + if (!old_map.is_iomem || > + !i915_memcpy_from_wc(new_map.vaddr, old_map.vaddr, PAGE_SIZE)) { > + if (!old_map.is_iomem) { > + dma_buf_map_memcpy_to(&new_map, old_map.vaddr, > + PAGE_SIZE); > + } else if (!new_map.is_iomem) { > + memcpy_fromio(new_map.vaddr, old_map.vaddr_iomem, > + PAGE_SIZE); > + } else { > + pgoff_t j; > + u32 __iomem *src = old_map.vaddr_iomem; > + u32 __iomem *dst = new_map.vaddr_iomem; > + > + for (j = 0; j < (PAGE_SIZE >> 2); ++j) > + iowrite32(ioread32(src++), dst++); > + } > + } > + kunmap_local_dma_buf_map(&old_map); > + kunmap_local_dma_buf_map(&new_map); > + } > + > +done: > + old_copy = *old_mem; > + > + ttm_bo_assign_mem(bo, new_mem); > + > + if (!man->use_tt) > + ttm_bo_tt_destroy(bo); > + > + ttm_resource_free(bo, &old_copy); > +} > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h > new file mode 100644 > index 000000000000..82c92176718d > --- /dev/null > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h > @@ -0,0 +1,141 @@ > +/* SPDX-License-Identifier: MIT */ > +/* > + * Copyright © 2021 Intel Corporation > + */ > + > +/* > + * This files contains functionality that we might want to move into > + * ttm_bo_util.c if there is a common interest. > + */ > +#ifndef _I915_GEM_TTM_BO_UTIL_H_ > +#define _I915_GEM_TTM_BO_UTIL_H_ > + > +#include <drm/ttm/ttm_bo_driver.h> > +struct dma_buf_map; > +struct io_mapping; > +struct sg_table; > +struct scatterlist; > + > +struct ttm_tt; > +struct i915_ttm_kmap_iter; > + > +/** > + * struct i915_ttm_kmap_iter_ops - Ops structure for a struct > + * i915_ttm_kmap_iter. > + */ > +struct i915_ttm_kmap_iter_ops { > + /** > + * kmap_local - Map a PAGE_SIZE part of the resource using > + * kmap_local semantics. > + * @res_kmap: Pointer to the struct i915_ttm_kmap_iter representing > + * the resource. > + * @dmap: The struct dma_buf_map holding the virtual address after > + * the operation. > + * @i: The location within the resource to map. PAGE_SIZE granularity. > + */ > + void (*kmap_local)(struct i915_ttm_kmap_iter *res_kmap, > + struct dma_buf_map *dmap, pgoff_t i); > +}; > + > +/** > + * struct i915_ttm_kmap_iter - Iterator for kmap_local type operations on a > + * resource. > + * @ops: Pointer to the operations struct. > + * > + * This struct is intended to be embedded in a resource-specific specialization > + * implementing operations for the resource. > + * > + * Nothing stops us from extending the operations to vmap, vmap_pfn etc, > + * replacing some or parts of the ttm_bo_util. cpu-map functionality. > + */ > +struct i915_ttm_kmap_iter { > + const struct i915_ttm_kmap_iter_ops *ops; > +}; > + > +/** > + * struct i915_ttm_kmap_iter_tt - Specialization for a tt (page) backed struct > + * ttm_resource. > + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface > + * @tt: Cached struct ttm_tt. > + */ > +struct i915_ttm_kmap_iter_tt { > + struct i915_ttm_kmap_iter base; > + struct ttm_tt *tt; > +}; > + > +/** > + * struct i915_ttm_kmap_iter_iomap - Specialization for a struct io_mapping + > + * struct sg_table backed struct ttm_resource. > + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface. > + * @iomap: struct io_mapping representing the underlying linear io_memory. > + * @st: sg_table into @iomap, representing the memory of the struct ttm_resource. > + * @start: Offset that needs to be subtracted from @st to make > + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. > + * @cache: Scatterlist traversal cache for fast lookups. > + * @cache.sg: Pointer to the currently cached scatterlist segment. > + * @cache.i: First index of @sg. PAGE_SIZE granularity. > + * @cache.end: Last index + 1 of @sg. PAGE_SIZE granularity. > + * @cache.offs: First offset into @iomap of @sg. PAGE_SIZE granularity. > + */ > +struct i915_ttm_kmap_iter_iomap { > + struct i915_ttm_kmap_iter base; > + struct io_mapping *iomap; > + struct sg_table *st; > + resource_size_t start; > + struct { > + struct scatterlist *sg; > + pgoff_t i; > + pgoff_t end; > + pgoff_t offs; > + } cache; > +}; > + > +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops; > +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops; > + > +/** > + * i915_ttm_kmap_iter_iomap_init - Initialize a struct i915_ttm_kmap_iter_iomap > + * @iter_io: The struct i915_ttm_kmap_iter_iomap to initialize. > + * @iomap: The struct io_mapping representing the underlying linear io_memory. > + * @st: sg_table into @iomap, representing the memory of the struct > + * ttm_resource. > + * @start: Offset that needs to be subtracted from @st to make > + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. > + * > + * Return: Pointer to the embedded struct i915_ttm_kmap_iter. > + */ > +static inline struct i915_ttm_kmap_iter * > +i915_ttm_kmap_iter_iomap_init(struct i915_ttm_kmap_iter_iomap *iter_io, > + struct io_mapping *iomap, > + struct sg_table *st, > + resource_size_t start) > +{ > + iter_io->base.ops = &i915_ttm_kmap_iter_io_ops; > + iter_io->iomap = iomap; > + iter_io->st = st; > + iter_io->start = start; > + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); > + return &iter_io->base; > +} > + > +/** > + * ttm_kmap_iter_tt_init - Initialize a struct i915_ttm_kmap_iter_tt > + * @iter_tt: The struct i915_ttm_kmap_iter_tt to initialize. > + * @tt: Struct ttm_tt holding page pointers of the struct ttm_resource. > + * > + * Return: Pointer to the embedded struct i915_ttm_kmap_iter. > + */ > +static inline struct i915_ttm_kmap_iter * > +i915_ttm_kmap_iter_tt_init(struct i915_ttm_kmap_iter_tt *iter_tt, > + struct ttm_tt *tt) > +{ > + iter_tt->base.ops = &i915_ttm_kmap_iter_tt_ops; > + iter_tt->tt = tt; > + return &iter_tt->base; > +} Do there functions have a valid *performance* reason to be inline? I think that's pretty much the only valid reason. Having these inline forces i915_ttm_kmap_iter_*_ops extern, and they should really be static. Inline functions complicate header dependencies and leak the abstractions. BR, Jani. > + > +void i915_ttm_move_memcpy(struct ttm_buffer_object *bo, > + struct ttm_resource *new_mem, > + struct i915_ttm_kmap_iter *new_iter, > + struct i915_ttm_kmap_iter *old_iter); > +#endif > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c > index ca1b098b6a56..4479c55aaa1d 100644 > --- a/drivers/gpu/drm/ttm/ttm_bo.c > +++ b/drivers/gpu/drm/ttm/ttm_bo.c > @@ -1221,3 +1221,4 @@ void ttm_bo_tt_destroy(struct ttm_buffer_object *bo) > ttm_tt_destroy(bo->bdev, bo->ttm); > bo->ttm = NULL; > } > +EXPORT_SYMBOL(ttm_bo_tt_destroy); -- Jani Nikula, Intel Open Source Graphics Center