On Mon, 14 Jun 2021 at 11:32, Thomas Hellström
<thomas.hellstrom@xxxxxxxxxxxxxxx> wrote:
>
>
> On 6/14/21 12:20 PM, Matthew Auld wrote:
> > On Mon, 14 Jun 2021 at 10:53, Thomas Hellström
> > <thomas.hellstrom@xxxxxxxxxxxxxxx> wrote:
> >> After a TTM move or object init we need to update the i915 gem flags and
> >> caching settings to reflect the new placement. Currently caching settings
> >> are not changed during the lifetime of an object, although that might
> >> change moving forward if we run into performance issues or issues with
> >> WC system page allocations.
> >> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
> >> various ways we previously used to detect this.
> >> Finally, initialize the TTM object reserved to be able to update
> >> flags and caching before anyone else gets hold of the object.
> >>
> >> Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
> >> ---
> >> v2:
> >> - Style fixes (Reported by Matthew Auld)
> >> v3:
> >> - More style fixes. Clarify why we're updating caching settings after move.
> >> (Suggested by Matthew Auld)
> >> ---
> >>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
> >>  1 file changed, 89 insertions(+), 22 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> index 33ab47f1e05b..5176682a7d19 100644
> >> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> @@ -70,6 +70,17 @@ static struct ttm_placement i915_sys_placement = {
> >>          .busy_placement = &lmem0_sys_placement_flags[1],
> >>  };
> >>
> >> +static bool gpu_binds_iomem(struct ttm_resource *mem)
> >> +{
> >> +        return mem->mem_type != TTM_PL_SYSTEM;
> >> +}
> >> +
> >> +static bool cpu_maps_iomem(struct ttm_resource *mem)
> >> +{
> >> +        /* Once / if we support GGTT, this is also false for cached ttm_tts */
> >> +        return mem->mem_type != TTM_PL_SYSTEM;
> >> +}
> >> +
> >>  static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
> >>
> >>  static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
> >> @@ -175,6 +186,40 @@ static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
> >>          obj->ttm.cached_io_st = NULL;
> >>  }
> >>
> >> +static void
> >> +i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
> >> +{
> >> +        struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> +
> >> +        if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
> >> +                obj->write_domain = I915_GEM_DOMAIN_WC;
> >> +                obj->read_domains = I915_GEM_DOMAIN_WC;
> >> +        } else {
> >> +                obj->write_domain = I915_GEM_DOMAIN_CPU;
> >> +                obj->read_domains = I915_GEM_DOMAIN_CPU;
> >> +        }
> >> +}
> >> +
> >> +static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
> >> +{
> >> +        struct drm_i915_private *i915 = to_i915(obj->base.dev);
> >> +        struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> +        unsigned int cache_level;
> >> +
> >> +        obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
> >> +
> >> +        obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
> >> +                I915_BO_FLAG_STRUCT_PAGE;
> >> +
> >> +        if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
> >> +            bo->ttm->caching == ttm_cached)
> >> +                cache_level = I915_CACHE_LLC;
> >> +        else
> >> +                cache_level = I915_CACHE_NONE;
> >> +
> >> +        i915_gem_object_set_cache_coherency(obj, cache_level);
> >> +}
> >> +
> >>  static void i915_ttm_purge(struct drm_i915_gem_object *obj)
> >>  {
> >>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> @@ -190,8 +235,10 @@ static void i915_ttm_purge(struct drm_i915_gem_object *obj)
> >>
> >>          /* TTM's purge interface. Note that we might be reentering. */
> >>          ret = ttm_bo_validate(bo, &place, &ctx);
> >> -
> >>          if (!ret) {
> >> +                obj->write_domain = 0;
> >> +                obj->read_domains = 0;
> >> +                i915_ttm_adjust_gem_after_move(obj);
> >>                  i915_ttm_free_cached_io_st(obj);
> >>                  obj->mm.madv = __I915_MADV_PURGED;
> >>          }
> >> @@ -273,12 +320,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
> >>                           struct ttm_resource *res)
> >>  {
> >>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> -        struct ttm_resource_manager *man =
> >> -                ttm_manager_type(bo->bdev, res->mem_type);
> >>
> >> -        if (man->use_tt)
> >> +        if (!gpu_binds_iomem(res))
> >>                  return i915_ttm_tt_get_st(bo->ttm);
> >>
> >> +        /*
> >> +         * If CPU mapping differs, we need to add the ttm_tt pages to
> >> +         * the resulting st. Might make sense for GGTT.
> >> +         */
> >> +        GEM_WARN_ON(!cpu_maps_iomem(res));
> >>          return intel_region_ttm_node_to_st(obj->mm.region, res);
> >>  }
> >>
> >> @@ -290,8 +340,6 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
> >>          struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
> >>          struct ttm_resource_manager *dst_man =
> >>                  ttm_manager_type(bo->bdev, dst_mem->mem_type);
> >> -        struct ttm_resource_manager *src_man =
> >> -                ttm_manager_type(bo->bdev, bo->resource->mem_type);
> >>          struct intel_memory_region *dst_reg, *src_reg;
> >>          union {
> >>                  struct ttm_kmap_iter_tt tt;
> >> @@ -332,34 +380,36 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
> >>          if (IS_ERR(dst_st))
> >>                  return PTR_ERR(dst_st);
> >>
> >> -        /* If we start mapping GGTT, we can no longer use man::use_tt here. */
> >> -        dst_iter = dst_man->use_tt ?
> >> +        dst_iter = !cpu_maps_iomem(dst_mem) ?
> >>                  ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
> >>                  ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
> >>                                           dst_st, dst_reg->region.start);
> >>
> >> -        src_iter = src_man->use_tt ?
> >> +        src_iter = !cpu_maps_iomem(bo->resource) ?
> >>                  ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
> >>                  ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
> >>                                           obj->ttm.cached_io_st,
> >>                                           src_reg->region.start);
> >>
> >>          ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
> >> +        /* Below dst_mem becomes bo->resource. */
> >>          ttm_bo_move_sync_cleanup(bo, dst_mem);
> >> +        i915_ttm_adjust_domains_after_cpu_move(obj);
> >>          i915_ttm_free_cached_io_st(obj);
> >>
> >> -        if (!dst_man->use_tt) {
> >> +        if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
> >>                  obj->ttm.cached_io_st = dst_st;
> >>                  obj->ttm.get_io_page.sg_pos = dst_st->sgl;
> >>                  obj->ttm.get_io_page.sg_idx = 0;
> >>          }
> >>
> >> +        i915_ttm_adjust_gem_after_move(obj);
> >>          return 0;
> >>  }
> >>
> >>  static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
> >>  {
> >> -        if (mem->mem_type < I915_PL_LMEM0)
> >> +        if (!cpu_maps_iomem(mem))
> >>                  return 0;
> >>
> >>          mem->bus.caching = ttm_write_combined;
> >> @@ -421,6 +471,16 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
> >>          if (ret)
> >>                  return ret == -ENOSPC ? -ENXIO : ret;
> >>
> >> +        i915_ttm_adjust_lru(obj);
> >> +        if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
> >> +                ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
> >> +                if (ret)
> >> +                        return ret;
> >> +
> >> +                i915_ttm_adjust_domains_after_cpu_move(obj);
> >> +                i915_ttm_adjust_gem_after_move(obj);
> >> +        }
> >> +
> >>          /* Object either has a page vector or is an iomem object */
> >>          st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
> >>          if (IS_ERR(st))
> >> @@ -428,8 +488,6 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
> >>
> >>          __i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
> >>
> >> -        i915_ttm_adjust_lru(obj);
> >> -
> >>          return ret;
> >>  }
> >>
> >> @@ -563,6 +621,7 @@ static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
> >>
> >>  const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
> >>          .name = "i915_gem_object_ttm",
> >> +        .flags = I915_GEM_OBJECT_IS_SHRINKABLE,
> > We set is_shrinkable for both lmem and smem? Does the current shrinker
> > work with !shmem? I assume the put_pages() will just discard the page
> > contents, which is not what we want? Maybe keep this disabled for now?
> > Or am I missing something?
>
> This is to indicate to the gem shrinker that we want to be able to
> determine dynamically whether the object is shrinkable (by the gem
> shrinker) or not. (See below). It's intended for the situation where we
> might want to use shmem to back cached-only ttm objects, and use our own
> shrinker.
>
> But if you think it's better, we could flip that ops flag once we
> actually have anything shrinkable in place. Let me know what you think.
> The TTM shmem pool idea becomes a bit doubtful by the fact that
> allocating some 8GB of system objects, populating and clearing them
> appears to be some 6X faster with a straightforward alloc approach (like
> TTM or the "internal" backend) than with shmem...

I would vote for keeping it turned off for now.

>
> >>          .get_pages = i915_ttm_get_pages,
> >>          .put_pages = i915_ttm_put_pages,
> >> @@ -599,6 +658,10 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
> >>  {
> >>          static struct lock_class_key lock_class;
> >>          struct drm_i915_private *i915 = mem->i915;
> >> +        struct ttm_operation_ctx ctx = {
> >> +                .interruptible = true,
> >> +                .no_wait_gpu = false,
> >> +        };
> >>          enum ttm_bo_type bo_type;
> >>          size_t alignment = 0;
> >>          int ret;
> >> @@ -618,15 +681,14 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
> >>          i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
> >>          i915_gem_object_init_memory_region(obj, mem);
> >>          i915_gem_object_make_unshrinkable(obj);
>
> Here we set the object to unshrinkable.

Oh, I totally missed that. But I don't think we are meant to call that
here, especially with IS_SHRINKABLE, since object_set_pages will later
make it shrinkable anyway, which looks like fun and games.

>
> /Thomas
>
>
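For readers following along outside the driver tree, the placement-driven
decisions the patch makes after a move can be summarised in a small
standalone model. This is only an illustrative userspace sketch: the
model_* types, adjust_after_move() and the has_llc_or_snoop parameter are
invented stand-ins, not the real TTM or i915 definitions; the branch
structure mirrors i915_ttm_adjust_domains_after_cpu_move() and
i915_ttm_adjust_gem_after_move() quoted above.

/*
 * Userspace model of the placement -> gem-state decisions made in the
 * patch. All names here are simplified stand-ins for the real
 * TTM/i915 structures.
 */
#include <stdbool.h>
#include <stdio.h>

enum model_mem_type { MODEL_PL_SYSTEM, MODEL_PL_LMEM };
enum model_caching { MODEL_CACHED, MODEL_WRITE_COMBINED };
enum model_domain { MODEL_DOMAIN_CPU, MODEL_DOMAIN_WC };
enum model_cache_level { MODEL_CACHE_NONE, MODEL_CACHE_LLC };

struct model_placement {
        enum model_mem_type mem_type;   /* where the backing store lives */
        enum model_caching caching;     /* caching of the CPU-visible mapping */
};

/* Today both helpers collapse to "not system memory"... */
static bool gpu_binds_iomem(const struct model_placement *p)
{
        return p->mem_type != MODEL_PL_SYSTEM;
}

/* ...but a GGTT-bound, cached ttm_tt would keep this one false. */
static bool cpu_maps_iomem(const struct model_placement *p)
{
        return p->mem_type != MODEL_PL_SYSTEM;
}

struct model_gem_state {
        bool flag_iomem;                /* I915_BO_FLAG_IOMEM analogue */
        enum model_domain domains;      /* read/write domains after the move */
        enum model_cache_level cache;   /* cache level fed to coherency setup */
};

static struct model_gem_state
adjust_after_move(const struct model_placement *p, bool has_llc_or_snoop)
{
        struct model_gem_state s;

        /* CPU-visible through an iomap -> IOMEM flag, else struct pages. */
        s.flag_iomem = cpu_maps_iomem(p);

        /* Anything iomem-mapped or not cache-coherent is accessed as WC. */
        if (cpu_maps_iomem(p) || p->caching != MODEL_CACHED)
                s.domains = MODEL_DOMAIN_WC;
        else
                s.domains = MODEL_DOMAIN_CPU;

        /* LLC cache level only when the GPU binds cached system pages. */
        if (has_llc_or_snoop && !gpu_binds_iomem(p) &&
            p->caching == MODEL_CACHED)
                s.cache = MODEL_CACHE_LLC;
        else
                s.cache = MODEL_CACHE_NONE;

        return s;
}

int main(void)
{
        struct model_placement lmem = { MODEL_PL_LMEM, MODEL_WRITE_COMBINED };
        struct model_placement smem = { MODEL_PL_SYSTEM, MODEL_CACHED };
        struct model_gem_state a = adjust_after_move(&lmem, true);
        struct model_gem_state b = adjust_after_move(&smem, true);

        printf("lmem: iomem=%d wc=%d llc=%d\n",
               a.flag_iomem, a.domains == MODEL_DOMAIN_WC,
               a.cache == MODEL_CACHE_LLC);
        printf("smem: iomem=%d wc=%d llc=%d\n",
               b.flag_iomem, b.domains == MODEL_DOMAIN_WC,
               b.cache == MODEL_CACHE_LLC);
        return 0;
}

Built with any C compiler, this reports WC domains plus the IOMEM flag for
the lmem placement, and CPU domains plus an LLC cache level for cached
system pages on an LLC/snooping platform, which is the behaviour the
commit message describes.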