On Mon, Aug 10, 2020 at 12:30:40PM +0200, Maarten Lankhorst wrote:
> This reverts commit 9e0f9464e2ab36b864359a59b0e9058fdef0ce47,

Missed one, you need to dim cite the above so the commit subject is
included. Also too long sha1.
-Daniel

> and related commit 7ac2d2536dfa7 ("drm/i915/gem: Delete unused code").
> 
> Async GPU relocations are not the path forward, we want to remove
> GPU accelerated relocation support eventually when userspace is fixed
> to use VM_BIND, and this is the first step towards that. We will keep
> async gpu relocations around for now, until userspace is fixed.
> 
> Relocation support will be disabled completely on platforms where there
> was never any userspace that depends on it, as the hardware doesn't
> require it from at least gen9+ onward. For older platforms, the plan
> is to use cpu relocations only.
> 
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxxxx>
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 319 ++++++++++++++++--
>  .../i915/gem/selftests/i915_gem_execbuffer.c  |  21 +-
>  2 files changed, 313 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 24a1486d2dc5..c6a613d92a13 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -46,6 +46,13 @@ struct eb_vma_array {
>          struct eb_vma vma[];
>  };
>  
> +enum {
> +        FORCE_CPU_RELOC = 1,
> +        FORCE_GTT_RELOC,
> +        FORCE_GPU_RELOC,
> +#define DBG_FORCE_RELOC 0 /* choose one of the above! */
> +};
> +
>  #define __EXEC_OBJECT_HAS_PIN BIT(31)
>  #define __EXEC_OBJECT_HAS_FENCE BIT(30)
>  #define __EXEC_OBJECT_NEEDS_MAP BIT(29)
> @@ -261,6 +268,8 @@ struct i915_execbuffer {
>           */
>          struct reloc_cache {
>                  struct drm_mm_node node; /** temporary GTT binding */
> +                unsigned long vaddr; /** Current kmap address */
> +                unsigned long page; /** Currently mapped page index */
>                  unsigned int gen; /** Cached value of INTEL_GEN */
>                  bool use_64bit_reloc : 1;
>                  bool has_llc : 1;
> @@ -607,6 +616,23 @@ eb_add_vma(struct i915_execbuffer *eb,
>          }
>  }
>  
> +static inline int use_cpu_reloc(const struct reloc_cache *cache,
> +                                const struct drm_i915_gem_object *obj)
> +{
> +        if (!i915_gem_object_has_struct_page(obj))
> +                return false;
> +
> +        if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
> +                return true;
> +
> +        if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
> +                return false;
> +
> +        return (cache->has_llc ||
> +                obj->cache_dirty ||
> +                obj->cache_level != I915_CACHE_NONE);
> +}
> +
>  static int eb_reserve_vma(const struct i915_execbuffer *eb,
>                            struct eb_vma *ev,
>                            u64 pin_flags)
> @@ -937,6 +963,8 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
>  static void reloc_cache_init(struct reloc_cache *cache,
>                               struct drm_i915_private *i915)
>  {
> +        cache->page = -1;
> +        cache->vaddr = 0;
>          /* Must be a variable in the struct to allow GCC to unroll. */
>          cache->gen = INTEL_GEN(i915);
>          cache->has_llc = HAS_LLC(i915);
> @@ -948,6 +976,25 @@ static void reloc_cache_init(struct reloc_cache *cache,
>          cache->target = NULL;
>  }
>  
> +static inline void *unmask_page(unsigned long p)
> +{
> +        return (void *)(uintptr_t)(p & PAGE_MASK);
> +}
> +
> +static inline unsigned int unmask_flags(unsigned long p)
> +{
> +        return p & ~PAGE_MASK;
> +}
> +
> +#define KMAP 0x4 /* after CLFLUSH_FLAGS */
> +
> +static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
> +{
> +        struct drm_i915_private *i915 =
> +                container_of(cache, struct i915_execbuffer, reloc_cache)->i915;
> +        return &i915->ggtt;
> +}
> +
>  #define RELOC_TAIL 4
>  
>  static int reloc_gpu_chain(struct reloc_cache *cache)
> @@ -1060,6 +1107,186 @@ static int reloc_gpu_flush(struct reloc_cache *cache)
>          return err;
>  }
>  
> +static void reloc_cache_reset(struct reloc_cache *cache)
> +{
> +        void *vaddr;
> +
> +        if (!cache->vaddr)
> +                return;
> +
> +        vaddr = unmask_page(cache->vaddr);
> +        if (cache->vaddr & KMAP) {
> +                if (cache->vaddr & CLFLUSH_AFTER)
> +                        mb();
> +
> +                kunmap_atomic(vaddr);
> +                i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm);
> +        } else {
> +                struct i915_ggtt *ggtt = cache_to_ggtt(cache);
> +
> +                intel_gt_flush_ggtt_writes(ggtt->vm.gt);
> +                io_mapping_unmap_atomic((void __iomem *)vaddr);
> +
> +                if (drm_mm_node_allocated(&cache->node)) {
> +                        ggtt->vm.clear_range(&ggtt->vm,
> +                                             cache->node.start,
> +                                             cache->node.size);
> +                        mutex_lock(&ggtt->vm.mutex);
> +                        drm_mm_remove_node(&cache->node);
> +                        mutex_unlock(&ggtt->vm.mutex);
> +                } else {
> +                        i915_vma_unpin((struct i915_vma *)cache->node.mm);
> +                }
> +        }
> +
> +        cache->vaddr = 0;
> +        cache->page = -1;
> +}
> +
> +static void *reloc_kmap(struct drm_i915_gem_object *obj,
> +                        struct reloc_cache *cache,
> +                        unsigned long pageno)
> +{
> +        void *vaddr;
> +        struct page *page;
> +
> +        if (cache->vaddr) {
> +                kunmap_atomic(unmask_page(cache->vaddr));
> +        } else {
> +                unsigned int flushes;
> +                int err;
> +
> +                err = i915_gem_object_prepare_write(obj, &flushes);
> +                if (err)
> +                        return ERR_PTR(err);
> +
> +                BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS);
> +                BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK);
> +
> +                cache->vaddr = flushes | KMAP;
> +                cache->node.mm = (void *)obj;
> +                if (flushes)
> +                        mb();
> +        }
> +
> +        page = i915_gem_object_get_page(obj, pageno);
> +        if (!obj->mm.dirty)
> +                set_page_dirty(page);
> +
> +        vaddr = kmap_atomic(page);
> +        cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr;
> +        cache->page = pageno;
> +
> +        return vaddr;
> +}
> +
> +static void *reloc_iomap(struct drm_i915_gem_object *obj,
> +                         struct reloc_cache *cache,
> +                         unsigned long page)
> +{
> +        struct i915_ggtt *ggtt = cache_to_ggtt(cache);
> +        unsigned long offset;
> +        void *vaddr;
> +
> +        if (cache->vaddr) {
> +                intel_gt_flush_ggtt_writes(ggtt->vm.gt);
> +                io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
> +        } else {
> +                struct i915_vma *vma;
> +                int err;
> +
> +                if (i915_gem_object_is_tiled(obj))
> +                        return ERR_PTR(-EINVAL);
> +
> +                if (use_cpu_reloc(cache, obj))
> +                        return NULL;
> +
> +                i915_gem_object_lock(obj);
> +                err = i915_gem_object_set_to_gtt_domain(obj, true);
> +                i915_gem_object_unlock(obj);
> +                if (err)
> +                        return ERR_PTR(err);
> +
> +                vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
> +                                               PIN_MAPPABLE |
> +                                               PIN_NONBLOCK /* NOWARN */ |
> +                                               PIN_NOEVICT);
> +                if (IS_ERR(vma)) {
> +                        memset(&cache->node, 0, sizeof(cache->node));
> +                        mutex_lock(&ggtt->vm.mutex);
> +                        err = drm_mm_insert_node_in_range
> +                                (&ggtt->vm.mm, &cache->node,
> +                                 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE,
> +                                 0, ggtt->mappable_end,
> +                                 DRM_MM_INSERT_LOW);
> +                        mutex_unlock(&ggtt->vm.mutex);
> +                        if (err) /* no inactive aperture space, use cpu reloc */
> +                                return NULL;
> +                } else {
> +                        cache->node.start = vma->node.start;
> +                        cache->node.mm = (void *)vma;
> +                }
> +        }
> +
> +        offset = cache->node.start;
> +        if (drm_mm_node_allocated(&cache->node)) {
> +                ggtt->vm.insert_page(&ggtt->vm,
> +                                     i915_gem_object_get_dma_address(obj, page),
> +                                     offset, I915_CACHE_NONE, 0);
> +        } else {
> +                offset += page << PAGE_SHIFT;
> +        }
> +
> +        vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap,
> +                                                         offset);
> +        cache->page = page;
> +        cache->vaddr = (unsigned long)vaddr;
> +
> +        return vaddr;
> +}
> +
> +static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> +                         struct reloc_cache *cache,
> +                         unsigned long page)
> +{
> +        void *vaddr;
> +
> +        if (cache->page == page) {
> +                vaddr = unmask_page(cache->vaddr);
> +        } else {
> +                vaddr = NULL;
> +                if ((cache->vaddr & KMAP) == 0)
> +                        vaddr = reloc_iomap(obj, cache, page);
> +                if (!vaddr)
> +                        vaddr = reloc_kmap(obj, cache, page);
> +        }
> +
> +        return vaddr;
> +}
> +
> +static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
> +{
> +        if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> +                if (flushes & CLFLUSH_BEFORE) {
> +                        clflushopt(addr);
> +                        mb();
> +                }
> +
> +                *addr = value;
> +
> +                /*
> +                 * Writes to the same cacheline are serialised by the CPU
> +                 * (including clflush). On the write path, we only require
> +                 * that it hits memory in an orderly fashion and place
> +                 * mb barriers at the start and end of the relocation phase
> +                 * to ensure ordering of clflush wrt to the system.
> +                 */
> +                if (flushes & CLFLUSH_AFTER)
> +                        clflushopt(addr);
> +        } else
> +                *addr = value;
> +}
> +
>  static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
>  {
>          struct drm_i915_gem_object *obj = vma->obj;
> @@ -1225,6 +1452,17 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
>          return cmd;
>  }
>  
> +static inline bool use_reloc_gpu(struct i915_vma *vma)
> +{
> +        if (DBG_FORCE_RELOC == FORCE_GPU_RELOC)
> +                return true;
> +
> +        if (DBG_FORCE_RELOC)
> +                return false;
> +
> +        return !dma_resv_test_signaled_rcu(vma->resv, true);
> +}
> +
>  static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
>  {
>          struct page *page;
> @@ -1239,10 +1477,10 @@ static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
>          return addr + offset_in_page(offset);
>  }
>  
> -static int __reloc_entry_gpu(struct i915_execbuffer *eb,
> -                             struct i915_vma *vma,
> -                             u64 offset,
> -                             u64 target_addr)
> +static bool __reloc_entry_gpu(struct i915_execbuffer *eb,
> +                              struct i915_vma *vma,
> +                              u64 offset,
> +                              u64 target_addr)
>  {
>          const unsigned int gen = eb->reloc_cache.gen;
>          unsigned int len;
> @@ -1258,7 +1496,7 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
>  
>          batch = reloc_gpu(eb, vma, len);
>          if (IS_ERR(batch))
> -                return PTR_ERR(batch);
> +                return false;
>  
>          addr = gen8_canonical_addr(vma->node.start + offset);
>          if (gen >= 8) {
> @@ -1307,21 +1545,55 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
>                  *batch++ = target_addr;
>          }
>  
> -        return 0;
> +        return true;
> +}
> +
> +static bool reloc_entry_gpu(struct i915_execbuffer *eb,
> +                            struct i915_vma *vma,
> +                            u64 offset,
> +                            u64 target_addr)
> +{
> +        if (eb->reloc_cache.vaddr)
> +                return false;
> +
> +        if (!use_reloc_gpu(vma))
> +                return false;
> +
> +        return __reloc_entry_gpu(eb, vma, offset, target_addr);
>  }
>  
>  static u64
> -relocate_entry(struct i915_execbuffer *eb,
> -               struct i915_vma *vma,
> +relocate_entry(struct i915_vma *vma,
>                 const struct drm_i915_gem_relocation_entry *reloc,
> +               struct i915_execbuffer *eb,
>                 const struct i915_vma *target)
>  {
>          u64 target_addr = relocation_target(reloc, target);
> -        int err;
> -
> -        err = __reloc_entry_gpu(eb, vma, reloc->offset, target_addr);
> -        if (err)
> -                return err;
> +        u64 offset = reloc->offset;
> +
> +        if (!reloc_entry_gpu(eb, vma, offset, target_addr)) {
> +                bool wide = eb->reloc_cache.use_64bit_reloc;
> +                void *vaddr;
> +
> +repeat:
> +                vaddr = reloc_vaddr(vma->obj,
> +                                    &eb->reloc_cache,
> +                                    offset >> PAGE_SHIFT);
> +                if (IS_ERR(vaddr))
> +                        return PTR_ERR(vaddr);
> +
> +                GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> +                clflush_write32(vaddr + offset_in_page(offset),
> +                                lower_32_bits(target_addr),
> +                                eb->reloc_cache.vaddr);
> +
> +                if (wide) {
> +                        offset += sizeof(u32);
> +                        target_addr >>= 32;
> +                        wide = false;
> +                        goto repeat;
> +                }
> +        }
>  
>          return target->node.start | UPDATE;
>  }
> @@ -1386,7 +1658,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
>           * If the relocation already has the right value in it, no
>           * more work needs to be done.
>           */
> -        if (gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
> +        if (!DBG_FORCE_RELOC &&
> +            gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
>                  return 0;
>  
>          /* Check that the relocation address is valid... */
> @@ -1418,7 +1691,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
>                  ev->flags &= ~EXEC_OBJECT_ASYNC;
>  
>          /* and update the user's relocation entry */
> -        return relocate_entry(eb, ev->vma, reloc, target->vma);
> +        return relocate_entry(ev->vma, reloc, eb, target->vma);
>  }
>  
>  static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
> @@ -1456,8 +1729,10 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
>                   * this is bad and so lockdep complains vehemently.
>                   */
>                  copied = __copy_from_user(r, urelocs, count * sizeof(r[0]));
> -                if (unlikely(copied))
> -                        return -EFAULT;
> +                if (unlikely(copied)) {
> +                        remain = -EFAULT;
> +                        goto out;
> +                }
>  
>                  remain -= count;
>                  do {
> @@ -1465,7 +1740,8 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
>  
>                          if (likely(offset == 0)) {
>                          } else if ((s64)offset < 0) {
> -                                return (int)offset;
> +                                remain = (int)offset;
> +                                goto out;
>                          } else {
>                                  /*
>                                   * Note that reporting an error now
> @@ -1495,8 +1771,9 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
>                  } while (r++, --count);
>                  urelocs += ARRAY_SIZE(stack);
>          } while (remain);
> -
> -        return 0;
> +out:
> +        reloc_cache_reset(&eb->reloc_cache);
> +        return remain;
>  }
>  
>  static int eb_relocate(struct i915_execbuffer *eb)
> @@ -2573,7 +2850,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>          eb.i915 = i915;
>          eb.file = file;
>          eb.args = args;
> -        if (!(args->flags & I915_EXEC_NO_RELOC))
> +        if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
>                  args->flags |= __EXEC_HAS_RELOC;
>  
>          eb.exec = exec;
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> index 57c14d3340cd..a49016f8ee0d 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -37,14 +37,20 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
>                  return err;
>  
>          /* 8-Byte aligned */
> -        err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0);
> -        if (err)
> +        if (!__reloc_entry_gpu(eb, vma,
> +                               offsets[0] * sizeof(u32),
> +                               0)) {
> +                err = -EIO;
>                  goto unpin_vma;
> +        }
>  
>          /* !8-Byte aligned */
> -        err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1);
> -        if (err)
> +        if (!__reloc_entry_gpu(eb, vma,
> +                               offsets[1] * sizeof(u32),
> +                               1)) {
> +                err = -EIO;
>                  goto unpin_vma;
> +        }
>  
>          /* Skip to the end of the cmd page */
>          i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> @@ -54,9 +60,12 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
>          eb->reloc_cache.rq_size += i;
>  
>          /* Force batch chaining */
> -        err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2);
> -        if (err)
> +        if (!__reloc_entry_gpu(eb, vma,
> +                               offsets[2] * sizeof(u32),
> +                               2)) {
> +                err = -EIO;
>                  goto unpin_vma;
> +        }
>  
>          GEM_BUG_ON(!eb->reloc_cache.rq);
>          rq = i915_request_get(eb->reloc_cache.rq);
> -- 
> 2.28.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx