On Thu, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> When doing relocations, we have to obtain a mapping to the page
> containing the target address. This is either a kmap or iomap depending
> on GPU and its cache coherency. Neighbouring relocation entries are
> typically within the same page and so we can cache our kmapping between
> them and avoid those pesky TLB flushes.
> 
> Note that there is some sleight-of-hand in how the slow relocate works
> as the reloc_entry_cache implies pagefaults disabled (as we are inside a
> kmap_atomic section). However, the slow relocate code is meant to be the
> fallback from the atomic fast path failing. Fortunately it works as we
> already have performed the copy_from_user for the relocation array (no
> more pagefaults there) and the kmap_atomic cache is enabled after we
> have waited upon an active buffer (so no more sleeping in atomic).
> Magic!
> 

You could also mention that you mangle the relocation <-> page logic,
so this is not purely about caching. Or maybe even split it.

With the below comments addressed;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>

> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 160 +++++++++++++++++++----------
>  1 file changed, 106 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index a29c4b6fea28..318c71b663f4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -302,9 +302,50 @@ relocation_target(struct drm_i915_gem_relocation_entry *reloc,
>  	return gen8_canonical_addr((int)reloc->delta + target_offset);
>  }
> 
> +struct reloc_cache {
> +	void *vaddr;
> +	unsigned page;
> +	enum { KMAP, IOMAP } type;
> +};
> +
> +static void reloc_cache_init(struct reloc_cache *cache)
> +{
> +	cache->page = -1;
> +	cache->vaddr = NULL;
> +}
> +
> +static void reloc_cache_fini(struct reloc_cache *cache)
> +{
> +	if (cache->vaddr == NULL)
> +		return;
> +
> +	switch (cache->type) {
> +	case KMAP: kunmap_atomic(cache->vaddr); break;
> +	case IOMAP: io_mapping_unmap_atomic(cache->vaddr); break;
> +	}
> +}
> +
> +static void *reloc_kmap(struct drm_i915_gem_object *obj,
> +			struct reloc_cache *cache,
> +			int page)
> +{
> +	if (cache->page == page)
> +		return cache->vaddr;
> +
> +	if (cache->vaddr)
> +		kunmap_atomic(cache->vaddr);

Maybe add a GEM_BUG_ON(cache->type != KMAP) here before running
kunmap_atomic? Because that assumption is made.
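Just to illustrate what I mean (a sketch only, not part of the patch;
the body below is the one you already have, with the assert as the only
addition):

static void *reloc_kmap(struct drm_i915_gem_object *obj,
			struct reloc_cache *cache,
			int page)
{
	if (cache->page == page)
		return cache->vaddr;

	if (cache->vaddr) {
		/* Tearing down the cached mapping assumes it was a kmap */
		GEM_BUG_ON(cache->type != KMAP);
		kunmap_atomic(cache->vaddr);
	}

	cache->page = page;
	cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
	cache->type = KMAP;

	return cache->vaddr;
}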
> +
> +	cache->page = page;
> +	cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> +	cache->type = KMAP;
> +
> +	return cache->vaddr;
> +}
> +
>  static int
>  relocate_entry_cpu(struct drm_i915_gem_object *obj,
>  		   struct drm_i915_gem_relocation_entry *reloc,
> +		   struct reloc_cache *cache,
>  		   uint64_t target_offset)
>  {
>  	struct drm_device *dev = obj->base.dev;
> @@ -317,34 +358,47 @@ relocate_entry_cpu(struct drm_i915_gem_object *obj,
>  	if (ret)
>  		return ret;
> 
> -	vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				reloc->offset >> PAGE_SHIFT));
> +	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
>  	*(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta);
> 
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> -		if (page_offset == 0) {
> -			kunmap_atomic(vaddr);
> -			vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				(reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> +	if (INTEL_GEN(dev) >= 8) {
> +		page_offset += sizeof(uint32_t);
> +		if (page_offset == PAGE_SIZE) {
> +			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> +			page_offset = 0;
>  		}
> -
>  		*(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta);
>  	}
> 
> -	kunmap_atomic(vaddr);
> -
>  	return 0;
>  }
> 
> +static void *reloc_iomap(struct drm_i915_private *i915,
> +			 struct reloc_cache *cache,
> +			 uint64_t offset)
> +{
> +	if (cache->page == offset >> PAGE_SHIFT)
> +		return cache->vaddr;
> +
> +	if (cache->vaddr)
> +		io_mapping_unmap_atomic(cache->vaddr);
> +
> +	cache->page = offset >> PAGE_SHIFT;
> +	cache->vaddr =
> +		io_mapping_map_atomic_wc(i915->ggtt.mappable,
> +					 offset & PAGE_MASK);
> +	cache->type = IOMAP;
> +
> +	return cache->vaddr;
> +}
> +
>  static int
>  relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  		   struct drm_i915_gem_relocation_entry *reloc,
> +		   struct reloc_cache *cache,
>  		   uint64_t target_offset)
>  {
>  	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> -	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  	struct i915_vma *vma;
>  	uint64_t delta = relocation_target(reloc, target_offset);
>  	uint64_t offset;
> @@ -366,28 +420,19 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  	/* Map the page containing the relocation we're going to perform. */
>  	offset = vma->node.start;
>  	offset += reloc->offset;
> -	reloc_page = io_mapping_map_atomic_wc(ggtt->mappable,
> -					      offset & PAGE_MASK);
> +	reloc_page = reloc_iomap(dev_priv, cache, offset);
>  	iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset));
> 
>  	if (INTEL_GEN(dev_priv) >= 8) {
>  		offset += sizeof(uint32_t);
> -
> -		if (offset_in_page(offset) == 0) {
> -			io_mapping_unmap_atomic(reloc_page);
> -			reloc_page =
> -				io_mapping_map_atomic_wc(ggtt->mappable,
> -							 offset);
> -		}
> -
> +		if (offset_in_page(offset) == 0)
> +			reloc_page = reloc_iomap(dev_priv, cache, offset);
>  		iowrite32(upper_32_bits(delta),
>  			  reloc_page + offset_in_page(offset));
>  	}
> 
> -	io_mapping_unmap_atomic(reloc_page);
> -
>  unpin:
> -	i915_vma_unpin(vma);
> +	__i915_vma_unpin(vma);
>  	return ret;
>  }
> 
> @@ -403,6 +448,7 @@ clflush_write32(void *addr, uint32_t value)
>  static int
>  relocate_entry_clflush(struct drm_i915_gem_object *obj,
>  		       struct drm_i915_gem_relocation_entry *reloc,
> +		       struct reloc_cache *cache,

For consistency with the rest of the patch, I would put the cache as the
last argument, as it could easily be removed again in the future, so it's
the least important one.
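I.e. something along these lines (sketch only), with relocate_entry_cpu()
and relocate_entry_gtt() reordered the same way and the call sites in
i915_gem_execbuffer_relocate_entry() updated to match:

static int
relocate_entry_clflush(struct drm_i915_gem_object *obj,
		       struct drm_i915_gem_relocation_entry *reloc,
		       uint64_t target_offset,
		       struct reloc_cache *cache)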
>  		       uint64_t target_offset)
>  {
>  	struct drm_device *dev = obj->base.dev;
> @@ -415,24 +461,18 @@ relocate_entry_clflush(struct drm_i915_gem_object *obj,
>  	if (ret)
>  		return ret;
> 
> -	vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				reloc->offset >> PAGE_SHIFT));
> +	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
>  	clflush_write32(vaddr + page_offset, lower_32_bits(delta));
> 
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> -		if (page_offset == 0) {
> -			kunmap_atomic(vaddr);
> -			vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				(reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> +	if (INTEL_GEN(dev) >= 8) {
> +		page_offset += sizeof(uint32_t);
> +		if (page_offset == PAGE_SIZE) {
> +			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> +			page_offset = 0;
>  		}
> -
>  		clflush_write32(vaddr + page_offset, upper_32_bits(delta));
>  	}
> 
> -	kunmap_atomic(vaddr);
> -
>  	return 0;
>  }
> 
> @@ -453,7 +493,8 @@ static bool object_is_idle(struct drm_i915_gem_object *obj)
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_vmas *eb,
> -				   struct drm_i915_gem_relocation_entry *reloc)
> +				   struct drm_i915_gem_relocation_entry *reloc,
> +				   struct reloc_cache *cache)
>  {
>  	struct drm_device *dev = obj->base.dev;
>  	struct drm_gem_object *target_obj;
> @@ -537,11 +578,11 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
> 
>  	if (use_cpu_reloc(obj))
> -		ret = relocate_entry_cpu(obj, reloc, target_offset);
> +		ret = relocate_entry_cpu(obj, reloc, cache, target_offset);
>  	else if (obj->map_and_fenceable)
> -		ret = relocate_entry_gtt(obj, reloc, target_offset);
> +		ret = relocate_entry_gtt(obj, reloc, cache, target_offset);
>  	else if (static_cpu_has(X86_FEATURE_CLFLUSH))
> -		ret = relocate_entry_clflush(obj, reloc, target_offset);
> +		ret = relocate_entry_clflush(obj, reloc, cache, target_offset);
>  	else {
>  		WARN_ONCE(1, "Impossible case in relocation handling\n");
>  		ret = -ENODEV;
> @@ -564,9 +605,11 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
>  	struct drm_i915_gem_relocation_entry __user *user_relocs;
>  	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -	int remain, ret;
> +	struct reloc_cache cache;
> +	int remain, ret = 0;
> 
>  	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> +	reloc_cache_init(&cache);
> 
>  	remain = entry->relocation_count;
>  	while (remain) {
> @@ -576,19 +619,23 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  		count = ARRAY_SIZE(stack_reloc);
>  		remain -= count;
> 
> -		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0])))
> -			return -EFAULT;
> +		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> 
>  		do {
>  			u64 offset = r->presumed_offset;
> 
> -			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
> +			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
>  			if (ret)
> -				return ret;
> +				goto out;
> 
>  			if (r->presumed_offset != offset &&
> -			    __put_user(r->presumed_offset, &user_relocs->presumed_offset)) {
> -				return -EFAULT;
> +			    __put_user(r->presumed_offset,
> +				       &user_relocs->presumed_offset)) {
> +				ret = -EFAULT;
> +				goto out;
>  			}
> 
>  			user_relocs++;
> @@ -596,7 +643,9 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  		} while (--count);
>  	}
> 
> -	return 0;
> +out:
> +	reloc_cache_fini(&cache);

Add an empty line (<NL>) here, to keep this consistent with your next
change.
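I.e. (sketch, a blank line between the cleanup and the return):

out:
	reloc_cache_fini(&cache);

	return ret;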
Regards, Joonas

> +	return ret;
>  #undef N_RELOC
>  }
> 
> @@ -606,15 +655,18 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
>  				      struct drm_i915_gem_relocation_entry *relocs)
>  {
>  	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -	int i, ret;
> +	struct reloc_cache cache;
> +	int i, ret = 0;
> 
> +	reloc_cache_init(&cache);
>  	for (i = 0; i < entry->relocation_count; i++) {
> -		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
> +		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
>  		if (ret)
> -			return ret;
> +			break;
>  	}
> +	reloc_cache_fini(&cache);
> 
> -	return 0;
> +	return ret;
>  }
> 
>  static int

-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx