On Wed, Jan 14, 2015 at 11:20:56AM +0000, Chris Wilson wrote: > If the batch buffer is too large to fit into the aperture and we need a > GTT mapping for relocations, we currently fail. This only applies to a > subset of machines for a subset of environments, quite undesirable. We > can simply check after failing to insert the batch into the GTT as to > whether we only need a mappable binding for relocation and, if so, we can > revert to using a non-mappable binding and an alternate relocation > method. However, using relocate_entry_cpu() is excruciatingly slow for > large buffers on non-LLC as the entire buffer requires clflushing before > and after the relocation handling. Alternatively, we can implement a > third relocation method that only clflushes around the relocation entry. > This is still slower than updating through the GTT, so we prefer using > the GTT where possible, but is orders of magnitude faster as we > typically do not have to then clflush the entire buffer. > > An alternative idea of using a temporary WC mapping of the backing store > is promising (it should be faster than using the GTT itself), but > requires fairly extensive arch/x86 support - along the lines of > kmap_atomic_prof_pfn() (which is not universally implemented even for > x86). > > Testcase: igt/gem_exec_big #pnv,byt > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88392 > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 84 +++++++++++++++++++++++++----- > 1 file changed, 72 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > index e3ef17783765..06bdf7670584 100644 > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > @@ -251,7 +251,6 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj) > { > return (HAS_LLC(obj->base.dev) || > obj->base.write_domain == I915_GEM_DOMAIN_CPU || > - !obj->map_and_fenceable || > obj->cache_level != I915_CACHE_NONE); > } > > @@ -337,6 +336,51 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj, > return 0; > } > > +static void > +clflush_write32(void *addr, uint32_t value) > +{ > + /* This is not a fast path, so KISS. */ > + drm_clflush_virt_range(addr, sizeof(uint32_t)); > + *(uint32_t *)addr = value; > + drm_clflush_virt_range(addr, sizeof(uint32_t)); > +} > + > +static int > +relocate_entry_clflush(struct drm_i915_gem_object *obj, > + struct drm_i915_gem_relocation_entry *reloc, > + uint64_t target_offset) > +{ > + struct drm_device *dev = obj->base.dev; > + uint32_t page_offset = offset_in_page(reloc->offset); > + uint64_t delta = (int)reloc->delta + target_offset; > + char *vaddr; > + int ret; > + > + ret = i915_gem_object_set_to_gtt_domain(obj, true); > + if (ret) > + return ret; > + > + vaddr = kmap_atomic(i915_gem_object_get_page(obj, > + reloc->offset >> PAGE_SHIFT)); > + clflush_write32(vaddr + page_offset, lower_32_bits(delta)); > + > + if (INTEL_INFO(dev)->gen >= 8) { > + page_offset = offset_in_page(page_offset + sizeof(uint32_t)); > + > + if (page_offset == 0) { > + kunmap_atomic(vaddr); > + vaddr = kmap_atomic(i915_gem_object_get_page(obj, > + (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT)); > + } > + > + clflush_write32(vaddr + page_offset, upper_32_bits(delta)); > + } > + > + kunmap_atomic(vaddr); > + > + return 0; > +} > + > static int > i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, > struct eb_vmas *eb, > @@ -426,9 +470,12 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, > > if (use_cpu_reloc(obj)) > ret = relocate_entry_cpu(obj, reloc, target_offset); > - else > + else if (obj->map_and_fenceable) > ret = relocate_entry_gtt(obj, reloc, target_offset); > - > + else if (cpu_has_clflush) > + ret = relocate_entry_clflush(obj, reloc, target_offset); > + else > + ret = -ENODEV; I think a DRM_ERROR here would be good since this should never happen. And why don't we have an errno for -EKERNELBUG ... > if (ret) > return ret; > > @@ -525,6 +572,12 @@ i915_gem_execbuffer_relocate(struct eb_vmas *eb) > return ret; > } > > +static bool only_mappable_for_reloc(unsigned int flags) > +{ > + return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) == > + __EXEC_OBJECT_NEEDS_MAP; > +} I'm slightly freaked out by us encoding this here without making it explicit in the flags. But I couldn't come up with a better idea? > + > static int > i915_gem_execbuffer_reserve_vma(struct i915_vma *vma, > struct intel_engine_cs *ring, > @@ -536,14 +589,21 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma, > int ret; > > flags = 0; > - if (entry->flags & __EXEC_OBJECT_NEEDS_MAP) > - flags |= PIN_GLOBAL | PIN_MAPPABLE; > - if (entry->flags & EXEC_OBJECT_NEEDS_GTT) > - flags |= PIN_GLOBAL; > - if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) > - flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; > + if (!drm_mm_node_allocated(&vma->node)) { > + if (entry->flags & __EXEC_OBJECT_NEEDS_MAP) > + flags |= PIN_GLOBAL | PIN_MAPPABLE; > + if (entry->flags & EXEC_OBJECT_NEEDS_GTT) > + flags |= PIN_GLOBAL; > + if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) > + flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; > + } Hm, aren't we always calling reserve un buffers we know we've just unbound? Keeping the flags computation would at least be a good selfcheck about the consistency of our placing decisions, so I'd like to keep it. > > ret = i915_gem_object_pin(obj, vma->vm, entry->alignment, flags); > + if ((ret == -ENOSPC || ret == -E2BIG) && > + only_mappable_for_reloc(entry->flags)) > + ret = i915_gem_object_pin(obj, vma->vm, > + entry->alignment, > + flags & ~(PIN_GLOBAL | PIN_MAPPABLE)); > if (ret) > return ret; > > @@ -605,13 +665,13 @@ eb_vma_misplaced(struct i915_vma *vma) > vma->node.start & (entry->alignment - 1)) > return true; > > - if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable) > - return true; > - > if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS && > vma->node.start < BATCH_OFFSET_BIAS) > return true; > > + if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable) > + return !only_mappable_for_reloc(entry->flags); Hm, this seems to contradict your commit message a bit since it'll result in a behavioural change of what we try to push into mappable for relocs. Shouldn't we instead just clear the NEEDS_MAP flag in reserve_vma when the mappable pin fails and we fall back? Besides the nitpicks on integration lgtm. -Daniel > + > return false; > } > > -- > 2.1.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx