On Tue, 2013-01-08 at 10:53 +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> 				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;
>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>
>  		/* Map the page containing the relocation we're going to perform. */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);
> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);

Discussed this already to some degree with Chris, but I still think the
first cache flush is redundant.
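Just to illustrate what I mean, a minimal (untested) sketch of this branch
with only the post-write flush kept, reusing the names from the patch above:

		} else {
			char *vaddr;

			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));

			/* write the reloc and push it out of the CPU cache */
			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
			drm_clflush_virt_range(vaddr + page_offset, 4);

			kunmap_atomic(vaddr);

			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
		}
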
> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>
> -#define __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);
> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
> 				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>
>  		obj = list_first_entry(objects,
> 				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  		/* Unbind any ill-fitting objects or pin. */
>  		list_for_each_entry(obj, objects, exec_list) {
>  			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -			bool need_fence, need_mappable;
> +			bool need_fence;
>
>  			if (!obj->gtt_space)
>  				continue;
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  				has_fenced_gpu_access &&
>  				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  				obj->tiling_mode != I915_TILING_NONE;
> -			need_mappable = need_fence || need_reloc_mappable(obj);
>
>  			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -			    (need_mappable && !obj->map_and_fenceable))
> +			    (need_fence && !obj->map_and_fenceable))
>  				ret = i915_gem_object_unbind(obj);
>  			else
>  				ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>
> -		flush_domains |= obj->base.write_domain;

Looks like an unnecessary move.

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}
>  	}
>
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
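
On the flush_domains reordering in i915_gem_execbuffer_move_to_gpu() above:
as far as I can see the new pending_write_domain handling doesn't depend on
that move, so keeping the original order should work just as well, e.g.
(untested sketch of the loop body):

		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
			i915_gem_clflush_object(obj);

		flush_domains |= obj->base.write_domain;

		/* Used as an internal marker during relocation processing */
		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
		}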