As mentioned in the previous commit, reads and writes from both the CPU and GPU go through the LLC. This gives us coherency between the CPU and GPU irrespective of the attribute settings either device sets. We can use to avoid having to clflush even uncached memory. Except for the scanout. The scanout resides within another functional block that does not use the LLC but reads directly from main memory. So in order to maintain coherency with the scanout, writes to uncached memory must be flushed. In order to optimize writes elsewhere, we start tracking whether an framebuffer is attached to an object. v2: Use pin_display tracking rather than fb_count (to ensure we flush cursors as well etc) and only force the clflush along explicit writes to the scanout paths (i.e. pin_to_display_plane and pwrite into scanout). Based on a patch by Ville Syrjälä. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 58 ++++++++++++++++-------------- drivers/gpu/drm/i915/i915_gem_exec.c | 2 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- 5 files changed, 36 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3622ec2..1ffae08 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1863,7 +1863,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error) } void i915_gem_reset(struct drm_device *dev); -void i915_gem_clflush_object(struct drm_i915_gem_object *obj); +void i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); int __must_check i915_gem_object_set_domain(struct drm_i915_gem_object *obj, uint32_t read_domains, uint32_t write_domain); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c5e03ba..78535e9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -38,7 +38,8 @@ #include <linux/dma-buf.h> static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj); -static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj); +static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj, + bool force); static __must_check int i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj, struct i915_address_space *vm, @@ -68,6 +69,14 @@ static bool cpu_cache_is_coherent(struct drm_device *dev, return HAS_LLC(dev) || level != I915_CACHE_NONE; } +static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj) +{ + if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) + return true; + + return obj->pin_display; +} + static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj) { if (obj->tiling_mode) @@ -830,8 +839,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, * write domain and manually flush cachelines (if required). This * optimizes for the case when the gpu will use the data * right away and we therefore have to clflush anyway. */ - if (obj->cache_level == I915_CACHE_NONE) - needs_clflush_after = 1; + needs_clflush_after = cpu_write_needs_clflush(obj); if (i915_gem_obj_bound_any(obj)) { ret = i915_gem_object_set_to_gtt_domain(obj, true); if (ret) @@ -921,7 +929,7 @@ out: */ if (!needs_clflush_after && obj->base.write_domain != I915_GEM_DOMAIN_CPU) { - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, false); i915_gem_chipset_flush(dev); } } @@ -999,9 +1007,9 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, goto out; } - if (obj->cache_level == I915_CACHE_NONE && - obj->tiling_mode == I915_TILING_NONE && - obj->base.write_domain != I915_GEM_DOMAIN_CPU) { + if (obj->tiling_mode == I915_TILING_NONE && + obj->base.write_domain != I915_GEM_DOMAIN_CPU && + cpu_write_needs_clflush(obj)) { ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file); /* Note that the gtt paths might fail with non-page-backed user * pointers (e.g. gtt mappings when moving data between @@ -1350,8 +1358,8 @@ i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data, } /* Pinned buffers may be scanout, so flush the cache */ - if (obj->pin_count) - i915_gem_object_flush_cpu_write_domain(obj); + if (obj->pin_display) + i915_gem_object_flush_cpu_write_domain(obj, true); drm_gem_object_unreference(&obj->base); unlock: @@ -1721,7 +1729,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) * hope for the best. */ WARN_ON(ret != -EIO); - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, true); obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU; } @@ -3299,7 +3307,8 @@ err_unpin: } void -i915_gem_clflush_object(struct drm_i915_gem_object *obj) +i915_gem_clflush_object(struct drm_i915_gem_object *obj, + bool force) { /* If we don't have a page list set up, then we're not pinned * to GPU, and we can ignore the cache flush because it'll happen @@ -3323,7 +3332,7 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj) * snooping behaviour occurs naturally as the result of our domain * tracking. */ - if (obj->cache_level != I915_CACHE_NONE) + if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) return; trace_i915_gem_object_clflush(obj); @@ -3360,14 +3369,15 @@ i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) /** Flushes the CPU write domain for the object if it's dirty. */ static void -i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj) +i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj, + bool force) { uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) return; - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, force); i915_gem_chipset_flush(obj->base.dev); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; @@ -3401,7 +3411,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) if (ret) return ret; - i915_gem_object_flush_cpu_write_domain(obj); + i915_gem_object_flush_cpu_write_domain(obj, false); /* Serialise direct access to this object with the barriers for * coherent writes from the GPU, by effectively invalidating the @@ -3491,7 +3501,11 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, obj, cache_level); } - if (cache_level == I915_CACHE_NONE) { + list_for_each_entry(vma, &obj->vma_list, vma_link) + vma->node.color = cache_level; + obj->cache_level = cache_level; + + if (cpu_write_needs_clflush(obj)) { u32 old_read_domains, old_write_domain; /* If we're coming from LLC cached, then we haven't @@ -3514,9 +3528,6 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, old_write_domain); } - list_for_each_entry(vma, &obj->vma_list, vma_link) - vma->node.color = cache_level; - obj->cache_level = cache_level; i915_gem_verify_gtt(dev); return 0; } @@ -3633,7 +3644,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, if (ret) goto err_unpin_display; - i915_gem_object_flush_cpu_write_domain(obj); + i915_gem_object_flush_cpu_write_domain(obj, true); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; @@ -3705,8 +3716,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write) /* Flush the CPU cache if it's still invalid. */ if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) { - if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, false); obj->base.read_domains |= I915_GEM_DOMAIN_CPU; } @@ -3888,10 +3898,6 @@ i915_gem_pin_ioctl(struct drm_device *dev, void *data, obj->user_pin_count++; obj->pin_filp = file; - /* XXX - flush the CPU caches for pinned objects - * as the X server doesn't manage domains yet - */ - i915_gem_object_flush_cpu_write_domain(obj); args->offset = i915_gem_obj_ggtt_offset(obj); out: drm_gem_object_unreference(&obj->base); diff --git a/drivers/gpu/drm/i915/i915_gem_exec.c b/drivers/gpu/drm/i915/i915_gem_exec.c index d2ac077..6af0067 100644 --- a/drivers/gpu/drm/i915/i915_gem_exec.c +++ b/drivers/gpu/drm/i915/i915_gem_exec.c @@ -50,7 +50,7 @@ static int i915_gem_exec_flush_object(struct drm_i915_gem_object *obj, return ret; if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) { - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, false); i915_gem_chipset_flush(obj->base.dev); obj->base.write_domain &= ~I915_GEM_DOMAIN_CPU; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 8ccc29a..e999578 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -716,7 +716,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring, return ret; if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, false); flush_domains |= obj->base.write_domain; } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index d29b1db..89d8cc8 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -487,7 +487,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev) dev_priv->gtt.base.total / PAGE_SIZE); list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { - i915_gem_clflush_object(obj); + i915_gem_clflush_object(obj, obj->pin_display); i915_gem_gtt_bind_object(obj, obj->cache_level); } -- 1.8.4.rc1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx