If we're moving a bunch of buffers from the CPU domain to the GPU domain, and we've already blown out the entire cache via a wbinvd, there is nothing more to do. With this and the previous patches, I am seeing a 3x FPS increase on a certain benchmark which uses a giant 2d array texture. Unless I missed something in the code, it should only affect non-LLC i915 platforms. I haven't yet run any numbers for other benchmarks, nor have I attempted to check if various conformance tests still pass. v2: Rewrite the patch to be i915 only. Obtain whether or not we wbinvd up front. Signed-off-by: Ben Widawsky <ben@xxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 8 ++++++++ drivers/gpu/drm/i915/i915_gem.c | 11 +++++------ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 ++++++++++++++++---- drivers/gpu/drm/i915/intel_lrc.c | 10 ++++++++-- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 90ff6aa..5d2f62d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1643,6 +1643,7 @@ struct i915_workarounds { struct eb_vmas { struct list_head vmas; + bool do_wbinvd; int and; union { struct i915_vma *lut[0]; @@ -1913,6 +1914,8 @@ struct drm_i915_private { void (*stop_ring)(struct intel_engine_cs *ring); } gt; + size_t wbinvd_threshold; + uint32_t request_uniq; /* @@ -2810,6 +2813,11 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv) void i915_gem_reset(struct drm_device *dev); bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); +static inline bool cpu_cache_is_coherent(struct drm_device *dev, + enum i915_cache_level level) +{ + return HAS_LLC(dev) || level != I915_CACHE_NONE; +} int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj); int __must_check i915_gem_init(struct drm_device *dev); int i915_gem_init_rings(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/i915_gem.c 
b/drivers/gpu/drm/i915/i915_gem.c index fc81889..5bfb332 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -61,12 +61,6 @@ static int i915_gem_shrinker_oom(struct notifier_block *nb, void *ptr); static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv); -static bool cpu_cache_is_coherent(struct drm_device *dev, - enum i915_cache_level level) -{ - return HAS_LLC(dev) || level != I915_CACHE_NONE; -} - static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj) { if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) @@ -4878,6 +4872,11 @@ int i915_gem_init(struct drm_device *dev) dev_priv->gt.stop_ring = intel_logical_ring_stop; } + dev_priv->wbinvd_threshold = boot_cpu_data.x86_cache_size << 10; + /* Pick a high default in the unlikely case we got nothing */ + if (!dev_priv->wbinvd_threshold) + dev_priv->wbinvd_threshold = (8 << 20); + ret = i915_gem_init_userptr(dev); if (ret) goto out_unlock; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 13ed13e..56f9268 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -50,7 +50,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args) unsigned size = args->buffer_count; size *= sizeof(struct i915_vma *); size += sizeof(struct eb_vmas); - eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); + eb = kzalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); } if (eb == NULL) { @@ -78,6 +78,7 @@ eb_reset(struct eb_vmas *eb) { if (eb->and >= 0) memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head)); + eb->do_wbinvd = false; } static int @@ -154,6 +155,11 @@ eb_lookup_vmas(struct eb_vmas *eb, hlist_add_head(&vma->exec_node, &eb->buckets[handle & eb->and]); } + + if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold && + obj->base.write_domain & I915_GEM_DOMAIN_CPU && + !cpu_cache_is_coherent(obj->base.dev, 
obj->cache_level)) + eb->do_wbinvd = true; ++i; } @@ -826,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, struct list_head *vmas = &eb->vmas; struct i915_vma *vma; uint32_t flush_domains = 0; - bool flush_chipset = false; + bool flush_chipset = eb->do_wbinvd; int ret; list_for_each_entry(vma, vmas, exec_list) { @@ -835,12 +841,18 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, if (ret) return ret; + flush_domains |= obj->base.write_domain; + + if (eb->do_wbinvd) + continue; + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) flush_chipset |= i915_gem_clflush_object(obj, false); - - flush_domains |= obj->base.write_domain; } + if (eb->do_wbinvd) + wbinvd(); + if (flush_chipset) i915_gem_chipset_flush(ring->dev); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 03741f9..16ca4a2 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -586,12 +586,18 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, if (ret) return ret; + flush_domains |= obj->base.write_domain; + + if (eb->do_wbinvd) + continue; + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) i915_gem_clflush_object(obj, false); - - flush_domains |= obj->base.write_domain; } + if (eb->do_wbinvd) + wbinvd(); + if (flush_domains & I915_GEM_DOMAIN_GTT) wmb(); -- 2.3.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx