Flushing the cachelines for an object is slow; it can take as much as
100ms for a large framebuffer. We currently do this under the
struct_mutex BKL at execbuf or pageflip time. Now that we can add
fences to obj->resv for both flips and execbuf (and we naturally wait
on those fences before any CPU access), we can move the clflush
operation to a workqueue and signal a fence on completion, doing the
work asynchronously without blocking the driver or its clients.

Suggested-by: Akash Goel <akash.goel@xxxxxxxxx>
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Akash Goel <akash.goel@xxxxxxxxx>
---
 drivers/gpu/drm/i915/Makefile              |   1 +
 drivers/gpu/drm/i915/i915_drv.h            |   8 +-
 drivers/gpu/drm/i915/i915_gem.c            |  60 +++----------
 drivers/gpu/drm/i915/i915_gem_clflush.c    | 138 +++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   6 +-
 drivers/gpu/drm/i915/intel_display.c       |  57 ++++++------
 6 files changed, 190 insertions(+), 80 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_gem_clflush.c
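A note for reviewers: stripped of the i915 specifics, the new
i915_gem_clflush.c below boils down to wrapping the clflush in a work
item and exposing its completion as a dma_fence published on obj->resv,
so that the GPU (via i915_gem_request_await_object) and the CPU (via
the usual reservation-object waits) both serialise against the flush.
A condensed sketch of that pattern follows; names are abbreviated from
the patch, and the synchronous fallback and error handling are elided:

	/* Sketch only: a dma_fence whose lifetime is tied to a work item. */
	struct clflush_fence {
		struct dma_fence dma;		/* signalled when the flush is done */
		struct work_struct work;	/* runs drm_clflush_sg() off-thread */
		struct drm_i915_gem_object *obj;
	};

	static void clflush_work(struct work_struct *work)
	{
		struct clflush_fence *f =
			container_of(work, typeof(*f), work);

		drm_clflush_sg(f->obj->mm.pages);	/* the slow (up to ~100ms) part */
		i915_gem_object_put(f->obj);		/* drop the ref taken at creation */
		dma_fence_signal(&f->dma);		/* release all waiters */
		dma_fence_put(&f->dma);
	}

The fence is added to obj->resv as a shared fence while the reservation
ww_mutex is held, so anyone who subsequently inspects the object also
sees, and therefore waits upon, the pending flush.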
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0857e5035f4d..6afd402e440b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -29,6 +29,7 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o
 # GEM code
 i915-y += i915_cmd_parser.o \
 	  i915_gem_batch_pool.o \
+	  i915_gem_clflush.o \
 	  i915_gem_context.o \
 	  i915_gem_dmabuf.o \
 	  i915_gem_evict.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2754d5de76af..c80044267333 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3410,7 +3410,13 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
 
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
-bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+
+void i915_gem_clflush_init(struct drm_i915_private *i915);
+int i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+			    unsigned int flags);
+#define I915_CLFLUSH_FORCE BIT(0)
+#define I915_CLFLUSH_SYNC BIT(1)
+
 int __must_check i915_gem_init(struct drm_device *dev);
 int __must_check i915_gem_init_hw(struct drm_device *dev);
 void i915_gem_init_swizzling(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cffe60237b6a..524f72774537 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -230,7 +230,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj)
 
 	obj->mm.dirty = false;
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
-		i915_gem_clflush_object(obj, false);
+		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
@@ -1570,6 +1570,11 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 
 	mutex_unlock(&dev->struct_mutex);
 
+	if (err == 0)
+		err = i915_gem_object_wait(obj,
+					   I915_WAIT_INTERRUPTIBLE,
+					   MAX_SCHEDULE_TIMEOUT,
+					   NULL);
 	if (write_domain != 0)
 		intel_fb_obj_invalidate(obj, write_origin(obj, write_domain));
 
@@ -3236,44 +3241,6 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	return ret;
 }
 
-bool
-i915_gem_clflush_object(struct drm_i915_gem_object *obj,
-			bool force)
-{
-	/* If we don't have a page list set up, then we're not pinned
-	 * to GPU, and we can ignore the cache flush because it'll happen
-	 * again at bind time.
-	 */
-	if (!obj->mm.pages)
-		return false;
-
-	/*
-	 * Stolen memory is always coherent with the GPU as it is explicitly
-	 * marked as wc by the system, or the system is cache-coherent.
-	 */
-	if (obj->stolen || obj->phys_handle)
-		return false;
-
-	/* If the GPU is snooping the contents of the CPU cache,
-	 * we do not need to manually clear the CPU cache lines. However,
-	 * the caches are only snooped when the render cache is
-	 * flushed/invalidated. As we always have to emit invalidations
-	 * and flushes when moving into and out of the RENDER domain, correct
-	 * snooping behaviour occurs naturally as the result of our domain
-	 * tracking.
-	 */
-	if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
-		obj->cache_dirty = true;
-		return false;
-	}
-
-	trace_i915_gem_object_clflush(obj);
-	drm_clflush_sg(obj->mm.pages);
-	obj->cache_dirty = false;
-
-	return true;
-}
-
 /** Flushes the GTT write domain for the object if it's dirty. */
 static void
 i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
@@ -3317,9 +3284,7 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
 		return;
 
-	if (i915_gem_clflush_object(obj, obj->pin_display))
-		i915_gem_chipset_flush(to_i915(obj->base.dev));
-
+	i915_gem_clflush_object(obj, obj->pin_display ? I915_CLFLUSH_FORCE : 0);
 	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
 
 	obj->base.write_domain = 0;
@@ -3526,10 +3491,8 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	 * object is now coherent at its new cache level (with respect
 	 * to the access domain).
 	 */
-	if (obj->cache_dirty && cpu_write_needs_clflush(obj)) {
-		if (i915_gem_clflush_object(obj, true))
-			i915_gem_chipset_flush(to_i915(obj->base.dev));
-	}
+	if (obj->cache_dirty && cpu_write_needs_clflush(obj))
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 
 	return 0;
 }
@@ -3759,8 +3722,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 
 	/* Flush the CPU cache if it's still invalid. */
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
-		i915_gem_clflush_object(obj, false);
-
+		i915_gem_clflush_object(obj, 0);
 		obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 	}
 
@@ -4815,6 +4777,8 @@ int i915_gem_init(struct drm_device *dev)
 
 	mutex_lock(&dev->struct_mutex);
 
+	i915_gem_clflush_init(dev_priv);
+
 	if (!i915.enable_execlists) {
 		dev_priv->gt.resume = intel_legacy_submission_resume;
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
new file mode 100644
index 000000000000..aae2155a51a8
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "i915_drv.h"
+
+static DEFINE_SPINLOCK(clflush_lock);
+static u64 clflush_context;
+
+struct clflush_fence {
+	struct dma_fence dma;
+	struct work_struct work;
+	struct drm_i915_gem_object *obj;
+};
+
+static const char *i915_clflush_get_driver_name(struct dma_fence *fence)
+{
+	return "i915";
+}
+
+static const char *i915_clflush_get_timeline_name(struct dma_fence *fence)
+{
+	return "clflush";
+}
+
+static bool i915_clflush_enable_signaling(struct dma_fence *fence)
+{
+	return true;
+}
+
+static const struct dma_fence_ops i915_clflush_ops = {
+	.get_driver_name = i915_clflush_get_driver_name,
+	.get_timeline_name = i915_clflush_get_timeline_name,
+	.enable_signaling = i915_clflush_enable_signaling,
+	.wait = dma_fence_default_wait,
+};
+
+static bool cpu_cache_is_coherent(struct drm_device *dev,
+				  enum i915_cache_level level)
+{
+	return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
+
+static void i915_clflush_work(struct work_struct *work)
+{
+	struct clflush_fence *fence = container_of(work, typeof(*fence), work);
+
+	drm_clflush_sg(fence->obj->mm.pages);
+	i915_gem_object_put(fence->obj);
+
+	dma_fence_signal(&fence->dma);
+	dma_fence_put(&fence->dma);
+}
+
+int i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+			    unsigned int flags)
+{
+	struct clflush_fence *fence;
+
+	/* If we don't have a page list set up, then we're not pinned
+	 * to GPU, and we can ignore the cache flush because it'll happen
+	 * again at bind time.
+	 */
+	if (!obj->mm.pages)
+		return 0;
+
+	/*
+	 * Stolen memory is always coherent with the GPU as it is explicitly
+	 * marked as wc by the system, or the system is cache-coherent.
+	 */
+	if (obj->stolen || obj->phys_handle)
+		return 0;
+
+	/* If the GPU is snooping the contents of the CPU cache,
+	 * we do not need to manually clear the CPU cache lines. However,
+	 * the caches are only snooped when the render cache is
+	 * flushed/invalidated. As we always have to emit invalidations
+	 * and flushes when moving into and out of the RENDER domain, correct
+	 * snooping behaviour occurs naturally as the result of our domain
+	 * tracking.
+	 */
+	if (!(flags & I915_CLFLUSH_FORCE) &&
+	    cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
+		obj->cache_dirty = true;
+		return 0;
+	}
+
+	trace_i915_gem_object_clflush(obj);
+
+	fence = NULL;
+	ww_mutex_lock(&obj->resv->lock, NULL);
+	if (!(flags & I915_CLFLUSH_SYNC) &&
+	    reservation_object_reserve_shared(obj->resv) == 0)
+		fence = kmalloc(sizeof(*fence), GFP_KERNEL);
+	if (fence) {
+		dma_fence_init(&fence->dma,
+			       &i915_clflush_ops,
+			       &clflush_lock,
+			       clflush_context,
+			       0);
+
+		fence->obj = i915_gem_object_get(obj);
+		INIT_WORK(&fence->work, i915_clflush_work);
+		schedule_work(&fence->work);
+
+		reservation_object_add_shared_fence(obj->resv, &fence->dma);
+	} else {
+		drm_clflush_sg(obj->mm.pages);
+	}
+	ww_mutex_unlock(&obj->resv->lock);
+	obj->cache_dirty = false;
+	return 0;
+}
+
+void i915_gem_clflush_init(struct drm_i915_private *i915)
+{
+	clflush_context = dma_fence_context_alloc(1);
+}
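To make the flag semantics concrete, the three calling styles used by
this patch behave as follows (an illustrative snippet, not part of the
diff; the real call sites are in the hunks above and below):

	/* Flush asynchronously behind a fence on obj->resv, if the
	 * coherency rules require a flush at all:
	 */
	i915_gem_clflush_object(obj, 0);

	/* No fence is created; when a flush is needed, it is performed
	 * before the call returns (e.g. when releasing the backing pages):
	 */
	i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);

	/* Flush even if the object is considered coherent (LLC or
	 * snooped), e.g. when changing the cache level:
	 */
	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);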
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 55a8db2690c6..467725259f8c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1115,13 +1115,13 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
+		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+			i915_gem_clflush_object(obj, 0);
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
 			return ret;
-
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			i915_gem_clflush_object(obj, false);
 	}
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 6e7f8a47b862..8ee4e84dc1b8 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -14155,12 +14155,39 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 {
 	struct intel_atomic_state *intel_state =
 		to_intel_atomic_state(new_state->state);
-	struct drm_device *dev = plane->dev;
-	struct drm_i915_private *dev_priv = to_i915(dev);
+	struct drm_i915_private *dev_priv = to_i915(plane->dev);
 	struct drm_framebuffer *fb = new_state->fb;
 	struct drm_i915_gem_object *obj = intel_fb_obj(fb);
 	int ret;
 
+	if (obj) {
+		if (plane->type == DRM_PLANE_TYPE_CURSOR &&
+		    INTEL_INFO(dev_priv)->cursor_needs_physical) {
+			int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
+
+			ret = i915_gem_object_attach_phys(obj, align);
+			if (ret) {
+				DRM_DEBUG_KMS("failed to attach phys object\n");
+				return ret;
+			}
+		} else {
+			struct i915_vma *vma;
+
+			vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
+			if (IS_ERR(vma))
+				return PTR_ERR(vma);
+
+			to_intel_plane_state(new_state)->vma = vma;
+			if (to_intel_plane_state(plane->state)->vma != vma) {
+				struct intel_crtc_state *crtc_state;
+
+				crtc_state = intel_atomic_get_crtc_state(new_state->state,
+									 to_intel_crtc(new_state->crtc));
+				crtc_state->fb_changed = true;
+			}
+		}
+	}
+
 	if (plane->state->fb && old_plane_needs_modeset(plane, new_state)) {
 		struct drm_i915_gem_object *old_obj = intel_fb_obj(plane->state->fb);
 
@@ -14207,32 +14234,6 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
 	}
 
-	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
-	    INTEL_INFO(dev)->cursor_needs_physical) {
-		int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
-
-		ret = i915_gem_object_attach_phys(obj, align);
-		if (ret) {
-			DRM_DEBUG_KMS("failed to attach phys object\n");
-			return ret;
-		}
-	} else {
-		struct i915_vma *vma;
-
-		vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
-		if (IS_ERR(vma))
-			return PTR_ERR(vma);
-
-		to_intel_plane_state(new_state)->vma = vma;
-		if (to_intel_plane_state(plane->state)->vma != vma) {
-			struct intel_crtc_state *crtc_state;
-
-			crtc_state = intel_atomic_get_crtc_state(new_state->state,
-								 to_intel_crtc(new_state->crtc));
-			crtc_state->fb_changed = true;
-		}
-	}
-
 	return 0;
 }
 
-- 
2.10.2