From: John Harrison <John.C.Harrison@xxxxxxxxx> The scheduler has always tracked batch buffer dependencies based on DRM object usage. This means that it will not submit a batch on one ring that has outstanding dependencies still executing on other rings. This is exactly the same synchronisation performed by i915_gem_object_sync() using hardware semaphores where available and CPU stalls where not (e.g. in execlist mode and/or on Gen8 hardware). Unfortunately, when a batch buffer is submitted to the driver the _object_sync() call happens first. Thus in case where hardware semaphores are disabled, the driver has already stalled until the dependency has been resolved. This patch adds an optimisation to _object_sync() to ignore the synchronisation in the case where it will subsequently be handled by the scheduler. This removes the driver stall and (in the single application case) provides near hardware semaphore performance even when hardware semaphores are disabled. In a busy system where there is other work that can be executed on the stalling ring, it provides better than hardware semaphore performance as it removes the stall from both the driver and from the hardware. There is also a theory that this method should improve power usage as hardware semaphores are apparently not very power efficient - the stalled ring does not go into as low a power a state as when it is genuinely idle. The optimisation is to check whether both ends of the synchronisation are batch buffer requests. If they are, then the scheduler will have the inter-dependency tracked and managed. If one or other end is not a batch buffer request (e.g. a page flip) then the code falls back to the CPU stall or hardware semaphore as appropriate. To check whether the existing usage is a batch buffer, the code simply calls the 'are you tracking this request' function of the scheduler on the object's last_read_req member. To check whether the new usage is a batch buffer, a flag is passed in from the caller. Change-Id: Idc16e19b5a4dc8b3782ce9db44dd3df445f396c1 Issue: VIZ-5566 Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 19 +++++++++++++++---- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 2 +- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e230632..e4bef2c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2895,7 +2895,7 @@ int __must_check i915_mutex_lock_interruptible(struct drm_device *dev); #endif int i915_gem_object_sync(struct drm_i915_gem_object *obj, struct intel_engine_cs *to, - struct drm_i915_gem_request **to_req); + struct drm_i915_gem_request **to_req, bool to_batch); void i915_vma_move_to_active(struct i915_vma *vma, struct drm_i915_gem_request *req); int i915_gem_dumb_create(struct drm_file *file_priv, diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7308838..e0dca8c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3507,7 +3507,7 @@ static int __i915_gem_object_sync(struct drm_i915_gem_object *obj, struct intel_engine_cs *to, struct drm_i915_gem_request *from_req, - struct drm_i915_gem_request **to_req) + struct drm_i915_gem_request **to_req, bool to_batch) { struct intel_engine_cs *from; int ret; @@ -3519,6 +3519,15 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj, if (i915_gem_request_completed(from_req)) return 0; + /* + * The scheduler will manage inter-ring object dependencies + * as long as both to and from requests are scheduler managed + * (i.e. batch buffers). + */ + if (to_batch && + i915_scheduler_is_request_tracked(from_req, NULL, NULL)) + return 0; + if (!i915_semaphore_is_enabled(obj->base.dev)) { struct drm_i915_private *i915 = to_i915(obj->base.dev); ret = __i915_wait_request(from_req, @@ -3569,6 +3578,8 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj, * @to_req: request we wish to use the object for. See below. * This will be allocated and returned if a request is * required but not passed in. + * @to_batch: is the sync request on behalf of batch buffer submission? + * If so then the scheduler can (potentially) manage the synchronisation. * * This code is meant to abstract object synchronization with the GPU. * Calling with NULL implies synchronizing the object with the CPU @@ -3599,7 +3610,7 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj, int i915_gem_object_sync(struct drm_i915_gem_object *obj, struct intel_engine_cs *to, - struct drm_i915_gem_request **to_req) + struct drm_i915_gem_request **to_req, bool to_batch) { const bool readonly = obj->base.pending_write_domain == 0; struct drm_i915_gem_request *req[I915_NUM_RINGS]; @@ -3621,7 +3632,7 @@ i915_gem_object_sync(struct drm_i915_gem_object *obj, req[n++] = obj->last_read_req[i]; } for (i = 0; i < n; i++) { - ret = __i915_gem_object_sync(obj, to, req[i], to_req); + ret = __i915_gem_object_sync(obj, to, req[i], to_req, to_batch); if (ret) return ret; } @@ -4568,7 +4579,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, u32 old_read_domains, old_write_domain; int ret; - ret = i915_gem_object_sync(obj, pipelined, pipelined_request); + ret = i915_gem_object_sync(obj, pipelined, pipelined_request, false); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 5fd07ae..82d3975 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -910,7 +910,7 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req, struct drm_i915_gem_object *obj = vma->obj; if (obj->active & other_rings) { - ret = i915_gem_object_sync(obj, req->ring, &req); + ret = i915_gem_object_sync(obj, req->ring, &req, true); if (ret) return ret; } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 3cedddb..659c9d9 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -624,7 +624,7 @@ static int execlists_move_to_gpu(struct drm_i915_gem_request *req, struct drm_i915_gem_object *obj = vma->obj; if (obj->active & other_rings) { - ret = i915_gem_object_sync(obj, req->ring, &req); + ret = i915_gem_object_sync(obj, req->ring, &req, true); if (ret) return ret; } -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx