Use the is_locked parameter in __i915_wait_request() to determine whether a
thread should be forced to back off and retry or whether it can continue
sleeping. Don't return -EIO from __i915_wait_request() since that is bad for
the upper layers; return only -EAGAIN to signify a reset in progress. Also,
use is_locked in the trace_i915_gem_request_wait_begin() tracepoint for a
more accurate reflection of the current thread's lock state.

Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_gem.c   | 66 ++++++++++++++++++++++++++++++---------
 drivers/gpu/drm/i915/i915_trace.h | 15 +++------
 2 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 340418b..3339365 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1245,7 +1245,6 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	unsigned long timeout_expire;
 	s64 before, now;
 	int ret;
-	int reset_in_progress = 0;
 
 	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
 
@@ -1262,28 +1261,67 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 		return -ENODEV;
 
 	/* Record current time in case interrupted by signal, or wedged */
-	trace_i915_gem_request_wait_begin(req);
+	trace_i915_gem_request_wait_begin(req, is_locked);
 	before = ktime_get_raw_ns();
 	for (;;) {
 		struct timer_list timer;
+		bool full_gpu_reset_completed_unlocked = false;
+		bool reset_in_progress_locked = false;
 
 		prepare_to_wait(&ring->irq_queue, &wait,
 				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 
-		/* We need to check whether any gpu reset happened in between
-		 * the caller grabbing the seqno and now ... */
-		reset_in_progress =
-			i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible);
+		/*
+		 * Rules for waiting with/without struct_mutex held in the face
+		 * of asynchronous TDR hang detection/recovery:
+		 *
+		 * ("reset in progress" = TDR has detected a hang; hang
+		 * recovery may or may not have commenced)
+		 *
+		 * 1. Is this thread holding the struct_mutex and is any of the
+		 * following true?
+		 *
+		 * a) Is any kind of reset in progress?
+		 * b) Has a full GPU reset happened while this thread was
+		 * sleeping?
+		 *
+		 * If so:
+		 * Return -EAGAIN. The caller should interpret this as:
+		 * Release struct_mutex, try to acquire struct_mutex
+		 * (through i915_mutex_lock_interruptible(), which will
+		 * fail as long as any reset is in progress) and retry the
+		 * call to __i915_wait_request(), which hopefully will go
+		 * better as soon as the hang has been resolved.
+		 *
+		 * 2. Is this thread not holding the struct_mutex and has a
+		 * full GPU reset completed?
+		 *
+		 * If so:
+		 * Return 0. Since the request has been purged there are no
+		 * requests left to wait for. Just go home.
+		 *
+		 * 3. Is this thread not holding the struct_mutex and is any
+		 * kind of reset in progress?
+		 *
+		 * If so:
+		 * This thread may keep on waiting.
+		 */
 
-		if ((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) ||
-		    reset_in_progress) {
+		reset_in_progress_locked =
+			(((i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible)) ||
+			  (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))) &&
+			 is_locked);
 
-			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
-			 * is truely gone. */
-			if (reset_in_progress)
-				ret = reset_in_progress;
-			else
-				ret = -EAGAIN;
+		if (reset_in_progress_locked) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		full_gpu_reset_completed_unlocked =
+			(reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter));
+
+		if (full_gpu_reset_completed_unlocked) {
+			ret = 0;
 			break;
 		}

diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index b970259..fe0524e 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -556,8 +556,8 @@ DEFINE_EVENT(i915_gem_request, i915_gem_request_complete,
 );
 
 TRACE_EVENT(i915_gem_request_wait_begin,
-	    TP_PROTO(struct drm_i915_gem_request *req),
-	    TP_ARGS(req),
+	    TP_PROTO(struct drm_i915_gem_request *req, bool blocking),
+	    TP_ARGS(req, blocking),
 
 	    TP_STRUCT__entry(
 			     __field(u32, dev)
@@ -566,25 +566,18 @@ TRACE_EVENT(i915_gem_request_wait_begin,
 			     __field(bool, blocking)
 			     ),
 
-	    /* NB: the blocking information is racy since mutex_is_locked
-	     * doesn't check that the current thread holds the lock. The only
-	     * other option would be to pass the boolean information of whether
-	     * or not the class was blocking down through the stack which is
-	     * less desirable.
-	     */
 	    TP_fast_assign(
 			   struct intel_engine_cs *ring =
					   i915_gem_request_get_ring(req);
 			   __entry->dev = ring->dev->primary->index;
 			   __entry->ring = ring->id;
 			   __entry->seqno = i915_gem_request_get_seqno(req);
-			   __entry->blocking =
-				     mutex_is_locked(&ring->dev->struct_mutex);
+			   __entry->blocking = blocking;
 			   ),
 
 	    TP_printk("dev=%u, ring=%u, seqno=%u, blocking=%s",
		      __entry->dev, __entry->ring,
-		      __entry->seqno, __entry->blocking ? "yes (NB)" : "no")
+		      __entry->seqno, __entry->blocking ? "yes" : "no")
 );
 
 DEFINE_EVENT(i915_gem_request, i915_gem_request_wait_end,
-- 
1.9.1
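
Aside for readers: below is a minimal, hypothetical sketch of the caller-side
pattern that rule 1 in the new comment prescribes for a waiter that holds
struct_mutex and gets -EAGAIN back. The function name is made up, and the
__i915_wait_request() argument list (the trailing is_locked flag, which this
series adds elsewhere, and the NULL timeout/rps arguments) is approximated
from the patch context rather than taken from this tree; treat it as an
illustration, not the exact signature.

static int example_wait_request_locked(struct drm_device *dev,
				       struct drm_i915_gem_request *req,
				       bool interruptible)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	unsigned reset_counter;
	int ret;

	for (;;) {
		/* Re-sample the reset counter before each wait attempt */
		reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);

		/* is_locked = true: we wait while holding struct_mutex */
		ret = __i915_wait_request(req, reset_counter, interruptible,
					  NULL, NULL, true /* is_locked */);
		if (ret != -EAGAIN)
			return ret;

		/* Reset in progress: back off so hang recovery can proceed */
		mutex_unlock(&dev->struct_mutex);

		/*
		 * Fails as long as any reset is still in progress; note that
		 * on failure we return to the caller without struct_mutex
		 * held, which the caller would have to account for.
		 */
		ret = i915_mutex_lock_interruptible(dev);
		if (ret)
			return ret;
	}
}

In practice most ioctl paths would simply propagate the -EAGAIN back to
userspace and let the ioctl be restarted; the explicit loop above only spells
out the unlock/relock/retry interpretation that the comment documents.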