From: Tomas Elf <tomas.elf@xxxxxxxxx> Use is_locked parameter in __i915_wait_request() to determine if a thread should be forced to back off and retry or if it can continue sleeping. Don't return -EIO from __i915_wait_request since that is bad for the upper layers, only -EAGAIN to signify reset in progress. (unless the driver is terminally wedged, in which case there's no mode of recovery left to attempt) Also, use is_locked in trace_i915_gem_request_wait_begin() trace point for more accurate reflection of current thread's lock state. Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem.c | 88 ++++++++++++++++++++++++++++++++------- drivers/gpu/drm/i915/i915_trace.h | 15 ++----- 2 files changed, 78 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7122315..80df5b5 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1287,7 +1287,6 @@ int __i915_wait_request(struct drm_i915_gem_request *req, unsigned long timeout_expire; s64 before, now; int ret; - int reset_in_progress = 0; WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled"); @@ -1312,7 +1311,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req, gen6_rps_boost(dev_priv, rps, req->emitted_jiffies); /* Record current time in case interrupted by signal, or wedged */ - trace_i915_gem_request_wait_begin(req); + trace_i915_gem_request_wait_begin(req, is_locked); before = ktime_get_raw_ns(); /* Optimistic spin for the next jiffie before touching IRQs */ @@ -1327,23 +1326,84 @@ int __i915_wait_request(struct drm_i915_gem_request *req, for (;;) { struct timer_list timer; + bool full_gpu_reset_completed_unlocked = false; + bool reset_in_progress_locked = false; prepare_to_wait(&ring->irq_queue, &wait, state); - /* We need to check whether any gpu reset happened in between - * the caller grabbing 
the seqno and now ... */ - reset_in_progress = - i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible); + /* + * Rules for waiting with/without struct_mutex held during + * asynchronous TDR hang detection/recovery: + * + * ("reset in progress" = TDR has detected a hang, hang + * recovery may or may not have commenced) + * + * 1. Is the driver terminally wedged? If so, return -EIO since + * there is no point in having the caller retry - the driver + * is irrecoverably stuck. + * + * 2. Is this thread holding the struct_mutex and is any of the + * following true? + * + * a) Is any kind of reset in progress? + * b) Has a full GPU reset happened while this thread was + * sleeping? + * + * If so: + * Return -EAGAIN. The caller should interpret this as: + * Release struct_mutex, try to acquire struct_mutex + * (through i915_mutex_lock_interruptible(), which will + * fail as long as any reset is in progress) and retry the + * call to __i915_wait_request(), which hopefully will go + * better as soon as the hang has been resolved. + * + * 3. Is this thread not holding the struct_mutex and has a + * full GPU reset completed? (that is, the reset count has + * changed but there is currently no full GPU reset in + * progress?) + * + * If so: + * Return 0. Since the request has been purged there are no + * requests left to wait for. Just go home. + * + * 4. Is this thread not holding the struct_mutex and is any + * kind of reset in progress? + * + * If so: + * This thread may keep on waiting. + */ - if ((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) || - reset_in_progress) { + if (i915_terminally_wedged(&dev_priv->gpu_error)) { + ret = -EIO; + break; + } - /* ... but upgrade the -EAGAIN to an -EIO if the gpu - * is truely gone. 
*/ - if (reset_in_progress) - ret = reset_in_progress; - else - ret = -EAGAIN; + reset_in_progress_locked = + (((i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible)) || + (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))) && + is_locked); + + if (reset_in_progress_locked) { + /* + * If the caller is holding the struct_mutex throw them + * out since TDR needs access to it. + */ + ret = -EAGAIN; + break; + } + + full_gpu_reset_completed_unlocked = + (((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) && + (!i915_reset_in_progress(&dev_priv->gpu_error)))); + + if (full_gpu_reset_completed_unlocked) { + /* + * Full GPU reset without holding the struct_mutex has + * completed - just return. If recovery is still in + * progress the thread will keep on sleeping until + * recovery is complete. + */ + ret = 0; break; } diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 5c15d43..7dcac93 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -591,8 +591,8 @@ DEFINE_EVENT(i915_gem_request, i915_gem_request_complete, ); TRACE_EVENT(i915_gem_request_wait_begin, - TP_PROTO(struct drm_i915_gem_request *req), - TP_ARGS(req), + TP_PROTO(struct drm_i915_gem_request *req, bool blocking), + TP_ARGS(req, blocking), TP_STRUCT__entry( __field(u32, dev) @@ -601,25 +601,18 @@ TRACE_EVENT(i915_gem_request_wait_begin, __field(bool, blocking) ), - /* NB: the blocking information is racy since mutex_is_locked - * doesn't check that the current thread holds the lock. The only - * other option would be to pass the boolean information of whether - * or not the class was blocking down through the stack which is - * less desirable. 
- */ TP_fast_assign( struct intel_engine_cs *ring = i915_gem_request_get_ring(req); __entry->dev = ring->dev->primary->index; __entry->ring = ring->id; __entry->seqno = i915_gem_request_get_seqno(req); - __entry->blocking = - mutex_is_locked(&ring->dev->struct_mutex); + __entry->blocking = blocking; ), TP_printk("dev=%u, ring=%u, seqno=%u, blocking=%s", __entry->dev, __entry->ring, - __entry->seqno, __entry->blocking ? "yes (NB)" : "no") + __entry->seqno, __entry->blocking ? "yes" : "no") ); DEFINE_EVENT(i915_gem_request, i915_gem_request_wait_end, -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx