Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > I915_RESET_IN_PROGRESS is being used for both signaling the requirement > to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and > to instruct a waiter (already holding the struct_mutex) to perform the > reset. To allow for a little more coordination, split these two meaning > into a couple of distinct flags. I915_RESET_BACKOFF tells > i915_mutex_lock_interruptible() not to acquire the mutex and > I915_RESET_HANDOFF tells the waiter to call i915_reset(). > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> + Acked-by: Michel Thierry <michel.thierry@xxxxxxxxx> from earlier version. -Mika > --- > drivers/gpu/drm/i915/i915_debugfs.c | 16 +++++----- > drivers/gpu/drm/i915/i915_drv.c | 7 +++-- > drivers/gpu/drm/i915/i915_drv.h | 40 +++++++++++++++++++++--- > drivers/gpu/drm/i915/i915_gem.c | 5 +-- > drivers/gpu/drm/i915/i915_gem_request.c | 2 +- > drivers/gpu/drm/i915/i915_irq.c | 38 ++++------------------ > drivers/gpu/drm/i915/intel_display.c | 4 +-- > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++---- > 8 files changed, 71 insertions(+), 59 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 9db6b041a799..5fdf8c137235 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > enum intel_engine_id id; > > if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) > - seq_printf(m, "Wedged\n"); > - if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags)) > - seq_printf(m, "Reset in progress\n"); > + seq_puts(m, "Wedged\n"); > + if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) > + seq_puts(m, "Reset in progress: struct_mutex backoff\n"); > + if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags)) > + seq_puts(m, "Reset in progress: reset handoff to waiter\n"); > if (waitqueue_active(&dev_priv->gpu_error.wait_queue)) > - seq_printf(m, "Waiter holding struct mutex\n"); > + seq_puts(m, "Waiter holding struct mutex\n"); > if (waitqueue_active(&dev_priv->gpu_error.reset_queue)) > - seq_printf(m, "struct_mutex blocked for reset\n"); > + seq_puts(m, "struct_mutex blocked for reset\n"); > > if (!i915.enable_hangcheck) { > - seq_printf(m, "Hangcheck disabled\n"); > + seq_puts(m, "Hangcheck disabled\n"); > return 0; > } > > @@ -4121,7 +4123,7 @@ i915_wedged_set(void *data, u64 val) > * while it is writing to 'i915_wedged' > */ > > - if (i915_reset_in_progress(&dev_priv->gpu_error)) > + if (i915_reset_backoff(&dev_priv->gpu_error)) > return -EAGAIN; > > i915_handle_error(dev_priv, val, > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index 9164167cd147..be3c81221d11 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv) > int ret; > > lockdep_assert_held(&dev_priv->drm.struct_mutex); > + GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); > > - if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags)) > + if (!test_bit(I915_RESET_HANDOFF, &error->flags)) > return; > > /* Clear any previous failed attempts at recovery. Time to try again. */ > @@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv) > wakeup: > i915_gem_reset_finish(dev_priv); > enable_irq(dev_priv->drm.irq); > - wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS); > + > + clear_bit(I915_RESET_HANDOFF, &error->flags); > + wake_up_bit(&error->flags, I915_RESET_HANDOFF); > return; > > error: > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 5156bcc59dea..0b02a6d710e1 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1595,8 +1595,33 @@ struct i915_gpu_error { > */ > unsigned long reset_count; > > + /** > + * flags: Control various stages of the GPU reset > + * > + * #I915_RESET_BACKOFF - When we start a reset, we want to stop any > + * other users acquiring the struct_mutex. To do this we set the > + * #I915_RESET_BACKOFF bit in the error flags when we detect a reset > + * and then check for that bit before acquiring the struct_mutex (in > + * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a > + * secondary role in preventing two concurrent global reset attempts. > + * > + * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the > + * struct_mutex. We try to acquire the struct_mutex in the reset worker, > + * but it may be held by some long running waiter (that we cannot > + * interrupt without causing trouble). Once we are ready to do the GPU > + * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If > + * they already hold the struct_mutex and want to participate they can > + * inspect the bit and do the reset directly, otherwise the worker > + * waits for the struct_mutex. > + * > + * #I915_WEDGED - If reset fails and we can no longer use the GPU, > + * we set the #I915_WEDGED bit. Prior to command submission, e.g. > + * i915_gem_request_alloc(), this bit is checked and the sequence > + * aborted (with -EIO reported to userspace) if set. > + */ > unsigned long flags; > -#define I915_RESET_IN_PROGRESS 0 > +#define I915_RESET_BACKOFF 0 > +#define I915_RESET_HANDOFF 1 > #define I915_WEDGED (BITS_PER_LONG - 1) > > /** > @@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine); > > void i915_gem_retire_requests(struct drm_i915_private *dev_priv); > > -static inline bool i915_reset_in_progress(struct i915_gpu_error *error) > +static inline bool i915_reset_backoff(struct i915_gpu_error *error) > +{ > + return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags)); > +} > + > +static inline bool i915_reset_handoff(struct i915_gpu_error *error) > { > - return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags)); > + return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags)); > } > > static inline bool i915_terminally_wedged(struct i915_gpu_error *error) > @@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error) > return unlikely(test_bit(I915_WEDGED, &error->flags)); > } > > -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error) > +static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error) > { > - return i915_reset_in_progress(error) | i915_terminally_wedged(error); > + return i915_reset_backoff(error) | i915_terminally_wedged(error); > } > > static inline u32 i915_reset_count(struct i915_gpu_error *error) > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index d87983ba536f..ecd1b038318a 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error) > > might_sleep(); > > - if (!i915_reset_in_progress(error)) > - return 0; > - > /* > * Only wait 10 seconds for the gpu reset to complete to avoid hanging > * userspace. If it takes that long something really bad is going on and > * we should simply try to bail out and fail as gracefully as possible. > */ > ret = wait_event_interruptible_timeout(error->reset_queue, > - !i915_reset_in_progress(error), > + !i915_reset_backoff(error), > I915_RESET_TIMEOUT); > if (ret == 0) { > DRM_ERROR("Timed out waiting for the gpu reset to complete\n"); > diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c > index 1e1d9f2072cd..0e8d1010cecb 100644 > --- a/drivers/gpu/drm/i915/i915_gem_request.c > +++ b/drivers/gpu/drm/i915/i915_gem_request.c > @@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, > > static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request) > { > - if (likely(!i915_reset_in_progress(&request->i915->gpu_error))) > + if (likely(!i915_reset_handoff(&request->i915->gpu_error))) > return false; > > __set_current_state(TASK_RUNNING); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index e646c4eba65d..495e6a79cacf 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg) > return ret; > } > > -static void i915_error_wake_up(struct drm_i915_private *dev_priv) > -{ > - /* > - * Notify all waiters for GPU completion events that reset state has > - * been changed, and that they need to restart their wait after > - * checking for potential errors (and bail out to drop locks if there is > - * a gpu reset pending so that i915_error_work_func can acquire them). > - */ > - > - /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */ > - wake_up_all(&dev_priv->gpu_error.wait_queue); > - > - /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */ > - wake_up_all(&dev_priv->pending_flip_queue); > -} > - > /** > * i915_reset_and_wakeup - do process context error handling work > * @dev_priv: i915 device private > @@ -2676,6 +2660,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > intel_runtime_pm_get(dev_priv); > intel_prepare_reset(dev_priv); > > + set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags); > + wake_up_all(&dev_priv->gpu_error.wait_queue); > + > do { > /* > * All state reset _must_ be completed before we update the > @@ -2690,7 +2677,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > > /* We need to wait for anyone holding the lock to wakeup */ > } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags, > - I915_RESET_IN_PROGRESS, > + I915_RESET_HANDOFF, > TASK_UNINTERRUPTIBLE, > HZ)); > > @@ -2705,6 +2692,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > * Note: The wake_up also serves as a memory barrier so that > * waiters see the updated value of the dev_priv->gpu_error. > */ > + clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags); > wake_up_all(&dev_priv->gpu_error.reset_queue); > } > > @@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > if (!engine_mask) > return; > > - if (test_and_set_bit(I915_RESET_IN_PROGRESS, > + if (test_and_set_bit(I915_RESET_BACKOFF, > &dev_priv->gpu_error.flags)) > return; > > - /* > - * Wakeup waiting processes so that the reset function > - * i915_reset_and_wakeup doesn't deadlock trying to grab > - * various locks. By bumping the reset counter first, the woken > - * processes will see a reset in progress and back off, > - * releasing their locks and then wait for the reset completion. > - * We must do this for _all_ gpu waiters that might hold locks > - * that the reset work needs to acquire. > - * > - * Note: The wake_up also provides a memory barrier to ensure that the > - * waiters see the updated value of the reset flags. > - */ > - i915_error_wake_up(dev_priv); > - > i915_reset_and_wakeup(dev_priv); > } > > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c > index 4b73513b46c1..5959c9b6dc97 100644 > --- a/drivers/gpu/drm/i915/intel_display.c > +++ b/drivers/gpu/drm/i915/intel_display.c > @@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc) > { > struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error; > > - if (i915_reset_in_progress(error)) > + if (i915_reset_backoff(error)) > return true; > > if (crtc->reset_count != i915_reset_count(error)) > @@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, > goto cleanup; > > intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error); > - if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) { > + if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) { > ret = -EIO; > goto unlock; > } > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index d4acee6730e9..6ec7c731a267 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -301,7 +301,8 @@ static int igt_global_reset(void *arg) > > /* Check that we can issue a global GPU reset */ > > - set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags); > + set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > + set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags); > > mutex_lock(&i915->drm.struct_mutex); > reset_count = i915_reset_count(&i915->gpu_error); > @@ -314,7 +315,8 @@ static int igt_global_reset(void *arg) > } > mutex_unlock(&i915->drm.struct_mutex); > > - GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags)); > + GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); > + clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > if (i915_terminally_wedged(&i915->gpu_error)) > err = -EIO; > > @@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq) > > reset_count = i915_reset_count(&rq->i915->gpu_error); > > - set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags); > + set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags); > wake_up_all(&rq->i915->gpu_error.wait_queue); > > return reset_count; > @@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg) > > /* Check that we detect a stuck waiter and issue a reset */ > > - set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags); > + set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > > mutex_lock(&i915->drm.struct_mutex); > err = hang_init(&h, i915); > @@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg) > err = timeout; > goto out_rq; > } > - GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags)); > > + GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); > if (i915_reset_count(&i915->gpu_error) == reset_count) { > pr_err("No GPU reset recorded!\n"); > err = -EINVAL; > @@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg) > hang_fini(&h); > unlock: > mutex_unlock(&i915->drm.struct_mutex); > + clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > > if (i915_terminally_wedged(&i915->gpu_error)) > return -EIO; > @@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg) > if (!igt_can_mi_store_dword_imm(i915)) > return 0; > > + set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > mutex_lock(&i915->drm.struct_mutex); > err = hang_init(&h, i915); > if (err) > @@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg) > > i915_reset(i915); > > - GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, > + GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, > &i915->gpu_error.flags)); > + > if (prev->fence.error != -EIO) { > pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", > prev->fence.error); > @@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg) > hang_fini(&h); > unlock: > mutex_unlock(&i915->drm.struct_mutex); > + clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags); > > if (i915_terminally_wedged(&i915->gpu_error)) > return -EIO; > -- > 2.11.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx