Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Now that the submission backends are controlled via their own spinlocks, > with a wave of a magic wand we can lift the struct_mutex requirement > around GPU reset. That is we allow the submission frontend (userspace) > to keep on submitting while we process the GPU reset as we can suspend > the backend independently. > > The major change is around the backoff/handoff strategy for performing > the reset. With no mutex deadlock, we no longer have to coordinate with > any waiter, and just perform the reset immediately. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 38 +- > drivers/gpu/drm/i915/i915_drv.h | 5 - > drivers/gpu/drm/i915/i915_gem.c | 18 +- > drivers/gpu/drm/i915/i915_gem_fence_reg.h | 1 - > drivers/gpu/drm/i915/i915_gem_gtt.h | 1 + > drivers/gpu/drm/i915/i915_gpu_error.c | 104 +++-- > drivers/gpu/drm/i915/i915_gpu_error.h | 28 +- > drivers/gpu/drm/i915/i915_request.c | 47 --- > drivers/gpu/drm/i915/i915_reset.c | 397 ++++++++---------- > drivers/gpu/drm/i915/i915_reset.h | 3 + > drivers/gpu/drm/i915/intel_engine_cs.c | 6 +- > drivers/gpu/drm/i915/intel_guc_submission.c | 5 +- > drivers/gpu/drm/i915/intel_hangcheck.c | 28 +- > drivers/gpu/drm/i915/intel_lrc.c | 92 ++-- > drivers/gpu/drm/i915/intel_overlay.c | 2 - > drivers/gpu/drm/i915/intel_ringbuffer.c | 91 ++-- > drivers/gpu/drm/i915/intel_ringbuffer.h | 17 +- > .../gpu/drm/i915/selftests/intel_hangcheck.c | 57 +-- > .../drm/i915/selftests/intel_workarounds.c | 3 - > .../gpu/drm/i915/selftests/mock_gem_device.c | 4 +- > 20 files changed, 393 insertions(+), 554 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 24d6d4ce14ef..3ec369980d40 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > seq_puts(m, "Wedged\n"); > if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) > seq_puts(m, "Reset in progress: struct_mutex backoff\n"); > - if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags)) > - seq_puts(m, "Reset in progress: reset handoff to waiter\n"); > if (waitqueue_active(&dev_priv->gpu_error.wait_queue)) > seq_puts(m, "Waiter holding struct mutex\n"); > if (waitqueue_active(&dev_priv->gpu_error.reset_queue)) > @@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > struct rb_node *rb; > > seq_printf(m, "%s:\n", engine->name); > - seq_printf(m, "\tseqno = %x [current %x, last %x]\n", > + seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n", > engine->hangcheck.seqno, seqno[id], > - intel_engine_last_submit(engine)); > - seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n", > + intel_engine_last_submit(engine), > + jiffies_to_msecs(jiffies - > + engine->hangcheck.action_timestamp)); > + seq_printf(m, "\twaiters? %s, fake irq active? 
%s\n", > yesno(intel_engine_has_waiter(engine)), > yesno(test_bit(engine->id, > - &dev_priv->gpu_error.missed_irq_rings)), > - yesno(engine->hangcheck.stalled), > - yesno(engine->hangcheck.wedged)); > + &dev_priv->gpu_error.missed_irq_rings))); > > spin_lock_irq(&b->rb_lock); > for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { > @@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", > (long long)engine->hangcheck.acthd, > (long long)acthd[id]); > - seq_printf(m, "\taction = %s(%d) %d ms ago\n", > - hangcheck_action_to_str(engine->hangcheck.action), > - engine->hangcheck.action, > - jiffies_to_msecs(jiffies - > - engine->hangcheck.action_timestamp)); Yeah it is a time for sample and most decision are on top of seqno. Welcomed compression. > > if (engine->id == RCS) { > seq_puts(m, "\tinstdone read =\n"); > @@ -3886,8 +3879,6 @@ static int > i915_wedged_set(void *data, u64 val) *hones his axe* > { > struct drm_i915_private *i915 = data; > - struct intel_engine_cs *engine; > - unsigned int tmp; > > /* > * There is no safeguard against this debugfs entry colliding > @@ -3900,18 +3891,8 @@ i915_wedged_set(void *data, u64 val) > if (i915_reset_backoff(&i915->gpu_error)) > return -EAGAIN; > > - for_each_engine_masked(engine, i915, val, tmp) { > - engine->hangcheck.seqno = intel_engine_get_seqno(engine); > - engine->hangcheck.stalled = true; > - } > - > i915_handle_error(i915, val, I915_ERROR_CAPTURE, > "Manually set wedged engine mask = %llx", val); > - > - wait_on_bit(&i915->gpu_error.flags, > - I915_RESET_HANDOFF, > - TASK_UNINTERRUPTIBLE); > - > return 0; > } > > @@ -4066,13 +4047,8 @@ i915_drop_caches_set(void *data, u64 val) > mutex_unlock(&i915->drm.struct_mutex); > } > > - if (val & DROP_RESET_ACTIVE && > - i915_terminally_wedged(&i915->gpu_error)) { > + if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error)) > i915_handle_error(i915, ALL_ENGINES, 0, NULL); > - wait_on_bit(&i915->gpu_error.flags, > - I915_RESET_HANDOFF, > - TASK_UNINTERRUPTIBLE); > - } > > fs_reclaim_acquire(GFP_KERNEL); > if (val & DROP_BOUND) > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 03db011caa8e..59a7e90113d7 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error) > return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags)); > } > > -static inline bool i915_reset_handoff(struct i915_gpu_error *error) > -{ > - return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags)); > -} > - > static inline bool i915_terminally_wedged(struct i915_gpu_error *error) > { > return unlikely(test_bit(I915_WEDGED, &error->flags)); > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index b359390ba22c..d20b42386c3c 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj, > struct intel_rps_client *rps_client) > { > might_sleep(); > -#if IS_ENABLED(CONFIG_LOCKDEP) > - GEM_BUG_ON(debug_locks && > - !!lockdep_is_held(&obj->base.dev->struct_mutex) != > - !!(flags & I915_WAIT_LOCKED)); > -#endif > GEM_BUG_ON(timeout < 0); > > timeout = i915_gem_object_wait_reservation(obj->resv, > @@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915) > > GEM_TRACE("\n"); > > - mutex_lock(&i915->drm.struct_mutex); > - > 
wakeref = intel_runtime_pm_get(i915); > intel_uncore_forcewake_get(i915, FORCEWAKE_ALL); > > @@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915) > intel_uncore_forcewake_put(i915, FORCEWAKE_ALL); > intel_runtime_pm_put(i915, wakeref); > unset_wedged looks ok, I should have faith as I reviewed the patch. In retrospect READ_ONCE on gt.scratch might have been good at rising suspicion, even tho superfluous. Looks like that engines we are saved by the timeline lock. Andw we have layed some GEM_BUG_ON mines there so we will hear the explosions if any. > + mutex_lock(&i915->drm.struct_mutex); > i915_gem_contexts_lost(i915); > mutex_unlock(&i915->drm.struct_mutex); > } > @@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915) > wakeref = intel_runtime_pm_get(i915); > intel_suspend_gt_powersave(i915); > > + flush_workqueue(i915->wq); I don't know what is happening here. Why don't we need the i915_gem_drain_workqueue in here? > + > mutex_lock(&i915->drm.struct_mutex); > > /* > @@ -4563,11 +4559,9 @@ int i915_gem_suspend(struct drm_i915_private *i915) > i915_retire_requests(i915); /* ensure we flush after wedging */ > > mutex_unlock(&i915->drm.struct_mutex); > + i915_reset_flush(i915); > > - intel_uc_suspend(i915); > - > - cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work); > - cancel_delayed_work_sync(&i915->gt.retire_work); > + drain_delayed_work(&i915->gt.retire_work); Hangcheck is inside reset flush but why the change for retire? > > /* > * As the idle_work is rearming if it detects a race, play safe and > @@ -4575,6 +4569,8 @@ int i915_gem_suspend(struct drm_i915_private *i915) > */ > drain_delayed_work(&i915->gt.idle_work); > > + intel_uc_suspend(i915); > + > /* > * Assert that we successfully flushed all the work and > * reset the GPU back to its idle, low power state. > diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h > index 99a31ded4dfd..09dcaf14121b 100644 > --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h > +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h > @@ -50,4 +50,3 @@ struct drm_i915_fence_reg { > }; > > #endif > - > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h > index 9229b03d629b..a0039ea97cdc 100644 > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h > @@ -39,6 +39,7 @@ > #include <linux/pagevec.h> > > #include "i915_request.h" > +#include "i915_reset.h" > #include "i915_selftest.h" > #include "i915_timeline.h" > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 1f8e80e31b49..4eef0462489c 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, > err_printf(m, " waiting: %s\n", yesno(ee->waiting)); > err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); > err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); > - err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled)); > - err_printf(m, " hangcheck action: %s\n", > - hangcheck_action_to_str(ee->hangcheck_action)); > - err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n", > + err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", > jiffies_to_msecs(ee->hangcheck_timestamp - epoch), > ee->hangcheck_timestamp, > ee->hangcheck_timestamp == epoch ? 
"; epoch" : ""); > @@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, > jiffies_to_msecs(error->capture - error->epoch)); > > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > - if (error->engine[i].hangcheck_stalled && > - error->engine[i].context.pid) { > - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", > - engine_name(m->i915, i), > - error->engine[i].context.comm, > - error->engine[i].context.pid, > - error->engine[i].context.ban_score, > - bannable(&error->engine[i].context)); > - } > + if (!error->engine[i].context.pid) > + continue; > + > + err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", > + engine_name(m->i915, i), > + error->engine[i].context.comm, > + error->engine[i].context.pid, > + error->engine[i].context.ban_score, > + bannable(&error->engine[i].context)); > } > err_printf(m, "Reset count: %u\n", error->reset_count); > err_printf(m, "Suspend count: %u\n", error->suspend_count); > @@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err, > return i; > } > > -/* Generate a semi-unique error code. The code is not meant to have meaning, The > +/* > + * Generate a semi-unique error code. The code is not meant to have meaning, The > * code's only purpose is to try to prevent false duplicated bug reports by > * grossly estimating a GPU error state. > * > @@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err, > * > * It's only a small step better than a random number in its current form. > */ > -static u32 i915_error_generate_code(struct drm_i915_private *dev_priv, > - struct i915_gpu_state *error, > - int *engine_id) > +static u32 i915_error_generate_code(struct i915_gpu_state *error, > + unsigned long engine_mask) > { > - u32 error_code = 0; > - int i; > - > - /* IPEHR would be an ideal way to detect errors, as it's the gross > + /* > + * IPEHR would be an ideal way to detect errors, as it's the gross > * measure of "the command that hung." However, has some very common > * synchronization commands which almost always appear in the case > * strictly a client bug. Use instdone to differentiate those some. 
> */ > - for (i = 0; i < I915_NUM_ENGINES; i++) { > - if (error->engine[i].hangcheck_stalled) { > - if (engine_id) > - *engine_id = i; > + if (engine_mask) { > + struct drm_i915_error_engine *ee = > + &error->engine[ffs(engine_mask)]; > > - return error->engine[i].ipehr ^ > - error->engine[i].instdone.instdone; > - } > + return ee->ipehr ^ ee->instdone.instdone; > } > > - return error_code; > + return 0; > } > > static void gem_record_fences(struct i915_gpu_state *error) > @@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error, > } > > ee->idle = intel_engine_is_idle(engine); > - ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; > - ee->hangcheck_action = engine->hangcheck.action; > - ee->hangcheck_stalled = engine->hangcheck.stalled; > + if (!ee->idle) > + ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; > ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, > engine); > > @@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error) > error->pgtbl_er = I915_READ(PGTBL_ER); > } > > -static void i915_error_capture_msg(struct drm_i915_private *dev_priv, > - struct i915_gpu_state *error, > - u32 engine_mask, > - const char *error_msg) > +static const char * > +error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg) > { > - u32 ecode; > - int engine_id = -1, len; > + int len; > + int i; > > - ecode = i915_error_generate_code(dev_priv, error, &engine_id); > + for (i = 0; i < ARRAY_SIZE(error->engine); i++) > + if (!error->engine[i].context.pid) > + engines &= ~BIT(i); No more grouping for driver internal hangs...? > > len = scnprintf(error->error_msg, sizeof(error->error_msg), > - "GPU HANG: ecode %d:%d:0x%08x", > - INTEL_GEN(dev_priv), engine_id, ecode); > - > - if (engine_id != -1 && error->engine[engine_id].context.pid) > + "GPU HANG: ecode %d:%lx:0x%08x", > + INTEL_GEN(error->i915), engines, > + i915_error_generate_code(error, engines)); > + if (engines) { > + /* Just show the first executing process, more is confusing */ > + i = ffs(engines); then why not just make the ecode accepting single engine and move it here. > len += scnprintf(error->error_msg + len, > sizeof(error->error_msg) - len, > ", in %s [%d]", > - error->engine[engine_id].context.comm, > - error->engine[engine_id].context.pid); > + error->engine[i].context.comm, > + error->engine[i].context.pid); > + } > + if (msg) > + len += scnprintf(error->error_msg + len, > + sizeof(error->error_msg) - len, > + ", %s", msg); > > - scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, > - ", reason: %s, action: %s", > - error_msg, > - engine_mask ? 
"reset" : "continue"); > + return error->error_msg; > } > > static void capture_gen_state(struct i915_gpu_state *error) > @@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error) > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > const struct drm_i915_error_engine *ee = &error->engine[i]; > > - if (ee->hangcheck_stalled && > + if (ee->hangcheck_timestamp && > time_before(ee->hangcheck_timestamp, epoch)) > epoch = ee->hangcheck_timestamp; > } > @@ -1921,7 +1916,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915) > * i915_capture_error_state - capture an error record for later analysis > * @i915: i915 device > * @engine_mask: the mask of engines triggering the hang > - * @error_msg: a message to insert into the error capture header > + * @msg: a message to insert into the error capture header > * > * Should be called when an error is detected (either a hang or an error > * interrupt) to capture error state from the time of the error. Fills > @@ -1929,8 +1924,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915) > * to pick up. > */ > void i915_capture_error_state(struct drm_i915_private *i915, > - u32 engine_mask, > - const char *error_msg) > + unsigned long engine_mask, > + const char *msg) > { > static bool warned; > struct i915_gpu_state *error; > @@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915, > if (IS_ERR(error)) > return; > > - i915_error_capture_msg(i915, error, engine_mask, error_msg); > - DRM_INFO("%s\n", error->error_msg); > + dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); > > if (!error->simulated) { > spin_lock_irqsave(&i915->gpu_error.lock, flags); > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index 604291f7762d..231173786eae 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -85,8 +85,6 @@ struct i915_gpu_state { > bool waiting; > int num_waiters; > unsigned long hangcheck_timestamp; > - bool hangcheck_stalled; > - enum intel_engine_hangcheck_action hangcheck_action; > struct i915_address_space *vm; > int num_requests; > u32 reset_count; > @@ -197,6 +195,8 @@ struct i915_gpu_state { > struct scatterlist *sgl, *fit; > }; > > +struct i915_gpu_restart; > + > struct i915_gpu_error { > /* For hangcheck timer */ > #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ > @@ -247,15 +247,6 @@ struct i915_gpu_error { > * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a > * secondary role in preventing two concurrent global reset attempts. > * > - * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the > - * struct_mutex. We try to acquire the struct_mutex in the reset worker, > - * but it may be held by some long running waiter (that we cannot > - * interrupt without causing trouble). Once we are ready to do the GPU > - * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If > - * they already hold the struct_mutex and want to participate they can > - * inspect the bit and do the reset directly, otherwise the worker > - * waits for the struct_mutex. > - * > * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to > * acquire the struct_mutex to reset an engine, we need an explicit > * flag to prevent two concurrent reset attempts in the same engine. 
> @@ -269,20 +260,13 @@ struct i915_gpu_error { > */ > unsigned long flags; > #define I915_RESET_BACKOFF 0 > -#define I915_RESET_HANDOFF 1 > -#define I915_RESET_MODESET 2 > -#define I915_RESET_ENGINE 3 > +#define I915_RESET_MODESET 1 > +#define I915_RESET_ENGINE 2 > #define I915_WEDGED (BITS_PER_LONG - 1) > > /** Number of times an engine has been reset */ > u32 reset_engine_count[I915_NUM_ENGINES]; > > - /** Set of stalled engines with guilty requests, in the current reset */ > - u32 stalled_mask; > - > - /** Reason for the current *global* reset */ > - const char *reason; > - > struct mutex wedge_mutex; /* serialises wedging/unwedging */ > > /** > @@ -299,6 +283,8 @@ struct i915_gpu_error { > > /* For missed irq/seqno simulation. */ > unsigned long test_irq_rings; > + > + struct i915_gpu_restart *restart; > }; > > struct drm_i915_error_state_buf { > @@ -320,7 +306,7 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); > > struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915); > void i915_capture_error_state(struct drm_i915_private *dev_priv, > - u32 engine_mask, > + unsigned long engine_mask, > const char *error_msg); > > static inline struct i915_gpu_state * > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 5e178f5ac18b..80232de8e2be 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -1083,18 +1083,6 @@ static bool __i915_spin_request(const struct i915_request *rq, > return false; > } > > -static bool __i915_wait_request_check_and_reset(struct i915_request *request) > -{ > - struct i915_gpu_error *error = &request->i915->gpu_error; > - > - if (likely(!i915_reset_handoff(error))) > - return false; > - > - __set_current_state(TASK_RUNNING); > - i915_reset(request->i915, error->stalled_mask, error->reason); > - return true; > -} > - > /** > * i915_request_wait - wait until execution of request has finished > * @rq: the request to wait upon > @@ -1120,17 +1108,10 @@ long i915_request_wait(struct i915_request *rq, > { > const int state = flags & I915_WAIT_INTERRUPTIBLE ? 
> TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; > - wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue; > - DEFINE_WAIT_FUNC(reset, default_wake_function); > DEFINE_WAIT_FUNC(exec, default_wake_function); > struct intel_wait wait; > > might_sleep(); > -#if IS_ENABLED(CONFIG_LOCKDEP) > - GEM_BUG_ON(debug_locks && > - !!lockdep_is_held(&rq->i915->drm.struct_mutex) != > - !!(flags & I915_WAIT_LOCKED)); > -#endif > GEM_BUG_ON(timeout < 0); > > if (i915_request_completed(rq)) > @@ -1140,11 +1121,7 @@ long i915_request_wait(struct i915_request *rq, > return -ETIME; > > trace_i915_request_wait_begin(rq, flags); > - > add_wait_queue(&rq->execute, &exec); > - if (flags & I915_WAIT_LOCKED) > - add_wait_queue(errq, &reset); > - > intel_wait_init(&wait); > if (flags & I915_WAIT_PRIORITY) > i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT); > @@ -1155,10 +1132,6 @@ long i915_request_wait(struct i915_request *rq, > if (intel_wait_update_request(&wait, rq)) > break; > > - if (flags & I915_WAIT_LOCKED && > - __i915_wait_request_check_and_reset(rq)) > - continue; > - > if (signal_pending_state(state, current)) { > timeout = -ERESTARTSYS; > goto complete; > @@ -1188,9 +1161,6 @@ long i915_request_wait(struct i915_request *rq, > */ > goto wakeup; > > - if (flags & I915_WAIT_LOCKED) > - __i915_wait_request_check_and_reset(rq); > - > for (;;) { > if (signal_pending_state(state, current)) { > timeout = -ERESTARTSYS; > @@ -1214,21 +1184,6 @@ long i915_request_wait(struct i915_request *rq, > if (i915_request_completed(rq)) > break; > > - /* > - * If the GPU is hung, and we hold the lock, reset the GPU > - * and then check for completion. On a full reset, the engine's > - * HW seqno will be advanced passed us and we are complete. > - * If we do a partial reset, we have to wait for the GPU to > - * resume and update the breadcrumb. > - * > - * If we don't hold the mutex, we can just wait for the worker > - * to come along and update the breadcrumb (either directly > - * itself, or indirectly by recovering the GPU). 
> - */ > - if (flags & I915_WAIT_LOCKED && > - __i915_wait_request_check_and_reset(rq)) > - continue; > - > /* Only spin if we know the GPU is processing this request */ > if (__i915_spin_request(rq, wait.seqno, state, 2)) > break; > @@ -1242,8 +1197,6 @@ long i915_request_wait(struct i915_request *rq, > intel_engine_remove_wait(rq->engine, &wait); > complete: > __set_current_state(TASK_RUNNING); > - if (flags & I915_WAIT_LOCKED) > - remove_wait_queue(errq, &reset); > remove_wait_queue(&rq->execute, &exec); > trace_i915_request_wait_end(rq); > > diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c > index 2961c21d9420..064fc6da1512 100644 > --- a/drivers/gpu/drm/i915/i915_reset.c > +++ b/drivers/gpu/drm/i915/i915_reset.c > @@ -5,6 +5,7 @@ > */ > > #include <linux/sched/mm.h> > +#include <linux/stop_machine.h> > > #include "i915_drv.h" > #include "i915_gpu_error.h" > @@ -17,22 +18,23 @@ static void engine_skip_context(struct i915_request *rq) > struct intel_engine_cs *engine = rq->engine; > struct i915_gem_context *hung_ctx = rq->gem_context; > struct i915_timeline *timeline = rq->timeline; > - unsigned long flags; > > + lockdep_assert_held(&engine->timeline.lock); > GEM_BUG_ON(timeline == &engine->timeline); > > - spin_lock_irqsave(&engine->timeline.lock, flags); > spin_lock(&timeline->lock); > > - list_for_each_entry_continue(rq, &engine->timeline.requests, link) > - if (rq->gem_context == hung_ctx) > - i915_request_skip(rq, -EIO); > + if (rq->global_seqno) { > + list_for_each_entry_continue(rq, > + &engine->timeline.requests, link) > + if (rq->gem_context == hung_ctx) > + i915_request_skip(rq, -EIO); > + } > > list_for_each_entry(rq, &timeline->requests, link) > i915_request_skip(rq, -EIO); > > spin_unlock(&timeline->lock); > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > } > > static void client_mark_guilty(struct drm_i915_file_private *file_priv, > @@ -59,7 +61,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv, > } > } > > -static void context_mark_guilty(struct i915_gem_context *ctx) > +static bool context_mark_guilty(struct i915_gem_context *ctx) > { > unsigned int score; > bool banned, bannable; > @@ -72,7 +74,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx) > > /* Cool contexts don't accumulate client ban score */ > if (!bannable) > - return; > + return false; > > if (banned) { > DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n", > @@ -83,6 +85,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx) > > if (!IS_ERR_OR_NULL(ctx->file_priv)) > client_mark_guilty(ctx->file_priv, ctx); > + > + return banned; > } > > static void context_mark_innocent(struct i915_gem_context *ctx) > @@ -90,6 +94,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx) > atomic_inc(&ctx->active_count); > } > > +void i915_reset_request(struct i915_request *rq, bool guilty) > +{ > + lockdep_assert_held(&rq->engine->timeline.lock); > + GEM_BUG_ON(i915_request_completed(rq)); > + > + if (guilty) { > + i915_request_skip(rq, -EIO); > + if (context_mark_guilty(rq->gem_context)) > + engine_skip_context(rq); > + } else { > + dma_fence_set_error(&rq->fence, -EAGAIN); > + context_mark_innocent(rq->gem_context); > + } > +} > + > static void gen3_stop_engine(struct intel_engine_cs *engine) > { > struct drm_i915_private *dev_priv = engine->i915; > @@ -533,22 +552,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask) > int retry; > int ret; > > - /* > - * We want to perform 
per-engine reset from atomic context (e.g. > - * softirq), which imposes the constraint that we cannot sleep. > - * However, experience suggests that spending a bit of time waiting > - * for a reset helps in various cases, so for a full-device reset > - * we apply the opposite rule and wait if we want to. As we should > - * always follow up a failed per-engine reset with a full device reset, > - * being a little faster, stricter and more error prone for the > - * atomic case seems an acceptable compromise. > - * > - * Unfortunately this leads to a bimodal routine, when the goal was > - * to have a single reset function that worked for resetting any > - * number of engines simultaneously. > - */ > - might_sleep_if(engine_mask == ALL_ENGINES); Oh here it is. I was after this on atomic resets. > - > /* > * If the power well sleeps during the reset, the reset > * request may be dropped and never completes (causing -EIO). > @@ -580,8 +583,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask) > } > if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES) > break; > - > - cond_resched(); > } > intel_uncore_forcewake_put(i915, FORCEWAKE_ALL); > > @@ -620,11 +621,8 @@ int intel_reset_guc(struct drm_i915_private *i915) > * Ensure irq handler finishes, and not run again. > * Also return the active request so that we only search for it once. > */ > -static struct i915_request * > -reset_prepare_engine(struct intel_engine_cs *engine) > +static void reset_prepare_engine(struct intel_engine_cs *engine) > { > - struct i915_request *rq; > - > /* > * During the reset sequence, we must prevent the engine from > * entering RC6. As the context state is undefined until we restart > @@ -633,162 +631,86 @@ reset_prepare_engine(struct intel_engine_cs *engine) > * GPU state upon resume, i.e. fail to restart after a reset. > */ > intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL); > - > - rq = engine->reset.prepare(engine); > - if (rq && rq->fence.error == -EIO) > - rq = ERR_PTR(-EIO); /* Previous reset failed! */ > - > - return rq; > + engine->reset.prepare(engine); > } > > -static int reset_prepare(struct drm_i915_private *i915) > +static void reset_prepare(struct drm_i915_private *i915) > { > struct intel_engine_cs *engine; > - struct i915_request *rq; > enum intel_engine_id id; > - int err = 0; > > - for_each_engine(engine, i915, id) { > - rq = reset_prepare_engine(engine); > - if (IS_ERR(rq)) { > - err = PTR_ERR(rq); > - continue; > - } > - > - engine->hangcheck.active_request = rq; > - } > + for_each_engine(engine, i915, id) > + reset_prepare_engine(engine); > > - i915_gem_revoke_fences(i915); > intel_uc_sanitize(i915); > - > - return err; > } > > -/* Returns the request if it was guilty of the hang */ > -static struct i915_request * > -reset_request(struct intel_engine_cs *engine, > - struct i915_request *rq, > - bool stalled) > +static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask) > { > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + int err; > + > /* > - * The guilty request will get skipped on a hung engine. > - * > - * Users of client default contexts do not rely on logical > - * state preserved between batches so it is safe to execute > - * queued requests following the hang. Non default contexts > - * rely on preserved state, so skipping a batch loses the > - * evolution of the state and it needs to be considered corrupted. > - * Executing more queued batches on top of corrupted state is > - * risky. 
But we take the risk by trying to advance through > - * the queued requests in order to make the client behaviour > - * more predictable around resets, by not throwing away random > - * amount of batches it has prepared for execution. Sophisticated > - * clients can use gem_reset_stats_ioctl and dma fence status > - * (exported via sync_file info ioctl on explicit fences) to observe > - * when it loses the context state and should rebuild accordingly. > - * > - * The context ban, and ultimately the client ban, mechanism are safety > - * valves if client submission ends up resulting in nothing more than > - * subsequent hangs. > + * Everything depends on having the GTT running, so we need to start > + * there. > */ > + err = i915_ggtt_enable_hw(i915); > + if (err) > + return err; > > - if (i915_request_completed(rq)) { > - GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n", > - engine->name, rq->global_seqno, > - rq->fence.context, rq->fence.seqno, > - intel_engine_get_seqno(engine)); > - stalled = false; > - } > - > - if (stalled) { > - context_mark_guilty(rq->gem_context); > - i915_request_skip(rq, -EIO); > + for_each_engine(engine, i915, id) > + intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id)); > > - /* If this context is now banned, skip all pending requests. */ > - if (i915_gem_context_is_banned(rq->gem_context)) > - engine_skip_context(rq); > - } else { > - /* > - * Since this is not the hung engine, it may have advanced > - * since the hang declaration. Double check by refinding > - * the active request at the time of the reset. > - */ > - rq = i915_gem_find_active_request(engine); > - if (rq) { > - unsigned long flags; > - > - context_mark_innocent(rq->gem_context); > - dma_fence_set_error(&rq->fence, -EAGAIN); > - > - /* Rewind the engine to replay the incomplete rq */ > - spin_lock_irqsave(&engine->timeline.lock, flags); > - rq = list_prev_entry(rq, link); > - if (&rq->link == &engine->timeline.requests) > - rq = NULL; > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > - } > - } > + i915_gem_restore_fences(i915); > > - return rq; > + return err; > } > > -static void reset_engine(struct intel_engine_cs *engine, > - struct i915_request *rq, > - bool stalled) > +static void reset_finish_engine(struct intel_engine_cs *engine) > { > - if (rq) > - rq = reset_request(engine, rq, stalled); > - > - /* Setup the CS to resume from the breadcrumb of the hung request */ > - engine->reset.reset(engine, rq); > + engine->reset.finish(engine); > + intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); > } > > -static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask) > +struct i915_gpu_restart { > + struct work_struct work; > + struct drm_i915_private *i915; > +}; > + > +static void restart_work(struct work_struct *work) > { > + struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work); > + struct drm_i915_private *i915 = arg->i915; > struct intel_engine_cs *engine; > enum intel_engine_id id; > + intel_wakeref_t wakeref; > > - lockdep_assert_held(&i915->drm.struct_mutex); > + wakeref = intel_runtime_pm_get(i915); > + mutex_lock(&i915->drm.struct_mutex); > > - i915_retire_requests(i915); Can't do this anymore yes. What will be the effect of delaying this and the other explicit retirements? Are we more prone to starvation? > + smp_store_mb(i915->gpu_error.restart, NULL); Checkpatch might want a comment for the mb. 
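Something along these lines would do (untested, and the pairing described
in the comment is my guess at the intent, so adjust the wording as you see
fit):

	/*
	 * Clear error->restart before we start poking the engines so
	 * that a reset arriving while we run can queue a fresh restart
	 * worker; pairs with the READ_ONCE()/WRITE_ONCE() of
	 * gpu_error.restart in reset_restart().
	 */
	smp_store_mb(i915->gpu_error.restart, NULL);

That would also keep checkpatch quiet about the bare barrier.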
> > for_each_engine(engine, i915, id) { > - struct intel_context *ce; > - > - reset_engine(engine, > - engine->hangcheck.active_request, > - stalled_mask & ENGINE_MASK(id)); > - ce = fetch_and_zero(&engine->last_retired_context); > - if (ce) > - intel_context_unpin(ce); > + struct i915_request *rq; > > /* > * Ostensibily, we always want a context loaded for powersaving, > * so if the engine is idle after the reset, send a request > * to load our scratch kernel_context. > - * > - * More mysteriously, if we leave the engine idle after a reset, > - * the next userspace batch may hang, with what appears to be > - * an incoherent read by the CS (presumably stale TLB). An > - * empty request appears sufficient to paper over the glitch. > */ > - if (intel_engine_is_idle(engine)) { > - struct i915_request *rq; > + if (!intel_engine_is_idle(engine)) > + continue; Why did you remove the comment on needing a empty request? Also if the request causing nonidle could be troublesome one, from troublesome context, why not just skip the idle check and always add one for kernel ctx? > > - rq = i915_request_alloc(engine, i915->kernel_context); > - if (!IS_ERR(rq)) > - i915_request_add(rq); > - } > + rq = i915_request_alloc(engine, i915->kernel_context); > + if (!IS_ERR(rq)) > + i915_request_add(rq); > } > > - i915_gem_restore_fences(i915); > -} > - > -static void reset_finish_engine(struct intel_engine_cs *engine) > -{ > - engine->reset.finish(engine); > + mutex_unlock(&i915->drm.struct_mutex); > + intel_runtime_pm_put(i915, wakeref); > > - intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); > + kfree(arg); > } > > static void reset_finish(struct drm_i915_private *i915) > @@ -796,11 +718,30 @@ static void reset_finish(struct drm_i915_private *i915) > struct intel_engine_cs *engine; > enum intel_engine_id id; > > - lockdep_assert_held(&i915->drm.struct_mutex); > - > - for_each_engine(engine, i915, id) { > - engine->hangcheck.active_request = NULL; > + for_each_engine(engine, i915, id) > reset_finish_engine(engine); > +} > + > +static void reset_restart(struct drm_i915_private *i915) > +{ > + struct i915_gpu_restart *arg; > + > + /* > + * Following the reset, ensure that we always reload context for > + * powersaving, and to correct engine->last_retired_context. Since > + * this requires us to submit a request, queue a worker to do that > + * task for us to evade any locking here. > + */ Nice, this was/will be helpful! > + if (READ_ONCE(i915->gpu_error.restart)) > + return; > + > + arg = kmalloc(sizeof(*arg), GFP_KERNEL); > + if (arg) { > + arg->i915 = i915; > + INIT_WORK(&arg->work, restart_work); > + > + WRITE_ONCE(i915->gpu_error.restart, arg); > + queue_work(i915->wq, &arg->work); > } > } > > @@ -889,8 +830,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) > struct i915_timeline *tl; > bool ret = false; > > - lockdep_assert_held(&i915->drm.struct_mutex); > - > if (!test_bit(I915_WEDGED, &error->flags)) > return true; > > @@ -913,9 +852,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) > */ > list_for_each_entry(tl, &i915->gt.timelines, link) { > struct i915_request *rq; > + long timeout; > > - rq = i915_gem_active_peek(&tl->last_request, > - &i915->drm.struct_mutex); > + rq = i915_gem_active_get_unlocked(&tl->last_request); > if (!rq) > continue; > > @@ -930,12 +869,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) > * and when the seqno passes the fence, the signaler > * then signals the fence waking us up). 
> */ > - if (dma_fence_default_wait(&rq->fence, true, > - MAX_SCHEDULE_TIMEOUT) < 0) > + timeout = dma_fence_default_wait(&rq->fence, true, > + MAX_SCHEDULE_TIMEOUT); > + i915_request_put(rq); > + if (timeout < 0) > goto unlock; > } > - i915_retire_requests(i915); > - GEM_BUG_ON(i915->gt.active_requests); > > intel_engines_sanitize(i915, false); > > @@ -949,7 +888,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) > * context and do not require stop_machine(). > */ > intel_engines_reset_default_submission(i915); > - i915_gem_contexts_lost(i915); > > GEM_TRACE("end\n"); > > @@ -962,6 +900,43 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) > return ret; > } > > +struct __i915_reset { > + struct drm_i915_private *i915; > + unsigned int stalled_mask; > +}; > + > +static int __i915_reset__BKL(void *data) > +{ > + struct __i915_reset *arg = data; > + int err; > + > + err = intel_gpu_reset(arg->i915, ALL_ENGINES); > + if (err) > + return err; > + > + return gt_reset(arg->i915, arg->stalled_mask); > +} > + > +#if 0 > +#define __do_reset(fn, arg) stop_machine(fn, arg, NULL) Lets remove the machinery to select reset stop_machine and the #include. > +#else > +#define __do_reset(fn, arg) fn(arg) > +#endif > + > +static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask) > +{ > + struct __i915_reset arg = { i915, stalled_mask }; > + int err, i; > + > + err = __do_reset(__i915_reset__BKL, &arg); > + for (i = 0; err && i < 3; i++) { > + msleep(100); > + err = __do_reset(__i915_reset__BKL, &arg); > + } > + > + return err; > +} > + > /** > * i915_reset - reset chip after a hang > * @i915: #drm_i915_private to reset > @@ -987,31 +962,22 @@ void i915_reset(struct drm_i915_private *i915, > { > struct i915_gpu_error *error = &i915->gpu_error; > int ret; > - int i; > > GEM_TRACE("flags=%lx\n", error->flags); > > might_sleep(); What will? I didn't spot anything in execlists_reset_prepare. > - lockdep_assert_held(&i915->drm.struct_mutex); > assert_rpm_wakelock_held(i915); > GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); > > - if (!test_bit(I915_RESET_HANDOFF, &error->flags)) > - return; > - > /* Clear any previous failed attempts at recovery. Time to try again. */ > if (!i915_gem_unset_wedged(i915)) > - goto wakeup; > + return; > > if (reason) > dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); > error->reset_count++; > > - ret = reset_prepare(i915); > - if (ret) { > - dev_err(i915->drm.dev, "GPU recovery failed\n"); > - goto taint; > - } > + reset_prepare(i915); > > if (!intel_has_gpu_reset(i915)) { > if (i915_modparams.reset) > @@ -1021,32 +987,11 @@ void i915_reset(struct drm_i915_private *i915, > goto error; > } > > - for (i = 0; i < 3; i++) { > - ret = intel_gpu_reset(i915, ALL_ENGINES); > - if (ret == 0) > - break; > - > - msleep(100); > - } > - if (ret) { > + if (do_reset(i915, stalled_mask)) { > dev_err(i915->drm.dev, "Failed to reset chip\n"); > goto taint; > } > > - /* Ok, now get things going again... */ > - > - /* > - * Everything depends on having the GTT running, so we need to start > - * there. 
> - */ > - ret = i915_ggtt_enable_hw(i915); > - if (ret) { > - DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n", > - ret); > - goto error; > - } > - > - gt_reset(i915, stalled_mask); > intel_overlay_reset(i915); > > /* > @@ -1068,9 +1013,8 @@ void i915_reset(struct drm_i915_private *i915, > > finish: > reset_finish(i915); > -wakeup: > - clear_bit(I915_RESET_HANDOFF, &error->flags); > - wake_up_bit(&error->flags, I915_RESET_HANDOFF); > + if (!i915_terminally_wedged(error)) > + reset_restart(i915); > return; > > taint: > @@ -1089,7 +1033,6 @@ void i915_reset(struct drm_i915_private *i915, > add_taint(TAINT_WARN, LOCKDEP_STILL_OK); > error: > i915_gem_set_wedged(i915); > - i915_retire_requests(i915); > goto finish; > } > > @@ -1115,18 +1058,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915, > int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > { > struct i915_gpu_error *error = &engine->i915->gpu_error; > - struct i915_request *active_request; > int ret; > > GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); > GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); > > - active_request = reset_prepare_engine(engine); > - if (IS_ERR_OR_NULL(active_request)) { > - /* Either the previous reset failed, or we pardon the reset. */ > - ret = PTR_ERR(active_request); > - goto out; > - } > + if (i915_seqno_passed(intel_engine_get_seqno(engine), > + intel_engine_last_submit(engine))) > + return 0; You seem to have a patch to remove this shortly after so squash? I need to restock on coffee at this point. -Mika > + > + reset_prepare_engine(engine); > > if (msg) > dev_notice(engine->i915->drm.dev, > @@ -1150,7 +1091,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > * active request and can drop it, adjust head to skip the offending > * request to resume executing remaining requests in the queue. > */ > - reset_engine(engine, active_request, true); > + intel_engine_reset(engine, true); > > /* > * The engine and its registers (and workarounds in case of render) > @@ -1187,30 +1128,7 @@ static void i915_reset_device(struct drm_i915_private *i915, > i915_wedge_on_timeout(&w, i915, 5 * HZ) { > intel_prepare_reset(i915); > > - error->reason = reason; > - error->stalled_mask = engine_mask; > - > - /* Signal that locked waiters should reset the GPU */ > - smp_mb__before_atomic(); > - set_bit(I915_RESET_HANDOFF, &error->flags); > - wake_up_all(&error->wait_queue); > - > - /* > - * Wait for anyone holding the lock to wakeup, without > - * blocking indefinitely on struct_mutex. 
> - */ > - do { > - if (mutex_trylock(&i915->drm.struct_mutex)) { > - i915_reset(i915, engine_mask, reason); > - mutex_unlock(&i915->drm.struct_mutex); > - } > - } while (wait_on_bit_timeout(&error->flags, > - I915_RESET_HANDOFF, > - TASK_UNINTERRUPTIBLE, > - 1)); > - > - error->stalled_mask = 0; > - error->reason = NULL; > + i915_reset(i915, engine_mask, reason); > > intel_finish_reset(i915); > } > @@ -1366,6 +1284,25 @@ void i915_handle_error(struct drm_i915_private *i915, > intel_runtime_pm_put(i915, wakeref); > } > > +bool i915_reset_flush(struct drm_i915_private *i915) > +{ > + int err; > + > + cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work); > + > + flush_workqueue(i915->wq); > + GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart)); > + > + mutex_lock(&i915->drm.struct_mutex); > + err = i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED | > + I915_WAIT_FOR_IDLE_BOOST, > + MAX_SCHEDULE_TIMEOUT); > + mutex_unlock(&i915->drm.struct_mutex); > + > + return !err; > +} > + > static void i915_wedge_me(struct work_struct *work) > { > struct i915_wedge_me *w = container_of(work, typeof(*w), work.work); > diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h > index b6a519bde67d..f2d347f319df 100644 > --- a/drivers/gpu/drm/i915/i915_reset.h > +++ b/drivers/gpu/drm/i915/i915_reset.h > @@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915, > int i915_reset_engine(struct intel_engine_cs *engine, > const char *reason); > > +void i915_reset_request(struct i915_request *rq, bool guilty); > +bool i915_reset_flush(struct drm_i915_private *i915); > + > bool intel_has_gpu_reset(struct drm_i915_private *i915); > bool intel_has_reset_engine(struct drm_i915_private *i915); > > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c > index 2f3c71f6d313..fc52737751e7 100644 > --- a/drivers/gpu/drm/i915/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c > @@ -1071,10 +1071,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force) > if (!reset_engines(i915) && !force) > return; > > - for_each_engine(engine, i915, id) { > - if (engine->reset.reset) > - engine->reset.reset(engine, NULL); > - } > + for_each_engine(engine, i915, id) > + intel_engine_reset(engine, false); > } > > /** > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c > index ab1c49b106f2..7217c7e3ee8d 100644 > --- a/drivers/gpu/drm/i915/intel_guc_submission.c > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c > @@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data) > spin_unlock_irqrestore(&engine->timeline.lock, flags); > } > > -static struct i915_request * > -guc_reset_prepare(struct intel_engine_cs *engine) > +static void guc_reset_prepare(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > > @@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine) > */ > if (engine->i915->guc.preempt_wq) > flush_workqueue(engine->i915->guc.preempt_wq); > - > - return i915_gem_find_active_request(engine); > } > > /* > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c > index 741441daae32..5662d6fed523 100644 > --- a/drivers/gpu/drm/i915/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c > @@ -25,6 +25,17 @@ > #include "i915_drv.h" > #include "i915_reset.h" > > +struct hangcheck { > + u64 acthd; > + u32 seqno; > + enum intel_engine_hangcheck_action 
action; > + unsigned long action_timestamp; > + int deadlock; > + struct intel_instdone instdone; > + bool wedged:1; > + bool stalled:1; > +}; > + > static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone) > { > u32 tmp = current_instdone | *old_instdone; > @@ -119,25 +130,22 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) > } > > static void hangcheck_load_sample(struct intel_engine_cs *engine, > - struct intel_engine_hangcheck *hc) > + struct hangcheck *hc) > { > hc->acthd = intel_engine_get_active_head(engine); > hc->seqno = intel_engine_get_seqno(engine); > } > > static void hangcheck_store_sample(struct intel_engine_cs *engine, > - const struct intel_engine_hangcheck *hc) > + const struct hangcheck *hc) > { > engine->hangcheck.acthd = hc->acthd; > engine->hangcheck.seqno = hc->seqno; > - engine->hangcheck.action = hc->action; > - engine->hangcheck.stalled = hc->stalled; > - engine->hangcheck.wedged = hc->wedged; > } > > static enum intel_engine_hangcheck_action > hangcheck_get_action(struct intel_engine_cs *engine, > - const struct intel_engine_hangcheck *hc) > + const struct hangcheck *hc) > { > if (engine->hangcheck.seqno != hc->seqno) > return ENGINE_ACTIVE_SEQNO; > @@ -149,7 +157,7 @@ hangcheck_get_action(struct intel_engine_cs *engine, > } > > static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, > - struct intel_engine_hangcheck *hc) > + struct hangcheck *hc) > { > unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; > > @@ -265,19 +273,19 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > intel_uncore_arm_unclaimed_mmio_detection(dev_priv); > > for_each_engine(engine, dev_priv, id) { > - struct intel_engine_hangcheck hc; > + struct hangcheck hc; > > hangcheck_load_sample(engine, &hc); > hangcheck_accumulate_sample(engine, &hc); > hangcheck_store_sample(engine, &hc); > > - if (engine->hangcheck.stalled) { > + if (hc.stalled) { > hung |= intel_engine_flag(engine); > if (hc.action != ENGINE_DEAD) > stuck |= intel_engine_flag(engine); > } > > - if (engine->hangcheck.wedged) > + if (hc.wedged) > wedged |= intel_engine_flag(engine); > } > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 28d183439952..c11cbf34258d 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -136,6 +136,7 @@ > #include <drm/i915_drm.h> > #include "i915_drv.h" > #include "i915_gem_render_state.h" > +#include "i915_reset.h" > #include "i915_vgpu.h" > #include "intel_lrc_reg.h" > #include "intel_mocs.h" > @@ -288,7 +289,8 @@ static void unwind_wa_tail(struct i915_request *rq) > assert_ring_tail_valid(rq->ring, rq->tail); > } > > -static void __unwind_incomplete_requests(struct intel_engine_cs *engine) > +static struct i915_request * > +__unwind_incomplete_requests(struct intel_engine_cs *engine) > { > struct i915_request *rq, *rn, *active = NULL; > struct list_head *uninitialized_var(pl); > @@ -330,6 +332,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine) > list_move_tail(&active->sched.link, > i915_sched_lookup_priolist(engine, prio)); > } > + > + return active; > } > > void > @@ -1743,11 +1747,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) > return 0; > } > > -static struct i915_request * > -execlists_reset_prepare(struct intel_engine_cs *engine) > +static void execlists_reset_prepare(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > - struct i915_request *request, 
*active; > unsigned long flags; > > GEM_TRACE("%s: depth<-%d\n", engine->name, > @@ -1763,59 +1765,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine) > * prevents the race. > */ > __tasklet_disable_sync_once(&execlists->tasklet); > + GEM_BUG_ON(!reset_in_progress(execlists)); > > + /* And flush any current direct submission. */ > spin_lock_irqsave(&engine->timeline.lock, flags); > - > - /* > - * We want to flush the pending context switches, having disabled > - * the tasklet above, we can assume exclusive access to the execlists. > - * For this allows us to catch up with an inflight preemption event, > - * and avoid blaming an innocent request if the stall was due to the > - * preemption itself. > - */ > - process_csb(engine); > - > - /* > - * The last active request can then be no later than the last request > - * now in ELSP[0]. So search backwards from there, so that if the GPU > - * has advanced beyond the last CSB update, it will be pardoned. > - */ > - active = NULL; > - request = port_request(execlists->port); > - if (request) { > - /* > - * Prevent the breadcrumb from advancing before we decide > - * which request is currently active. > - */ > - intel_engine_stop_cs(engine); > - > - list_for_each_entry_from_reverse(request, > - &engine->timeline.requests, > - link) { > - if (__i915_request_completed(request, > - request->global_seqno)) > - break; > - > - active = request; > - } > - } > - > + process_csb(engine); /* drain preemption events */ > spin_unlock_irqrestore(&engine->timeline.lock, flags); > - > - return active; > } > > -static void execlists_reset(struct intel_engine_cs *engine, > - struct i915_request *request) > +static void execlists_reset(struct intel_engine_cs *engine, bool stalled) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > + struct i915_request *rq; > unsigned long flags; > u32 *regs; > > - GEM_TRACE("%s request global=%d, current=%d\n", > - engine->name, request ? request->global_seqno : 0, > - intel_engine_get_seqno(engine)); > - > spin_lock_irqsave(&engine->timeline.lock, flags); > > /* > @@ -1830,12 +1794,18 @@ static void execlists_reset(struct intel_engine_cs *engine, > execlists_cancel_port_requests(execlists); > > /* Push back any incomplete requests for replay after the reset. */ > - __unwind_incomplete_requests(engine); > + rq = __unwind_incomplete_requests(engine); > > /* Following the reset, we need to reload the CSB read/write pointers */ > reset_csb_pointers(&engine->execlists); > > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > + GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n", > + engine->name, > + rq ? rq->global_seqno : 0, > + intel_engine_get_seqno(engine), > + yesno(stalled)); > + if (!rq) > + goto out_unlock; > > /* > * If the request was innocent, we leave the request in the ELSP > @@ -1848,8 +1818,9 @@ static void execlists_reset(struct intel_engine_cs *engine, > * and have to at least restore the RING register in the context > * image back to the expected values to skip over the guilty request. > */ > - if (!request || request->fence.error != -EIO) > - return; > + i915_reset_request(rq, stalled); > + if (!stalled) > + goto out_unlock; > > /* > * We want a simple context + ring to execute the breadcrumb update. > @@ -1859,25 +1830,23 @@ static void execlists_reset(struct intel_engine_cs *engine, > * future request will be after userspace has had the opportunity > * to recreate its own state. 
> */ > - regs = request->hw_context->lrc_reg_state; > + regs = rq->hw_context->lrc_reg_state; > if (engine->pinned_default_state) { > memcpy(regs, /* skip restoring the vanilla PPHWSP */ > engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, > engine->context_size - PAGE_SIZE); > } > - execlists_init_reg_state(regs, > - request->gem_context, engine, request->ring); > + execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring); > > /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */ > - regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma); > - > - request->ring->head = intel_ring_wrap(request->ring, request->postfix); > - regs[CTX_RING_HEAD + 1] = request->ring->head; > + regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma); > > - intel_ring_update_space(request->ring); > + rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix); > + regs[CTX_RING_HEAD + 1] = rq->ring->head; > + intel_ring_update_space(rq->ring); > > - /* Reset WaIdleLiteRestore:bdw,skl as well */ > - unwind_wa_tail(request); > +out_unlock: > + spin_unlock_irqrestore(&engine->timeline.lock, flags); > } > > static void execlists_reset_finish(struct intel_engine_cs *engine) > @@ -1890,6 +1859,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine) > * to sleep before we restart and reload a context. > * > */ > + GEM_BUG_ON(!reset_in_progress(execlists)); > if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) > execlists->tasklet.func(execlists->tasklet.data); > > diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c > index c81db81e4416..f68c7975006c 100644 > --- a/drivers/gpu/drm/i915/intel_overlay.c > +++ b/drivers/gpu/drm/i915/intel_overlay.c > @@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv) > if (!overlay) > return; > > - intel_overlay_release_old_vid(overlay); > - How to compensate for this? 
> overlay->old_xscale = 0; > overlay->old_yscale = 0; > overlay->crtc = NULL; > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 26b7274a2d43..662907e1a286 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -33,6 +33,7 @@ > > #include "i915_drv.h" > #include "i915_gem_render_state.h" > +#include "i915_reset.h" > #include "i915_trace.h" > #include "intel_drv.h" > #include "intel_workarounds.h" > @@ -707,52 +708,80 @@ static int init_ring_common(struct intel_engine_cs *engine) > return ret; > } > > -static struct i915_request *reset_prepare(struct intel_engine_cs *engine) > +static void reset_prepare(struct intel_engine_cs *engine) > { > intel_engine_stop_cs(engine); > - return i915_gem_find_active_request(engine); > } > > -static void skip_request(struct i915_request *rq) > +static void reset_ring(struct intel_engine_cs *engine, bool stalled) > { > - void *vaddr = rq->ring->vaddr; > + struct i915_timeline *tl = &engine->timeline; > + struct i915_request *pos, *rq; > + unsigned long flags; > u32 head; > > - head = rq->infix; > - if (rq->postfix < head) { > - memset32(vaddr + head, MI_NOOP, > - (rq->ring->size - head) / sizeof(u32)); > - head = 0; > + rq = NULL; > + spin_lock_irqsave(&tl->lock, flags); > + list_for_each_entry(pos, &tl->requests, link) { > + if (!__i915_request_completed(pos, pos->global_seqno)) { > + rq = pos; > + break; > + } > } > - memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32)); > -} > - > -static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq) > -{ > - GEM_TRACE("%s request global=%d, current=%d\n", > - engine->name, rq ? rq->global_seqno : 0, > - intel_engine_get_seqno(engine)); > > + GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n", > + engine->name, > + rq ? rq->global_seqno : 0, > + intel_engine_get_seqno(engine), > + yesno(stalled)); > /* > - * Try to restore the logical GPU state to match the continuation > - * of the request queue. If we skip the context/PD restore, then > - * the next request may try to execute assuming that its context > - * is valid and loaded on the GPU and so may try to access invalid > - * memory, prompting repeated GPU hangs. > + * The guilty request will get skipped on a hung engine. > * > - * If the request was guilty, we still restore the logical state > - * in case the next request requires it (e.g. the aliasing ppgtt), > - * but skip over the hung batch. > + * Users of client default contexts do not rely on logical > + * state preserved between batches so it is safe to execute > + * queued requests following the hang. Non default contexts > + * rely on preserved state, so skipping a batch loses the > + * evolution of the state and it needs to be considered corrupted. > + * Executing more queued batches on top of corrupted state is > + * risky. But we take the risk by trying to advance through > + * the queued requests in order to make the client behaviour > + * more predictable around resets, by not throwing away random > + * amount of batches it has prepared for execution. Sophisticated > + * clients can use gem_reset_stats_ioctl and dma fence status > + * (exported via sync_file info ioctl on explicit fences) to observe > + * when it loses the context state and should rebuild accordingly. > * > - * If the request was innocent, we try to replay the request with > - * the restored context. 
> + * The context ban, and ultimately the client ban, mechanism are safety > + * valves if client submission ends up resulting in nothing more than > + * subsequent hangs. > */ > + > if (rq) { > - /* If the rq hung, jump to its breadcrumb and skip the batch */ > - rq->ring->head = intel_ring_wrap(rq->ring, rq->head); > - if (rq->fence.error == -EIO) > - skip_request(rq); > + /* > + * Try to restore the logical GPU state to match the > + * continuation of the request queue. If we skip the > + * context/PD restore, then the next request may try to execute > + * assuming that its context is valid and loaded on the GPU and > + * so may try to access invalid memory, prompting repeated GPU > + * hangs. > + * > + * If the request was guilty, we still restore the logical > + * state in case the next request requires it (e.g. the > + * aliasing ppgtt), but skip over the hung batch. > + * > + * If the request was innocent, we try to replay the request > + * with the restored context. > + */ > + i915_reset_request(rq, stalled); > + > + GEM_BUG_ON(rq->ring != engine->buffer); > + head = rq->head; > + } else { > + head = engine->buffer->tail; > } > + engine->buffer->head = intel_ring_wrap(engine->buffer, head); > + > + spin_unlock_irqrestore(&tl->lock, flags); > } > > static void reset_finish(struct intel_engine_cs *engine) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index c3ef0f9bf321..32ed44196c1a 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -120,13 +120,8 @@ struct intel_instdone { > struct intel_engine_hangcheck { > u64 acthd; > u32 seqno; > - enum intel_engine_hangcheck_action action; > unsigned long action_timestamp; > - int deadlock; > struct intel_instdone instdone; > - struct i915_request *active_request; > - bool stalled:1; > - bool wedged:1; > }; > > struct intel_ring { > @@ -444,9 +439,8 @@ struct intel_engine_cs { > int (*init_hw)(struct intel_engine_cs *engine); > > struct { > - struct i915_request *(*prepare)(struct intel_engine_cs *engine); > - void (*reset)(struct intel_engine_cs *engine, > - struct i915_request *rq); > + void (*prepare)(struct intel_engine_cs *engine); > + void (*reset)(struct intel_engine_cs *engine, bool stalled); > void (*finish)(struct intel_engine_cs *engine); > } reset; > > @@ -1018,6 +1012,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset) > return cs; > } > > +static inline void intel_engine_reset(struct intel_engine_cs *engine, > + bool stalled) > +{ > + if (engine->reset.reset) > + engine->reset.reset(engine, stalled); > +} > + > void intel_engines_sanitize(struct drm_i915_private *i915, bool force); > > bool intel_engine_is_idle(struct intel_engine_cs *engine); > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index 12550b55c42f..67431355cd6e 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -363,9 +363,7 @@ static int igt_global_reset(void *arg) > /* Check that we can issue a global GPU reset */ > > igt_global_reset_lock(i915); > - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags); > > - mutex_lock(&i915->drm.struct_mutex); > reset_count = i915_reset_count(&i915->gpu_error); > > i915_reset(i915, ALL_ENGINES, NULL); > @@ -374,9 +372,7 @@ static int igt_global_reset(void *arg) > pr_err("No GPU reset recorded!\n"); > err = -EINVAL; > } > - mutex_unlock(&i915->drm.struct_mutex); > > - 
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); > igt_global_reset_unlock(i915); > > if (i915_terminally_wedged(&i915->gpu_error)) > @@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg) > i915_gem_set_wedged(i915); > GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error)); > > - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags); > i915_reset(i915, ALL_ENGINES, NULL); > - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); > > intel_runtime_pm_put(i915, wakeref); > mutex_unlock(&i915->drm.struct_mutex); > @@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) > break; > } > > - if (!wait_for_idle(engine)) { > + if (!i915_reset_flush(i915)) { > struct drm_printer p = > drm_info_printer(i915->drm.dev); > > @@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg) > return 0; > } > > -static u32 fake_hangcheck(struct i915_request *rq, u32 mask) > +static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask) > { > - struct i915_gpu_error *error = &rq->i915->gpu_error; > - u32 reset_count = i915_reset_count(error); > - > - error->stalled_mask = mask; > - > - /* set_bit() must be after we have setup the backchannel (mask) */ > - smp_mb__before_atomic(); > - set_bit(I915_RESET_HANDOFF, &error->flags); > + u32 count = i915_reset_count(&i915->gpu_error); > > - wake_up_all(&error->wait_queue); > + i915_reset(i915, mask, NULL); > > - return reset_count; > + return count; > } > > static int igt_reset_wait(void *arg) > @@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg) > goto out_rq; > } > > - reset_count = fake_hangcheck(rq, ALL_ENGINES); > + reset_count = fake_hangcheck(i915, ALL_ENGINES); > > timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10); > if (timeout < 0) { > @@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg) > goto out_rq; > } > > - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); > if (i915_reset_count(&i915->gpu_error) == reset_count) { > pr_err("No GPU reset recorded!\n"); > err = -EINVAL; > @@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915, > } > > out_reset: > - fake_hangcheck(rq, intel_engine_flag(rq->engine)); > + fake_hangcheck(rq->i915, intel_engine_flag(rq->engine)); > > if (tsk) { > struct igt_wedge_me w; > @@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg) > goto fini; > } > > - reset_count = fake_hangcheck(prev, ENGINE_MASK(id)); > - > - i915_reset(i915, ENGINE_MASK(id), NULL); > - > - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, > - &i915->gpu_error.flags)); > + reset_count = fake_hangcheck(i915, ENGINE_MASK(id)); > > if (prev->fence.error != -EIO) { > pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", > @@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine, > pr_err("%s(%s): Failed to start request %llx, at %x\n", > __func__, engine->name, > rq->fence.seqno, hws_seqno(&h, rq)); > + i915_gem_set_wedged(i915); > err = -EIO; > } > > @@ -1588,7 +1570,6 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine, > static void force_reset(struct drm_i915_private *i915) > { > i915_gem_set_wedged(i915); > - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags); > i915_reset(i915, 0, NULL); > } > > @@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg) > if (i915_terminally_wedged(&i915->gpu_error)) > goto unlock; > > + if (intel_has_gpu_reset(i915)) { > + const typeof(*phases) *p; > + > + for (p = phases; p->name; p++) { > + GEM_TRACE("intel_gpu_reset under 
%s\n", p->name); > + > + p->critical_section_begin(); > + err = intel_gpu_reset(i915, ALL_ENGINES); > + p->critical_section_end(); > + > + if (err) { > + pr_err("intel_gpu_reset failed under %s\n", > + p->name); > + goto out; > + } > + } > + > + force_reset(i915); > + } > + > if (intel_has_reset_engine(i915)) { > struct intel_engine_cs *engine; > enum intel_engine_id id; > diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c > index a8cac56be835..b15c4f26c593 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c > +++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c > @@ -214,7 +214,6 @@ static int check_whitelist(struct i915_gem_context *ctx, > > static int do_device_reset(struct intel_engine_cs *engine) > { > - set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags); > i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds"); > return 0; > } > @@ -394,7 +393,6 @@ static int > live_gpu_reset_gt_engine_workarounds(void *arg) > { > struct drm_i915_private *i915 = arg; > - struct i915_gpu_error *error = &i915->gpu_error; > intel_wakeref_t wakeref; > struct wa_lists lists; > bool ok; > @@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg) > if (!ok) > goto out; > > - set_bit(I915_RESET_HANDOFF, &error->flags); > i915_reset(i915, ALL_ENGINES, "live_workarounds"); > > ok = verify_gt_engine_wa(i915, &lists, "after reset"); > diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c > index 5477ad4a7e7d..8ab5a2688a0c 100644 > --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c > +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c > @@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev) > i915_gem_contexts_lost(i915); > mutex_unlock(&i915->drm.struct_mutex); > > - cancel_delayed_work_sync(&i915->gt.retire_work); > - cancel_delayed_work_sync(&i915->gt.idle_work); > + drain_delayed_work(&i915->gt.retire_work); > + drain_delayed_work(&i915->gt.idle_work); > i915_gem_drain_workqueue(i915); > > mutex_lock(&i915->drm.struct_mutex); > -- > 2.20.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx
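One more note on the igt_atomic_reset() change: for anyone reading the hunk without the file open, the pre-existing phases[] table being iterated is, as far as I remember, along these lines (struct layout and symbol names quoted from memory, not from this patch, so take them as approximate):

	/* Wrappers are needed because preempt_disable()/local_irq_disable()
	 * and friends are not addressable functions.
	 */
	static void __preempt_begin(void) { preempt_disable(); }
	static void __preempt_end(void)   { preempt_enable(); }
	static void __softirq_begin(void) { local_bh_disable(); }
	static void __softirq_end(void)   { local_bh_enable(); }
	static void __hardirq_begin(void) { local_irq_disable(); }
	static void __hardirq_end(void)   { local_irq_enable(); }

	static const struct {
		const char *name;
		void (*critical_section_begin)(void);
		void (*critical_section_end)(void);
	} phases[] = {
		{ "preempt", __preempt_begin, __preempt_end },
		{ "softirq", __softirq_begin, __softirq_end },
		{ "hardirq", __hardirq_begin, __hardirq_end },
		{ }
	};

Exercising intel_gpu_reset() with preemption, softirqs and interrupts disabled in turn is a good sanity check that the reset has no sleeping dependencies left now that struct_mutex is out of the picture, so the extra loop looks like a welcome addition.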