Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > We call i915_gem_reset_prepare_engine() during reset and then upon > wedging if the reset fails. Unfortunately, kthread_park and similar do > not support being called recursively and so we must count the number of > times we prepare for reset and only actually prepare on the outermost > layer. (Similarly for finish on unwinding the onion.) > > [ 87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 kthread_park+0x55/0x60 > [ 87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers mei lpc_ich > [ 87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G U 4.17.0-rc5-CI-CI_DRM_4177+ #1 > [ 87.705620] Hardware name: Dell Inc. XPS 8300 /0Y2MRG, BIOS A06 10/17/2011 > [ 87.705622] RIP: 0010:kthread_park+0x55/0x60 > [ 87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202 > [ 87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 0000000000000001 > [ 87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: ffff88020e8f0040 > [ 87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 0000000000000000 > [ 87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8802159376b0 > [ 87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: ffffffffa01c8d60 > [ 87.705636] FS: 00007f0c32061980(0000) GS:ffff88022fa80000(0000) knlGS:0000000000000000 > [ 87.705637] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 00000000000606e0 > [ 87.705641] Call Trace: > [ 87.705668] i915_gem_reset_prepare_engine+0x1d/0xa0 [i915] > [ 87.705694] i915_gem_set_wedged+0x7b/0x1e0 [i915] > [ 87.705699] ? __drm_printfn_info+0x20/0x20 > [ 87.705722] i915_reset+0x14a/0x290 [i915] > [ 87.705743] i915_reset_device+0x1fb/0x290 [i915] > [ 87.705767] ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915] > [ 87.705772] ? work_on_cpu_safe+0x50/0x50 > [ 87.705798] i915_handle_error+0x207/0x4a0 [i915] > [ 87.705810] ? __might_fault+0x39/0x90 > [ 87.705835] i915_wedged_set+0x7f/0xc0 [i915] > [ 87.705841] simple_attr_write+0xb0/0xd0 > [ 87.705847] full_proxy_write+0x51/0x80 > [ 87.705852] __vfs_write+0x31/0x160 > [ 87.705857] ? rcu_read_lock_sched_held+0x6f/0x80 > [ 87.705860] ? rcu_sync_lockdep_assert+0x29/0x50 > [ 87.705862] ? __sb_start_write+0x152/0x1f0 > [ 87.705864] ? __sb_start_write+0x168/0x1f0 > [ 87.705868] vfs_write+0xbd/0x1a0 > [ 87.705872] ksys_write+0x50/0xc0 > [ 87.705877] do_syscall_64+0x55/0x190 > [ 87.705880] entry_SYSCALL_64_after_hwframe+0x49/0xbe > [ 87.705882] RIP: 0033:0x7f0c315df281 > [ 87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > [ 87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0c315df281 > [ 87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 0000000000000047 > [ 87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 0000000000000034 > [ 87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 000055a5e23ebc50 > [ 87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 0000000000000000 > [ 87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff > > References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem.c | 52 ++++++++++++++++++------- > drivers/gpu/drm/i915/intel_engine_cs.c | 1 + > drivers/gpu/drm/i915/intel_ringbuffer.h | 3 ++ > 3 files changed, 43 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 0a2070112b66..b169b630bf78 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs *engine) > return active; > } > > -/* > - * Ensure irq handler finishes, and not run again. > - * Also return the active request so that we only search for it once. > - */ > -struct i915_request * > -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) > +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) > { > - struct i915_request *request = NULL; > + mutex_lock(&engine->reset_lock); > + if (engine->reset_depth++) > + goto unlock; > > /* > * During the reset sequence, we must prevent the engine from > @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) > if (engine->i915->guc.preempt_wq) > flush_workqueue(engine->i915->guc.preempt_wq); > > +unlock: > + mutex_unlock(&engine->reset_lock); > +} > + > +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine) > +{ > + mutex_lock(&engine->reset_lock); > + > + GEM_BUG_ON(engine->reset_depth); !engine->reset_depth -Mika > + if (--engine->reset_depth) > + goto unlock; > + > + tasklet_enable(&engine->execlists.tasklet); > + kthread_unpark(engine->breadcrumbs.signaler); > + > + intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); > + > +unlock: > + mutex_unlock(&engine->reset_lock); > +} > + > +/* > + * Ensure irq handler finishes, and not run again. > + * Also return the active request so that we only search for it once. > + */ > +struct i915_request * > +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) > +{ > + struct i915_request *request = NULL; > + > + __i915_gem_reset_prepare_engine(engine); > + > if (engine->irq_seqno_barrier) > engine->irq_seqno_barrier(engine); > > @@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv, > > void i915_gem_reset_finish_engine(struct intel_engine_cs *engine) > { > - tasklet_enable(&engine->execlists.tasklet); > - kthread_unpark(engine->breadcrumbs.signaler); > - > - intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); > + __i915_gem_reset_finish_engine(engine); > } > > void i915_gem_reset_finish(struct drm_i915_private *dev_priv) > @@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) > * for which we haven't set the fence error to EIO yet). > */ > for_each_engine(engine, i915, id) { > - i915_gem_reset_prepare_engine(engine); > + __i915_gem_reset_prepare_engine(engine); > > engine->submit_request = nop_submit_request; > engine->schedule = NULL; > @@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) > intel_engine_last_submit(engine)); > spin_unlock_irqrestore(&engine->timeline.lock, flags); > > - i915_gem_reset_finish_engine(engine); > + __i915_gem_reset_finish_engine(engine); > } > > GEM_TRACE("end\n"); > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c > index 6bfd7e3ed152..0de489da514e 100644 > --- a/drivers/gpu/drm/i915/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c > @@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv, > /* Nothing to do here, execute in order of dependencies */ > engine->schedule = NULL; > > + mutex_init(&engine->reset_lock); > seqlock_init(&engine->stats.lock); > > ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 010750e8ee44..ec2b359f3e8b 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -566,6 +566,9 @@ struct intel_engine_cs { > > struct intel_engine_hangcheck hangcheck; > > + struct mutex reset_lock; > + unsigned int reset_depth; > + > #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) > #define I915_ENGINE_SUPPORTS_STATS BIT(1) > #define I915_ENGINE_HAS_PREEMPTION BIT(2) > -- > 2.17.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx