We call i915_gem_reset_prepare_engine() during reset and then upon wedging if the reset fails. Unfortunately, kthread_park and similar do not support being called recursively and so we must count the number of times we prepare for reset and only actually prepare on the outermost layer. (Similarly for finish on unwinding the onion.) [ 87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 kthread_park+0x55/0x60 [ 87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers mei lpc_ich [ 87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G U 4.17.0-rc5-CI-CI_DRM_4177+ #1 [ 87.705620] Hardware name: Dell Inc. XPS 8300 /0Y2MRG, BIOS A06 10/17/2011 [ 87.705622] RIP: 0010:kthread_park+0x55/0x60 [ 87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202 [ 87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 0000000000000001 [ 87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: ffff88020e8f0040 [ 87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 0000000000000000 [ 87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8802159376b0 [ 87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: ffffffffa01c8d60 [ 87.705636] FS: 00007f0c32061980(0000) GS:ffff88022fa80000(0000) knlGS:0000000000000000 [ 87.705637] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 00000000000606e0 [ 87.705641] Call Trace: [ 87.705668] i915_gem_reset_prepare_engine+0x1d/0xa0 [i915] [ 87.705694] i915_gem_set_wedged+0x7b/0x1e0 [i915] [ 87.705699] ? __drm_printfn_info+0x20/0x20 [ 87.705722] i915_reset+0x14a/0x290 [i915] [ 87.705743] i915_reset_device+0x1fb/0x290 [i915] [ 87.705767] ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915] [ 87.705772] ? work_on_cpu_safe+0x50/0x50 [ 87.705798] i915_handle_error+0x207/0x4a0 [i915] [ 87.705810] ? __might_fault+0x39/0x90 [ 87.705835] i915_wedged_set+0x7f/0xc0 [i915] [ 87.705841] simple_attr_write+0xb0/0xd0 [ 87.705847] full_proxy_write+0x51/0x80 [ 87.705852] __vfs_write+0x31/0x160 [ 87.705857] ? rcu_read_lock_sched_held+0x6f/0x80 [ 87.705860] ? rcu_sync_lockdep_assert+0x29/0x50 [ 87.705862] ? __sb_start_write+0x152/0x1f0 [ 87.705864] ? __sb_start_write+0x168/0x1f0 [ 87.705868] vfs_write+0xbd/0x1a0 [ 87.705872] ksys_write+0x50/0xc0 [ 87.705877] do_syscall_64+0x55/0x190 [ 87.705880] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 87.705882] RIP: 0033:0x7f0c315df281 [ 87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0c315df281 [ 87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 0000000000000047 [ 87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 0000000000000034 [ 87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 000055a5e23ebc50 [ 87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 0000000000000000 [ 87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem.c | 52 ++++++++++++++++++------- drivers/gpu/drm/i915/intel_engine_cs.c | 1 + drivers/gpu/drm/i915/intel_ringbuffer.h | 3 ++ 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 0a2070112b66..b169b630bf78 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs *engine) return active; } -/* - * Ensure irq handler finishes, and not run again. - * Also return the active request so that we only search for it once. - */ -struct i915_request * -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) { - struct i915_request *request = NULL; + mutex_lock(&engine->reset_lock); + if (engine->reset_depth++) + goto unlock; /* * During the reset sequence, we must prevent the engine from @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) if (engine->i915->guc.preempt_wq) flush_workqueue(engine->i915->guc.preempt_wq); +unlock: + mutex_unlock(&engine->reset_lock); +} + +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine) +{ + mutex_lock(&engine->reset_lock); + + GEM_BUG_ON(engine->reset_depth); + if (--engine->reset_depth) + goto unlock; + + tasklet_enable(&engine->execlists.tasklet); + kthread_unpark(engine->breadcrumbs.signaler); + + intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); + +unlock: + mutex_unlock(&engine->reset_lock); +} + +/* + * Ensure irq handler finishes, and not run again. + * Also return the active request so that we only search for it once. + */ +struct i915_request * +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine) +{ + struct i915_request *request = NULL; + + __i915_gem_reset_prepare_engine(engine); + if (engine->irq_seqno_barrier) engine->irq_seqno_barrier(engine); @@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv, void i915_gem_reset_finish_engine(struct intel_engine_cs *engine) { - tasklet_enable(&engine->execlists.tasklet); - kthread_unpark(engine->breadcrumbs.signaler); - - intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); + __i915_gem_reset_finish_engine(engine); } void i915_gem_reset_finish(struct drm_i915_private *dev_priv) @@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) * for which we haven't set the fence error to EIO yet). */ for_each_engine(engine, i915, id) { - i915_gem_reset_prepare_engine(engine); + __i915_gem_reset_prepare_engine(engine); engine->submit_request = nop_submit_request; engine->schedule = NULL; @@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) intel_engine_last_submit(engine)); spin_unlock_irqrestore(&engine->timeline.lock, flags); - i915_gem_reset_finish_engine(engine); + __i915_gem_reset_finish_engine(engine); } GEM_TRACE("end\n"); diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 6bfd7e3ed152..0de489da514e 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv, /* Nothing to do here, execute in order of dependencies */ engine->schedule = NULL; + mutex_init(&engine->reset_lock); seqlock_init(&engine->stats.lock); ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 010750e8ee44..ec2b359f3e8b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -566,6 +566,9 @@ struct intel_engine_cs { struct intel_engine_hangcheck hangcheck; + struct mutex reset_lock; + unsigned int reset_depth; + #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) #define I915_ENGINE_SUPPORTS_STATS BIT(1) #define I915_ENGINE_HAS_PREEMPTION BIT(2) -- 2.17.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx