Re: [PATCH v7 3/5] drm/nouveau: Fix deadlock with fb_helper with async RPM requests

Daniel Vetter <daniel@xxxxxxxx> · Tue, 14 Aug 2018 10:53:32 +0200

On Mon, Aug 13, 2018 at 03:16:43PM -0400, Lyude Paul wrote:
> I'm sure I don't need to tell you that fb_helper's locking is a mess.

Uh, loaded statement :-) Especially since we've just reworked all the
fbdev locking to untangle it from drm_modeset locking. What's still a mess
about it? It's documented, and very clearly wrapped around all the
drm_modeset locks. If there's gaps, please bring them up.

> That being said; fb_helper's locking mess can seriously complicate the
> runtime suspend/resume operations of drivers because it can invoke
> atomic commits and connector probing from anywhere that calls
> drm_fb_helper_hotplug_event(). Since most drivers use
> drm_fb_helper_output_poll_changed() as their output_poll_changed
> handler, this can happen in every single context that can fire off a
> hotplug event. An example:
> 
> [  246.669625] INFO: task kworker/4:0:37 blocked for more than 120 seconds.
> [  246.673398]       Not tainted 4.18.0-rc5Lyude-Test+ #2
> [  246.675271] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  246.676527] kworker/4:0     D    0    37      2 0x80000000
> [  246.677580] Workqueue: events output_poll_execute [drm_kms_helper]
> [  246.678704] Call Trace:
> [  246.679753]  __schedule+0x322/0xaf0
> [  246.680916]  schedule+0x33/0x90
> [  246.681924]  schedule_preempt_disabled+0x15/0x20
> [  246.683023]  __mutex_lock+0x569/0x9a0
> [  246.684035]  ? kobject_uevent_env+0x117/0x7b0
> [  246.685132]  ? drm_fb_helper_hotplug_event.part.28+0x20/0xb0 [drm_kms_helper]
> [  246.686179]  mutex_lock_nested+0x1b/0x20
> [  246.687278]  ? mutex_lock_nested+0x1b/0x20
> [  246.688307]  drm_fb_helper_hotplug_event.part.28+0x20/0xb0 [drm_kms_helper]
> [  246.689420]  drm_fb_helper_output_poll_changed+0x23/0x30 [drm_kms_helper]
> [  246.690462]  drm_kms_helper_hotplug_event+0x2a/0x30 [drm_kms_helper]
> [  246.691570]  output_poll_execute+0x198/0x1c0 [drm_kms_helper]
> [  246.692611]  process_one_work+0x231/0x620
> [  246.693725]  worker_thread+0x214/0x3a0
> [  246.694756]  kthread+0x12b/0x150
> [  246.695856]  ? wq_pool_ids_show+0x140/0x140
> [  246.696888]  ? kthread_create_worker_on_cpu+0x70/0x70
> [  246.697998]  ret_from_fork+0x3a/0x50
> [  246.699034] INFO: task kworker/0:1:60 blocked for more than 120 seconds.
> [  246.700153]       Not tainted 4.18.0-rc5Lyude-Test+ #2
> [  246.701182] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  246.702278] kworker/0:1     D    0    60      2 0x80000000
> [  246.703293] Workqueue: pm pm_runtime_work
> [  246.704393] Call Trace:
> [  246.705403]  __schedule+0x322/0xaf0
> [  246.706439]  ? wait_for_completion+0x104/0x190
> [  246.707393]  schedule+0x33/0x90
> [  246.708375]  schedule_timeout+0x3a5/0x590
> [  246.709289]  ? mark_held_locks+0x58/0x80
> [  246.710208]  ? _raw_spin_unlock_irq+0x2c/0x40
> [  246.711222]  ? wait_for_completion+0x104/0x190
> [  246.712134]  ? trace_hardirqs_on_caller+0xf4/0x190
> [  246.713094]  ? wait_for_completion+0x104/0x190
> [  246.713964]  wait_for_completion+0x12c/0x190
> [  246.714895]  ? wake_up_q+0x80/0x80
> [  246.715727]  ? get_work_pool+0x90/0x90
> [  246.716649]  flush_work+0x1c9/0x280
> [  246.717483]  ? flush_workqueue_prep_pwqs+0x1b0/0x1b0
> [  246.718442]  __cancel_work_timer+0x146/0x1d0
> [  246.719247]  cancel_delayed_work_sync+0x13/0x20
> [  246.720043]  drm_kms_helper_poll_disable+0x1f/0x30 [drm_kms_helper]
> [  246.721123]  nouveau_pmops_runtime_suspend+0x3d/0xb0 [nouveau]
> [  246.721897]  pci_pm_runtime_suspend+0x6b/0x190
> [  246.722825]  ? pci_has_legacy_pm_support+0x70/0x70
> [  246.723737]  __rpm_callback+0x7a/0x1d0
> [  246.724721]  ? pci_has_legacy_pm_support+0x70/0x70
> [  246.725607]  rpm_callback+0x24/0x80
> [  246.726553]  ? pci_has_legacy_pm_support+0x70/0x70
> [  246.727376]  rpm_suspend+0x142/0x6b0
> [  246.728185]  pm_runtime_work+0x97/0xc0
> [  246.728938]  process_one_work+0x231/0x620
> [  246.729796]  worker_thread+0x44/0x3a0
> [  246.730614]  kthread+0x12b/0x150
> [  246.731395]  ? wq_pool_ids_show+0x140/0x140
> [  246.732202]  ? kthread_create_worker_on_cpu+0x70/0x70
> [  246.732878]  ret_from_fork+0x3a/0x50
> [  246.733768] INFO: task kworker/4:2:422 blocked for more than 120 seconds.
> [  246.734587]       Not tainted 4.18.0-rc5Lyude-Test+ #2
> [  246.735393] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  246.736113] kworker/4:2     D    0   422      2 0x80000080
> [  246.736789] Workqueue: events_long drm_dp_mst_link_probe_work [drm_kms_helper]
> [  246.737665] Call Trace:
> [  246.738490]  __schedule+0x322/0xaf0
> [  246.739250]  schedule+0x33/0x90
> [  246.739908]  rpm_resume+0x19c/0x850
> [  246.740750]  ? finish_wait+0x90/0x90
> [  246.741541]  __pm_runtime_resume+0x4e/0x90
> [  246.742370]  nv50_disp_atomic_commit+0x31/0x210 [nouveau]
> [  246.743124]  drm_atomic_commit+0x4a/0x50 [drm]
> [  246.743775]  restore_fbdev_mode_atomic+0x1c8/0x240 [drm_kms_helper]
> [  246.744603]  restore_fbdev_mode+0x31/0x140 [drm_kms_helper]
> [  246.745373]  drm_fb_helper_restore_fbdev_mode_unlocked+0x54/0xb0 [drm_kms_helper]
> [  246.746220]  drm_fb_helper_set_par+0x2d/0x50 [drm_kms_helper]
> [  246.746884]  drm_fb_helper_hotplug_event.part.28+0x96/0xb0 [drm_kms_helper]
> [  246.747675]  drm_fb_helper_output_poll_changed+0x23/0x30 [drm_kms_helper]
> [  246.748544]  drm_kms_helper_hotplug_event+0x2a/0x30 [drm_kms_helper]
> [  246.749439]  nv50_mstm_hotplug+0x15/0x20 [nouveau]
> [  246.750111]  drm_dp_send_link_address+0x177/0x1c0 [drm_kms_helper]
> [  246.750764]  drm_dp_check_and_send_link_address+0xa8/0xd0 [drm_kms_helper]
> [  246.751602]  drm_dp_mst_link_probe_work+0x51/0x90 [drm_kms_helper]
> [  246.752314]  process_one_work+0x231/0x620
> [  246.752979]  worker_thread+0x44/0x3a0
> [  246.753838]  kthread+0x12b/0x150
> [  246.754619]  ? wq_pool_ids_show+0x140/0x140
> [  246.755386]  ? kthread_create_worker_on_cpu+0x70/0x70
> [  246.756162]  ret_from_fork+0x3a/0x50
> [  246.756847]
>            Showing all locks held in the system:
> [  246.758261] 3 locks held by kworker/4:0/37:
> [  246.759016]  #0: 00000000f8df4d2d ((wq_completion)"events"){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.759856]  #1: 00000000e6065461 ((work_completion)(&(&dev->mode_config.output_poll_work)->work)){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.760670]  #2: 00000000cb66735f (&helper->lock){+.+.}, at: drm_fb_helper_hotplug_event.part.28+0x20/0xb0 [drm_kms_helper]
> [  246.761516] 2 locks held by kworker/0:1/60:
> [  246.762274]  #0: 00000000fff6be0f ((wq_completion)"pm"){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.762982]  #1: 000000005ab44fb4 ((work_completion)(&dev->power.work)){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.763890] 1 lock held by khungtaskd/64:
> [  246.764664]  #0: 000000008cb8b5c3 (rcu_read_lock){....}, at: debug_show_all_locks+0x23/0x185
> [  246.765588] 5 locks held by kworker/4:2/422:
> [  246.766440]  #0: 00000000232f0959 ((wq_completion)"events_long"){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.767390]  #1: 00000000bb59b134 ((work_completion)(&mgr->work)){+.+.}, at: process_one_work+0x1b3/0x620
> [  246.768154]  #2: 00000000cb66735f (&helper->lock){+.+.}, at: drm_fb_helper_restore_fbdev_mode_unlocked+0x4c/0xb0 [drm_kms_helper]
> [  246.768966]  #3: 000000004c8f0b6b (crtc_ww_class_acquire){+.+.}, at: restore_fbdev_mode_atomic+0x4b/0x240 [drm_kms_helper]
> [  246.769921]  #4: 000000004c34a296 (crtc_ww_class_mutex){+.+.}, at: drm_modeset_backoff+0x8a/0x1b0 [drm]
> [  246.770839] 1 lock held by dmesg/1038:
> [  246.771739] 2 locks held by zsh/1172:
> [  246.772650]  #0: 00000000836d0438 (&tty->ldisc_sem){++++}, at: ldsem_down_read+0x37/0x40
> [  246.773680]  #1: 000000001f4f4d48 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0xc1/0x870
> 
> [  246.775522] =============================================
> 
> After trying dozens of different solutions, I found one very simple one
> that should also have the benefit of preventing us from having to fight
> locking for the rest of our lives. So, we work around these deadlocks by
> deferring all fbcon hotplug events that happen after the runtime suspend
> process starts until after the device is resumed again.
> 
> Changes since v6:
>  - Remove unused nouveau_fbcon_hotplugged_in_suspend() - Ilia
> 
> Changes since v5:
>  - Come up with the (hopefully final) solution for solving this dumb
>    problem, one that is a lot less likely to cause issues with locking in
>    the future. This should work around all deadlock conditions with fbcon
>    brought up thus far.
> 
> Changes since v4:
>  - Add nouveau_fbcon_hotplugged_in_suspend() to workaround deadlock
>    condition that Lukas described
>  - Just move all of this out of drm_fb_helper. It seems that other DRM
>    drivers have already figured out other workarounds for this. If other
>    drivers do end up needing this in the future, we can just move this
>    back into drm_fb_helper again.
> 
> Changes since v3:
> - Actually check if fb_helper is NULL in both new helpers
> - Actually check drm_fbdev_emulation in both new helpers
> - Don't fire off a fb_helper hotplug unconditionally; only do it if
>   the following conditions are true (as otherwise, calling this in the
>   wrong spot will cause Bad Things to happen):
>   - fb_helper hotplug handling was actually inhibited previously
>   - fb_helper actually has a delayed hotplug pending
>   - fb_helper is actually bound
>   - fb_helper is actually initialized
> - Add __must_check to drm_fb_helper_suspend_hotplug(). There's no
>   situation where a driver would actually want to use this without
>   checking the return value, so enforce that
> - Rewrite and clarify the documentation for both helpers.
> - Make sure to return true in the drm_fb_helper_suspend_hotplug() stub
>   that's provided in drm_fb_helper.h when CONFIG_DRM_FBDEV_EMULATION
>   isn't enabled
> - Actually grab the toplevel fb_helper lock in
>   drm_fb_helper_resume_hotplug(), since it's possible other activity
>   (such as a hotplug) could be going on at the same time the driver
>   calls drm_fb_helper_resume_hotplug(). We need this to check whether or
>   not drm_fb_helper_hotplug_event() needs to be called anyway
> 
> Signed-off-by: Lyude Paul <lyude@xxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> Cc: Lukas Wunner <lukas@xxxxxxxxx>
> Cc: Karol Herbst <karolherbst@xxxxxxxxx>
> ---
>  drivers/gpu/drm/nouveau/dispnv50/disp.c   |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_display.c |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_fbcon.c   | 57 +++++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_fbcon.h   |  5 ++
>  4 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
> index 8b522a9b12f6..a0772389ed90 100644
> --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
> +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
> @@ -2049,7 +2049,7 @@ nv50_disp_atomic_state_alloc(struct drm_device *dev)
>  static const struct drm_mode_config_funcs
>  nv50_disp_func = {
>  	.fb_create = nouveau_user_framebuffer_create,
> -	.output_poll_changed = drm_fb_helper_output_poll_changed,
> +	.output_poll_changed = nouveau_fbcon_output_poll_changed,
>  	.atomic_check = nv50_disp_atomic_check,
>  	.atomic_commit = nv50_disp_atomic_commit,
>  	.atomic_state_alloc = nv50_disp_atomic_state_alloc,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c
> index 1d36ab5d4796..4b873e668b26 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_display.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_display.c
> @@ -293,7 +293,7 @@ nouveau_user_framebuffer_create(struct drm_device *dev,
>  
>  static const struct drm_mode_config_funcs nouveau_mode_config_funcs = {
>  	.fb_create = nouveau_user_framebuffer_create,
> -	.output_poll_changed = drm_fb_helper_output_poll_changed,
> +	.output_poll_changed = nouveau_fbcon_output_poll_changed,
>  };
>  
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> index 85c1f10bc2b6..8cf966690963 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> @@ -466,6 +466,7 @@ nouveau_fbcon_set_suspend_work(struct work_struct *work)
>  	console_unlock();
>  
>  	if (state == FBINFO_STATE_RUNNING) {
> +		nouveau_fbcon_hotplug_resume(drm->fbcon);
>  		pm_runtime_mark_last_busy(drm->dev->dev);
>  		pm_runtime_put_sync(drm->dev->dev);
>  	}
> @@ -487,6 +488,61 @@ nouveau_fbcon_set_suspend(struct drm_device *dev, int state)
>  	schedule_work(&drm->fbcon_work);
>  }
>  
> +void
> +nouveau_fbcon_output_poll_changed(struct drm_device *dev)
> +{
> +	struct nouveau_drm *drm = nouveau_drm(dev);
> +	struct nouveau_fbdev *fbcon = drm->fbcon;
> +	int ret;
> +
> +	if (!fbcon)
> +		return;
> +
> +	mutex_lock(&fbcon->hotplug_lock);
> +
> +	ret = pm_runtime_get(dev->dev);
> +	if (ret == 1 || ret == -EACCES) {
> +		drm_fb_helper_hotplug_event(&fbcon->helper);
> +
> +		pm_runtime_mark_last_busy(dev->dev);
> +		pm_runtime_put_autosuspend(dev->dev);
> +	} else if (ret == 0) {
> +		/* If the GPU was already in the process of suspending before
> +		 * this event happened, then we can't block here as we'll
> +		 * deadlock the runtime pmops since they wait for us to
> +		 * finish. So, just defer this event for when we runtime
> +		 * resume again. It will be handled by fbcon_work.
> +		 */

Hm, I thought pm_runtime_get() never fails, and you need one of the other
variants if you want special behaviour here? Like
pm_runtime_get_noresume().

Otherwise lgtm.
-Daniel

> +		NV_DEBUG(drm, "fbcon HPD event deferred until runtime resume\n");
> +		fbcon->hotplug_waiting = true;
> +		pm_runtime_put_noidle(drm->dev->dev);
> +	} else {
> +		DRM_WARN("fbcon HPD event lost due to RPM failure: %d\n",
> +			 ret);
> +	}
> +
> +	mutex_unlock(&fbcon->hotplug_lock);
> +}
> +
> +void
> +nouveau_fbcon_hotplug_resume(struct nouveau_fbdev *fbcon)
> +{
> +	struct nouveau_drm *drm;
> +
> +	if (!fbcon)
> +		return;
> +	drm = nouveau_drm(fbcon->helper.dev);
> +
> +	mutex_lock(&fbcon->hotplug_lock);
> +	if (fbcon->hotplug_waiting) {
> +		fbcon->hotplug_waiting = false;
> +
> +		NV_DEBUG(drm, "Handling deferred fbcon HPD events\n");
> +		drm_fb_helper_hotplug_event(&fbcon->helper);
> +	}
> +	mutex_unlock(&fbcon->hotplug_lock);
> +}
> +
>  int
>  nouveau_fbcon_init(struct drm_device *dev)
>  {
> @@ -505,6 +561,7 @@ nouveau_fbcon_init(struct drm_device *dev)
>  
>  	drm->fbcon = fbcon;
>  	INIT_WORK(&drm->fbcon_work, nouveau_fbcon_set_suspend_work);
> +	mutex_init(&fbcon->hotplug_lock);
>  
>  	drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.h b/drivers/gpu/drm/nouveau/nouveau_fbcon.h
> index a6f192ea3fa6..db9d52047ef8 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.h
> @@ -41,6 +41,9 @@ struct nouveau_fbdev {
>  	struct nvif_object gdi;
>  	struct nvif_object blit;
>  	struct nvif_object twod;
> +
> +	struct mutex hotplug_lock;
> +	bool hotplug_waiting;
>  };
>  
>  void nouveau_fbcon_restore(void);
> @@ -68,6 +71,8 @@ void nouveau_fbcon_set_suspend(struct drm_device *dev, int state);
>  void nouveau_fbcon_accel_save_disable(struct drm_device *dev);
>  void nouveau_fbcon_accel_restore(struct drm_device *dev);
>  
> +void nouveau_fbcon_output_poll_changed(struct drm_device *dev);
> +void nouveau_fbcon_hotplug_resume(struct nouveau_fbdev *fbcon);
>  extern int nouveau_nofbaccel;
>  
>  #endif /* __NV50_FBCON_H__ */
> -- 
> 2.17.1
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch