[PATCH] drm/amdgpu: Count disabled CRTCs in commit tail earlier

Andrey.Grodzovsky@xxxxxxx (Andrey Grodzovsky) · Fri, 22 Jun 2018 22:42:12 -0400

On 06/22/2018 09:03 PM, Andrey Grodzovsky wrote:
>
>
> On 06/22/2018 02:56 PM, Lyude Paul wrote:
>> On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote:
>>> On 06/21/2018 04:48 PM, Lyude Paul wrote:
>>>> This fixes a regression I accidentally reduced that was picked up by
>>>> kasan, where we were checking the CRTC atomic states after DRM's 
>>>> helpers
>>>> had already freed them. Example:
>>>>
>>>> ==================================================================
>>>> BUG: KASAN: use-after-free in
>>>> amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7
>>>>
>>>> CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G OÂ Â Â Â Â  4.18.0-
>>>> rc1Lyude-Upstream+ #1
>>>> Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018
>>>> Workqueue: events_unbound commit_work [drm_kms_helper]
>>>> Call Trace:
>>>> Â Â  dump_stack+0xc1/0x169
>>>> Â Â  ? dump_stack_print_info.cold.1+0x42/0x42
>>>> Â Â  ? kmsg_dump_rewind_nolock+0xd9/0xd9
>>>> Â Â  ? printk+0x9f/0xc5
>>>> Â Â  ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Â Â  print_address_description+0x6c/0x23c
>>>> Â Â  ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Â Â  kasan_report.cold.6+0x241/0x2fd
>>>> Â Â  amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Â Â  ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu]
>>>> Â Â  ? cpu_load_update_active+0x290/0x290
>>>> Â Â  ? finish_task_switch+0x2bd/0x840
>>>> Â Â  ? __switch_to_asm+0x34/0x70
>>>> Â Â  ? read_word_at_a_time+0xe/0x20
>>>> Â Â  ? strscpy+0x14b/0x460
>>>> Â Â  ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0 
>>>> [drm_kms_helper]
>>>> Â Â  commit_tail+0x96/0xe0 [drm_kms_helper]
>>>> Â Â  process_one_work+0x88a/0x1360
>>>> Â Â  ? create_worker+0x540/0x540
>>>> Â Â  ? __sched_text_start+0x8/0x8
>>>> Â Â  ? move_queued_task+0x760/0x760
>>>> Â Â  ? call_rcu_sched+0x20/0x20
>>>> Â Â  ? vsnprintf+0xcda/0x1350
>>>> Â Â  ? wait_woken+0x1c0/0x1c0
>>>> Â Â  ? mutex_unlock+0x1d/0x40
>>>> Â Â  ? init_timer_key+0x190/0x230
>>>> Â Â  ? schedule+0xea/0x390
>>>> Â Â  ? __schedule+0x1ea0/0x1ea0
>>>> Â Â  ? need_to_create_worker+0xe4/0x210
>>>> Â Â  ? init_worker_pool+0x700/0x700
>>>> Â Â  ? try_to_del_timer_sync+0xbf/0x110
>>>> Â Â  ? del_timer+0x120/0x120
>>>> Â Â  ? __mutex_lock_slowpath+0x10/0x10
>>>> Â Â  worker_thread+0x196/0x11f0
>>>> Â Â  ? flush_rcu_work+0x50/0x50
>>>> Â Â  ? __switch_to_asm+0x34/0x70
>>>> Â Â  ? __switch_to_asm+0x34/0x70
>>>> Â Â  ? __switch_to_asm+0x40/0x70
>>>> Â Â  ? __switch_to_asm+0x34/0x70
>>>> Â Â  ? __switch_to_asm+0x40/0x70
>>>> Â Â  ? __switch_to_asm+0x34/0x70
>>>> Â Â  ? __switch_to_asm+0x40/0x70
>>>> Â Â  ? __schedule+0x7d6/0x1ea0
>>>> Â Â  ? migrate_swap_stop+0x850/0x880
>>>> Â Â  ? __sched_text_start+0x8/0x8
>>>> Â Â  ? save_stack+0x8c/0xb0
>>>> Â Â  ? kasan_kmalloc+0xbf/0xe0
>>>> Â Â  ? kmem_cache_alloc_trace+0xe4/0x190
>>>> Â Â  ? kthread+0x98/0x390
>>>> Â Â  ? ret_from_fork+0x35/0x40
>>>> Â Â  ? ret_from_fork+0x35/0x40
>>>> Â Â  ? deactivate_slab.isra.67+0x3c4/0x5c0
>>>> Â Â  ? kthread+0x98/0x390
>>>> Â Â  ? kthread+0x98/0x390
>>>> Â Â  ? set_track+0x76/0x120
>>>> Â Â  ? schedule+0xea/0x390
>>>> Â Â  ? __schedule+0x1ea0/0x1ea0
>>>> Â Â  ? wait_woken+0x1c0/0x1c0
>>>> Â Â  ? kasan_unpoison_shadow+0x30/0x40
>>>> Â Â  ? parse_args.cold.15+0x17a/0x17a
>>>> Â Â  ? flush_rcu_work+0x50/0x50
>>>> Â Â  kthread+0x2d4/0x390
>>>> Â Â  ? kthread_create_worker_on_cpu+0xc0/0xc0
>>>> Â Â  ret_from_fork+0x35/0x40
>>>>
>>>> Allocated by task 1124:
>>>> Â Â  kasan_kmalloc+0xbf/0xe0
>>>> Â Â  kmem_cache_alloc_trace+0xe4/0x190
>>>> Â Â  dm_crtc_duplicate_state+0x78/0x130 [amdgpu]
>>>> Â Â  drm_atomic_get_crtc_state+0x147/0x410 [drm]
>>>> Â Â  page_flip_common+0x57/0x230 [drm_kms_helper]
>>>> Â Â  drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper]
>>>> Â Â  drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm]
>>>> Â Â  drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>> Â Â  drm_ioctl+0x433/0x920 [drm]
>>>> Â Â  amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>> Â Â  do_vfs_ioctl+0x1a1/0x13d0
>>>> Â Â  ksys_ioctl+0x60/0x90
>>>> Â Â  __x64_sys_ioctl+0x6f/0xb0
>>>> Â Â  do_syscall_64+0x147/0x440
>>>> Â Â  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> Freed by task 1124:
>>>> Â Â  __kasan_slab_free+0x12e/0x180
>>>> Â Â  kfree+0x92/0x1a0
>>>> Â Â  drm_atomic_state_default_clear+0x315/0xc40 [drm]
>>>> Â Â  __drm_atomic_state_free+0x35/0xd0 [drm]
>>>> Â Â  drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper]
>>>> Â Â  __setplane_internal+0x2d6/0x840 [drm]
>>>> Â Â  drm_mode_cursor_universal+0x41e/0xbe0 [drm]
>>>> Â Â  drm_mode_cursor_common+0x49f/0x880 [drm]
>>>> Â Â  drm_mode_cursor_ioctl+0xd8/0x130 [drm]
>>>> Â Â  drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>> Â Â  drm_ioctl+0x433/0x920 [drm]
>>>> Â Â  amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>> Â Â  do_vfs_ioctl+0x1a1/0x13d0
>>>> Â Â  ksys_ioctl+0x60/0x90
>>>> Â Â  __x64_sys_ioctl+0x6f/0xb0
>>>> Â Â  do_syscall_64+0x147/0x440
>>>> Â Â  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> The buggy address belongs to the object at ffff8803a697b068
>>>> Â Â  which belongs to the cache kmalloc-1024 of size 1024
>>>> The buggy address is located 9 bytes inside of
>>>> Â Â  1024-byte region [ffff8803a697b068, ffff8803a697b468)
>>>> The buggy address belongs to the page:
>>>> page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0 
>>>> index:0x0
>>>> compound_mapcount: 0
>>>> flags: 0x8000000000008100(slab|head)
>>>> raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70 
>>>> ffff88041e00efc0
>>>> raw: 0000000000000000 0000000000170017 00000001ffffffff 
>>>> 0000000000000000
>>>> page dumped because: kasan: bad access detected
>>>>
>>>> Memory state around the buggy address:
>>>> Â Â  ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>> Â Â  ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>>> ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb
>>>> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  ^
>>>> Â Â  ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>> Â Â  ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>> ==================================================================
>>>>
>>>> So, we fix this by counting the number of CRTCs this atomic commit 
>>>> disabled
>>>> early on in the function before their atomic states have been 
>>>> freed, then
>>>> use
>>>> that count later to do the appropriate number of RPM puts at the 
>>>> end of the
>>>> function.
>>> I am a bit not clear, are you saying that the problem was the 'in the
>>> middle' commit (cursor ioctl) doing
>>>
>>> drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state)
>>>
>>> where the state is the one you access from from the non blocking 
>>> part of
>>> page flip though old_crtc_state->active?
>> The problem is that (see the comment in 
>> drivers/gpu/drm/drm_atomic_helper.c:2065
>> ) it's unsafe to touch any of the old_crtc_state structures after
>> drm_atomic_helper_commit_hw_done() is called, as it's likely that 
>> they've been
>> freed already.
>
> IÂ  am not sure about that, the comment in 
> drm_atomic_helper_commit_hw_done says that
> "the driver is not allowed to read or change any permanent software
> or hardware modeset state" I interpret it as not the old_crtc_state 
> but as the new_crtc_state or crtc->state after
> drm_atomic_helper_swap_state completed.Â  It means that if you touch 
> crtc->state after drm_atomic_helper_commit_hw_done
> you actually could already be accessing a state which belong to the 
> next atomic commit after you.
> It really looks like cursor's atomic commit sneaks in in a middle of 
> page flip between the page flip IOCTL
> and it's commit_tail part and swaps away crct->state to his own new 
> state and release the 'old' state which is not really
> old yet and needs to be used by the tail part of page flip. This makes 
> sense since do_aquire_global_lock we use in amdgpu_dm_atomic_check
> to serialize against concurrent atomic_commitsÂ  is not called for case 
> of cursor plane and so it may race against any commit_tail in flight...
> Not sure why we haven't seen this problem before.
> Obviously your fix makes the problem go away since you stopped 
> accessing the new_crtc_state and not the old_crtc_state but the root 
> problem
> seems to me still there.
>
> Andrey

I took another look and actually no problem with the CURSOR IOCTL as it 
will wait in drm_atomic_helper_swap_state for hw_done event, so
I agree with the fix but just disagree with the explanation, it should 
be said that it's unsafe to touch the new_crtc_state (same as 
crtc->state) after
call to drm_atomic_helper_commit_hw_done. So I would make the 
explanation a bit more detailed on this point.

Anyway, the fix is Reviewed-by: Andrey Grodzovsky 
<andrey.grodzovsky at amd.com>

Andrey

>
>>> Andrey
>>>> Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in
>>>> atomic_commit_tail()")
>>>> Signed-off-by: Lyude Paul <lyude at redhat.com>
>>>> Cc: Michel DÃ¤nzer <michel at daenzer.net>
>>>> Reported-by: Michel DÃ¤nzer <michel at daenzer.net>
>>>> ---
>>>> Â Â  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++----
>>>> Â Â  1 file changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> index f9add85157e7..689dbdf44bbf 100644
>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> @@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> Â Â Â Â Â Â  struct drm_connector *connector;
>>>> Â Â Â Â Â Â  struct drm_connector_state *old_con_state, *new_con_state;
>>>> Â Â Â Â Â Â  struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
>>>> +Â Â Â  int crtc_disable_count = 0;
>>>> Â Â  Â Â Â Â Â Â  drm_atomic_helper_update_legacy_modeset_state(dev, state);
>>>> Â Â  @@ -4410,6 +4411,9 @@ static void 
>>>> amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> Â Â Â Â Â Â Â Â Â Â  struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
>>>> Â Â Â Â Â Â Â Â Â Â  bool modeset_needed;
>>>> Â Â  +Â Â Â Â Â Â Â  if (old_crtc_state->active && !new_crtc_state->active)
>>>> +Â Â Â Â Â Â Â Â Â Â Â  crtc_disable_count++;
>>>> +
>>>> Â Â Â Â Â Â Â Â Â Â  dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
>>>> Â Â Â Â Â Â Â Â Â Â  dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
>>>> Â Â Â Â Â Â Â Â Â Â  modeset_needed = modeset_required(
>>>> @@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> Â Â Â Â Â Â Â  * so we can put the GPU into runtime suspend if we're not 
>>>> driving
>>>> any
>>>> Â Â Â Â Â Â Â  * displays anymore
>>>> Â Â Â Â Â Â Â  */
>>>> +Â Â Â  for (i = 0; i < crtc_disable_count; i++)
>>>> +Â Â Â Â Â Â Â  pm_runtime_put_autosuspend(dev->dev);
>>>> Â Â Â Â Â Â  pm_runtime_mark_last_busy(dev->dev);
>>>> -Â Â Â  for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
>>>> new_crtc_state, i) {
>>>> -Â Â Â Â Â Â Â  if (old_crtc_state->active && !new_crtc_state->active)
>>>> -Â Â Â Â Â Â Â Â Â Â Â  pm_runtime_put_autosuspend(dev->dev);
>>>> -Â Â Â  }
>>>> Â Â  }
>>>
>