On 06/22/2018 09:03 PM, Andrey Grodzovsky wrote: > > > On 06/22/2018 02:56 PM, Lyude Paul wrote: >> On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote: >>> On 06/21/2018 04:48 PM, Lyude Paul wrote: >>>> This fixes a regression I accidentally reduced that was picked up by >>>> kasan, where we were checking the CRTC atomic states after DRM's >>>> helpers >>>> had already freed them. Example: >>>> >>>> ================================================================== >>>> BUG: KASAN: use-after-free in >>>> amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu] >>>> Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7 >>>> >>>> CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G O     4.18.0- >>>> rc1Lyude-Upstream+ #1 >>>> Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018 >>>> Workqueue: events_unbound commit_work [drm_kms_helper] >>>> Call Trace: >>>>   dump_stack+0xc1/0x169 >>>>   ? dump_stack_print_info.cold.1+0x42/0x42 >>>>   ? kmsg_dump_rewind_nolock+0xd9/0xd9 >>>>   ? printk+0x9f/0xc5 >>>>   ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu] >>>>   print_address_description+0x6c/0x23c >>>>   ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu] >>>>   kasan_report.cold.6+0x241/0x2fd >>>>   amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu] >>>>   ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu] >>>>   ? cpu_load_update_active+0x290/0x290 >>>>   ? finish_task_switch+0x2bd/0x840 >>>>   ? __switch_to_asm+0x34/0x70 >>>>   ? read_word_at_a_time+0xe/0x20 >>>>   ? strscpy+0x14b/0x460 >>>>   ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0 >>>> [drm_kms_helper] >>>>   commit_tail+0x96/0xe0 [drm_kms_helper] >>>>   process_one_work+0x88a/0x1360 >>>>   ? create_worker+0x540/0x540 >>>>   ? __sched_text_start+0x8/0x8 >>>>   ? move_queued_task+0x760/0x760 >>>>   ? call_rcu_sched+0x20/0x20 >>>>   ? vsnprintf+0xcda/0x1350 >>>>   ? wait_woken+0x1c0/0x1c0 >>>>   ? mutex_unlock+0x1d/0x40 >>>>   ? init_timer_key+0x190/0x230 >>>>   ? schedule+0xea/0x390 >>>>   ? __schedule+0x1ea0/0x1ea0 >>>>   ? need_to_create_worker+0xe4/0x210 >>>>   ? init_worker_pool+0x700/0x700 >>>>   ? try_to_del_timer_sync+0xbf/0x110 >>>>   ? del_timer+0x120/0x120 >>>>   ? __mutex_lock_slowpath+0x10/0x10 >>>>   worker_thread+0x196/0x11f0 >>>>   ? flush_rcu_work+0x50/0x50 >>>>   ? __switch_to_asm+0x34/0x70 >>>>   ? __switch_to_asm+0x34/0x70 >>>>   ? __switch_to_asm+0x40/0x70 >>>>   ? __switch_to_asm+0x34/0x70 >>>>   ? __switch_to_asm+0x40/0x70 >>>>   ? __switch_to_asm+0x34/0x70 >>>>   ? __switch_to_asm+0x40/0x70 >>>>   ? __schedule+0x7d6/0x1ea0 >>>>   ? migrate_swap_stop+0x850/0x880 >>>>   ? __sched_text_start+0x8/0x8 >>>>   ? save_stack+0x8c/0xb0 >>>>   ? kasan_kmalloc+0xbf/0xe0 >>>>   ? kmem_cache_alloc_trace+0xe4/0x190 >>>>   ? kthread+0x98/0x390 >>>>   ? ret_from_fork+0x35/0x40 >>>>   ? ret_from_fork+0x35/0x40 >>>>   ? deactivate_slab.isra.67+0x3c4/0x5c0 >>>>   ? kthread+0x98/0x390 >>>>   ? kthread+0x98/0x390 >>>>   ? set_track+0x76/0x120 >>>>   ? schedule+0xea/0x390 >>>>   ? __schedule+0x1ea0/0x1ea0 >>>>   ? wait_woken+0x1c0/0x1c0 >>>>   ? kasan_unpoison_shadow+0x30/0x40 >>>>   ? parse_args.cold.15+0x17a/0x17a >>>>   ? flush_rcu_work+0x50/0x50 >>>>   kthread+0x2d4/0x390 >>>>   ? kthread_create_worker_on_cpu+0xc0/0xc0 >>>>   ret_from_fork+0x35/0x40 >>>> >>>> Allocated by task 1124: >>>>   kasan_kmalloc+0xbf/0xe0 >>>>   kmem_cache_alloc_trace+0xe4/0x190 >>>>   dm_crtc_duplicate_state+0x78/0x130 [amdgpu] >>>>   drm_atomic_get_crtc_state+0x147/0x410 [drm] >>>>   page_flip_common+0x57/0x230 [drm_kms_helper] >>>>   drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper] >>>>   drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm] >>>>   drm_ioctl_kernel+0x1d4/0x260 [drm] >>>>   drm_ioctl+0x433/0x920 [drm] >>>>   amdgpu_drm_ioctl+0x11d/0x290 [amdgpu] >>>>   do_vfs_ioctl+0x1a1/0x13d0 >>>>   ksys_ioctl+0x60/0x90 >>>>   __x64_sys_ioctl+0x6f/0xb0 >>>>   do_syscall_64+0x147/0x440 >>>>   entry_SYSCALL_64_after_hwframe+0x44/0xa9 >>>> >>>> Freed by task 1124: >>>>   __kasan_slab_free+0x12e/0x180 >>>>   kfree+0x92/0x1a0 >>>>   drm_atomic_state_default_clear+0x315/0xc40 [drm] >>>>   __drm_atomic_state_free+0x35/0xd0 [drm] >>>>   drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper] >>>>   __setplane_internal+0x2d6/0x840 [drm] >>>>   drm_mode_cursor_universal+0x41e/0xbe0 [drm] >>>>   drm_mode_cursor_common+0x49f/0x880 [drm] >>>>   drm_mode_cursor_ioctl+0xd8/0x130 [drm] >>>>   drm_ioctl_kernel+0x1d4/0x260 [drm] >>>>   drm_ioctl+0x433/0x920 [drm] >>>>   amdgpu_drm_ioctl+0x11d/0x290 [amdgpu] >>>>   do_vfs_ioctl+0x1a1/0x13d0 >>>>   ksys_ioctl+0x60/0x90 >>>>   __x64_sys_ioctl+0x6f/0xb0 >>>>   do_syscall_64+0x147/0x440 >>>>   entry_SYSCALL_64_after_hwframe+0x44/0xa9 >>>> >>>> The buggy address belongs to the object at ffff8803a697b068 >>>>   which belongs to the cache kmalloc-1024 of size 1024 >>>> The buggy address is located 9 bytes inside of >>>>   1024-byte region [ffff8803a697b068, ffff8803a697b468) >>>> The buggy address belongs to the page: >>>> page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0 >>>> index:0x0 >>>> compound_mapcount: 0 >>>> flags: 0x8000000000008100(slab|head) >>>> raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70 >>>> ffff88041e00efc0 >>>> raw: 0000000000000000 0000000000170017 00000001ffffffff >>>> 0000000000000000 >>>> page dumped because: kasan: bad access detected >>>> >>>> Memory state around the buggy address: >>>>   ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >>>>   ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >>>>> ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb >>>>                                                               ^ >>>>   ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >>>>   ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >>>> ================================================================== >>>> >>>> So, we fix this by counting the number of CRTCs this atomic commit >>>> disabled >>>> early on in the function before their atomic states have been >>>> freed, then >>>> use >>>> that count later to do the appropriate number of RPM puts at the >>>> end of the >>>> function. >>> I am a bit not clear, are you saying that the problem was the 'in the >>> middle' commit (cursor ioctl) doing >>> >>> drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state) >>> >>> where the state is the one you access from from the non blocking >>> part of >>> page flip though old_crtc_state->active? >> The problem is that (see the comment in >> drivers/gpu/drm/drm_atomic_helper.c:2065 >> ) it's unsafe to touch any of the old_crtc_state structures after >> drm_atomic_helper_commit_hw_done() is called, as it's likely that >> they've been >> freed already. > > I am not sure about that, the comment in > drm_atomic_helper_commit_hw_done says that > "the driver is not allowed to read or change any permanent software > or hardware modeset state" I interpret it as not the old_crtc_state > but as the new_crtc_state or crtc->state after > drm_atomic_helper_swap_state completed. It means that if you touch > crtc->state after drm_atomic_helper_commit_hw_done > you actually could already be accessing a state which belong to the > next atomic commit after you. > It really looks like cursor's atomic commit sneaks in in a middle of > page flip between the page flip IOCTL > and it's commit_tail part and swaps away crct->state to his own new > state and release the 'old' state which is not really > old yet and needs to be used by the tail part of page flip. This makes > sense since do_aquire_global_lock we use in amdgpu_dm_atomic_check > to serialize against concurrent atomic_commits is not called for case > of cursor plane and so it may race against any commit_tail in flight... > Not sure why we haven't seen this problem before. > Obviously your fix makes the problem go away since you stopped > accessing the new_crtc_state and not the old_crtc_state but the root > problem > seems to me still there. > > Andrey I took another look and actually no problem with the CURSOR IOCTL as it will wait in drm_atomic_helper_swap_state for hw_done event, so I agree with the fix but just disagree with the explanation, it should be said that it's unsafe to touch the new_crtc_state (same as crtc->state) after call to drm_atomic_helper_commit_hw_done. So I would make the explanation a bit more detailed on this point. Anyway, the fix is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> Andrey > >>> Andrey >>>> Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in >>>> atomic_commit_tail()") >>>> Signed-off-by: Lyude Paul <lyude at redhat.com> >>>> Cc: Michel Dänzer <michel at daenzer.net> >>>> Reported-by: Michel Dänzer <michel at daenzer.net> >>>> --- >>>>   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++---- >>>>   1 file changed, 6 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c >>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c >>>> index f9add85157e7..689dbdf44bbf 100644 >>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c >>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c >>>> @@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct >>>> drm_atomic_state *state) >>>>       struct drm_connector *connector; >>>>       struct drm_connector_state *old_con_state, *new_con_state; >>>>       struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state; >>>> +   int crtc_disable_count = 0; >>>>         drm_atomic_helper_update_legacy_modeset_state(dev, state); >>>>   @@ -4410,6 +4411,9 @@ static void >>>> amdgpu_dm_atomic_commit_tail(struct >>>> drm_atomic_state *state) >>>>           struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc); >>>>           bool modeset_needed; >>>>   +       if (old_crtc_state->active && !new_crtc_state->active) >>>> +           crtc_disable_count++; >>>> + >>>>           dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); >>>>           dm_old_crtc_state = to_dm_crtc_state(old_crtc_state); >>>>           modeset_needed = modeset_required( >>>> @@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct >>>> drm_atomic_state *state) >>>>        * so we can put the GPU into runtime suspend if we're not >>>> driving >>>> any >>>>        * displays anymore >>>>        */ >>>> +   for (i = 0; i < crtc_disable_count; i++) >>>> +       pm_runtime_put_autosuspend(dev->dev); >>>>       pm_runtime_mark_last_busy(dev->dev); >>>> -   for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, >>>> new_crtc_state, i) { >>>> -       if (old_crtc_state->active && !new_crtc_state->active) >>>> -           pm_runtime_put_autosuspend(dev->dev); >>>> -   } >>>>   } >>> >