From: Xiaogang Chen <xiaogang.chen@xxxxxxx> The userptr buffer restore process has the following issues: 1: amdgpu_ttm_tt_get_user_pages can fail (-EFAULT). If it fails, we should not mark mem as valid (mem->invalid = 0), because in this case mem has no associated hmm range or user_pages. 2: An mmu notifier can run concurrently and update mem->range->notifier->invalidate_seq, but not mem->range->notifier_seq. That leaves mem->range->notifier_seq stale when mem is on process_info->userptr_inval_list and amdgpu_amdkfd_restore_userptr_worker gets interrupted. On the next rescheduled attempt we would then compare the stale mem->range->notifier_seq with mem->range->notifier->invalidate_seq. Signed-off-by: Xiaogang Chen <Xiaogang.Chen@xxxxxxx> --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7b1f5933ebaa..6881f1b0844c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2444,7 +2444,9 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, ret = -EAGAIN; goto unlock_out; } - mem->invalid = 0; + /* set mem valid if mem has hmm range associated */ + if (mem->range) + mem->invalid = 0; } unlock_out: @@ -2576,16 +2578,28 @@ static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_i list_for_each_entry_safe(mem, tmp_mem, &process_info->userptr_inval_list, validate_list.head) { - bool valid = amdgpu_ttm_tt_get_user_pages_done( - mem->bo->tbo.ttm, mem->range); + /* Only check mem with hmm range associated */ + bool valid; - mem->range = NULL; - if (!valid) { - WARN(!mem->invalid, "Invalid BO not marked invalid"); + if (mem->range) { + valid = amdgpu_ttm_tt_get_user_pages_done( + mem->bo->tbo.ttm, mem->range); + + mem->range = NULL; + if (!valid) { + WARN(!mem->invalid, "Invalid BO not
marked invalid"); + ret = -EAGAIN; + continue; + } + } else + /* keep mem without hmm range at userptr_inval_list */ + continue; + + if (mem->invalid) { + WARN(1, "Valid BO is marked invalid"); ret = -EAGAIN; continue; } - WARN(mem->invalid, "Valid BO is marked invalid"); list_move_tail(&mem->validate_list.head, &process_info->userptr_valid_list); @@ -2644,8 +2658,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) * reference counting inside KFD will handle this case. */ mutex_lock(&process_info->notifier_lock); - if (process_info->evicted_bos != evicted_bos) + if (process_info->evicted_bos != evicted_bos) { + /* mmu notifier interrupted amdgpu_amdkfd_restore_userptr_worker + * before reschedule next attempt update stale mem->range->notifier_seq + * inside userptr_inval_list + */ + struct kgd_mem *mem, *tmp_mem; + + list_for_each_entry_safe(mem, tmp_mem, + &process_info->userptr_inval_list, + validate_list.head) { + + if (mem->range) + mem->range->notifier_seq = mem->range->notifier->invalidate_seq; + } + goto unlock_notifier_out; + } if (confirm_valid_user_pages_locked(process_info)) { WARN(1, "User pages unexpectedly invalid"); -- 2.25.1