This reverts commit c45c3bc930bf60e7658f87c519a40f77513b96aa. The reverted commit introduced a KFDSVMEvict test regression on vega10, with the following kernel BUG backtrace: [ 135.365083] amdgpu: Migration failed during eviction [ 135.365090] ------------[ cut here ]------------ [ 135.365097] This was not the last reference [ 135.365122] WARNING: CPU: 5 PID: 1998 at drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:3515 svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu] [ 135.365836] svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu] [ 135.366249] process_one_work+0x298/0x590 [ 135.366256] worker_thread+0x3d/0x3d0 ...... [ 135.721257] kernel BUG at include/linux/swapops.h:472! [ 135.721537] Call Trace: [ 135.721540] <TASK> [ 135.721592] hmm_vma_walk_pmd+0x5c8/0x780 [ 135.721598] walk_pgd_range+0x3bc/0x7c0 [ 135.721604] __walk_page_range+0x1ec/0x200 [ 135.721609] walk_page_range+0x119/0x1a0 [ 135.721613] hmm_range_fault+0x5d/0xb0 [ 135.721617] amdgpu_hmm_range_get_pages+0x159/0x240 [amdgpu] [ 135.721820] svm_range_validate_and_map+0x57f/0x16c0 [amdgpu] [ 135.722411] svm_range_restore_pages+0xcd8/0x1150 [amdgpu] [ 135.722613] amdgpu_vm_handle_fault+0xc2/0x360 [amdgpu] [ 135.722777] gmc_v9_0_process_interrupt+0x255/0x670 [amdgpu] Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++++++++------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f2b33fb2afcf..4d000c63cde8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1565,7 +1565,6 @@ static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) * 5. 
Release page table (and SVM BO) reservation */ static int svm_range_validate_and_map(struct mm_struct *mm, - unsigned long map_start, unsigned long map_last, struct svm_range *prange, int32_t gpuidx, bool intr, bool wait, bool flush_tlb) { @@ -1646,8 +1645,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm, end = (prange->last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { struct hmm_range *hmm_range; - unsigned long map_start_vma; - unsigned long map_last_vma; struct vm_area_struct *vma; uint64_t vram_pages_vma; unsigned long next = 0; @@ -1696,16 +1693,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, r = -EAGAIN; } - if (!r) { - map_start_vma = max(map_start, prange->start + offset); - map_last_vma = min(map_last, prange->start + offset + npages - 1); - if (map_start_vma <= map_last_vma) { - offset = map_start_vma - prange->start; - npages = map_last_vma - map_start_vma + 1; - r = svm_range_map_to_gpus(prange, offset, npages, readonly, - ctx->bitmap, wait, flush_tlb); - } - } + if (!r) + r = svm_range_map_to_gpus(prange, offset, npages, readonly, + ctx->bitmap, wait, flush_tlb); if (!r && next == end) prange->mapped_to_gpu = true; @@ -1811,8 +1801,8 @@ static void svm_range_restore_work(struct work_struct *work) */ mutex_lock(&prange->migrate_mutex); - r = svm_range_validate_and_map(mm, prange->start, prange->last, prange, - MAX_GPU_INSTANCE, false, true, false); + r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, + false, true, false); if (r) pr_debug("failed %d to map 0x%lx to gpus\n", r, prange->start); @@ -3026,8 +3016,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr, write_fault, timestamp); - start = prange->start; - last = prange->last; if (prange->actual_loc != 0 || best_loc != 0) { migration = true; /* Align migration range start and size to granularity size */ @@ -3061,11 +3049,10 @@ 
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, } } - r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false, - false, false); + r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false); if (r) pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", - r, svms, start, last); + r, svms, prange->start, prange->last); kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, migration); @@ -3611,8 +3598,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; - r = svm_range_validate_and_map(mm, prange->start, prange->last, prange, - MAX_GPU_INSTANCE, true, true, flush_tlb); + r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, + true, true, flush_tlb); if (r) pr_debug("failed %d to map svm range\n", r); @@ -3626,8 +3613,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n", prange, prange->start, prange->last); mutex_lock(&prange->migrate_mutex); - r = svm_range_validate_and_map(mm, prange->start, prange->last, prange, - MAX_GPU_INSTANCE, true, true, prange->mapped_to_gpu); + r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, + true, true, prange->mapped_to_gpu); if (r) pr_debug("failed %d on remap svm range\n", r); mutex_unlock(&prange->migrate_mutex); -- 2.35.1