[AMD Official Use Only] To observe the issue. I made one kfdtest case for debug. It just alloc a userptr memory and detect if memory is corrupted. I can hit this failure in 2 minutes. :( diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index 70c8033..a72f53f 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -584,6 +584,32 @@ TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) { TEST_END } +TEST_F(KFDMemoryTest, swap) { + TEST_START(TESTPROFILE_RUNALL) + + unsigned int size = 128<<20; + unsigned int*tmp = (unsigned int *)mmap(0, + size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, + 0); + EXPECT_NE(tmp, MAP_FAILED); + + LOG() << "pls run this with KFDMemoryTest.LargestSysBufferTest" << std::endl; + do { + memset(tmp, 0xcc, size); + + HsaMemoryBuffer buf(tmp, size); + sleep(1); + EXPECT_EQ(tmp[0], 0xcccccccc); + } while (true); + + munmap(tmp, size); + + TEST_END +} + // Basic test for hsaKmtAllocMemory TEST_F(KFDMemoryTest, MemoryAlloc) { TEST_START(TESTPROFILE_RUNALL) -- 2.25.1 ________________________________________ 发件人: Pan, Xinhui <Xinhui.Pan@xxxxxxx> 发送时间: 2021年5月19日 10:28 收件人: amd-gfx@xxxxxxxxxxxxxxxxxxxxx 抄送: Kuehling, Felix; Deucher, Alexander; Koenig, Christian; dri-devel@xxxxxxxxxxxxxxxxxxxxx; daniel@xxxxxxxx; Pan, Xinhui 主题: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin cpu 1 cpu 2 kfd alloc BO A(userptr) alloc BO B(GTT) ->init -> validate -> init -> validate -> populate init_user_pages -> swapout BO A //hit ttm pages limit -> get_user_pages (fill up ttm->pages) -> validate -> populate -> swapin BO A // Now hit the BUG We know that get_user_pages may race with swapout on same BO. Threre are some issues I have met. 1) memory corruption. This is because we do a swap before memory is setup. ttm_tt_swapout() just create a swap_storage with its content being 0x0. So when we setup memory after the swapout. The following swapin makes the memory corrupted. 2) panic When swapout happes with get_user_pages, they touch ttm->pages without anylock. It causes memory corruption too. But I hit page fault mostly. Signed-off-by: xinhui pan <xinhui.pan@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 928e8d57cd08..42460e4480f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr) struct amdkfd_process_info *process_info = mem->process_info; struct amdgpu_bo *bo = mem->bo; struct ttm_operation_ctx ctx = { true, false }; + struct page **pages; int ret = 0; mutex_lock(&process_info->lock); @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr) goto out; } - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages); + pages = kvmalloc_array(bo->tbo.ttm->num_pages, + sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); + if (!pages) + goto unregister_out; + + ret = amdgpu_ttm_tt_get_user_pages(bo, pages); if (ret) { pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr) pr_err("%s: Failed to reserve BO\n", __func__); goto release_out; } + + WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED); + + memcpy(bo->tbo.ttm->pages, + pages, + sizeof(struct page*) * bo->tbo.ttm->num_pages); amdgpu_bo_placement_from_domain(bo, mem->domain); ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (ret) @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr) release_out: amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); unregister_out: + kvfree(pages); if (ret) amdgpu_mn_unregister(bo); out: -- 2.25.1