Hi David,

Responses inline ...

On 17-07-12 12:30 AM, zhoucm1 wrote:
> Nice improvement, just some nitpicks inline; otherwise the patch is
> Reviewed-by: Chunming Zhou <david1.zhou at amd.com>.
>
> On 2017-07-12 11:56, Felix Kuehling wrote:
>> When lots of virtual address spaces are used, there can be thousands
>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>> recursively. In many cases only a few or none at all need to be
>> updated. Minimize unnecessary code execution and memory usage in
>> those cases.
>>
>> This speeds up memory mapping in a synthetic KFD memory mapping
>> benchmark by roughly a factor of two.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>>  1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>  {
>>      struct amdgpu_bo *shadow;
>>      struct amdgpu_ring *ring = NULL;
>> -    uint64_t pd_addr, shadow_addr = 0;
>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>      uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>      uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>      unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>      WARN_ON(vm->use_cpu_for_update && shadow);
>>      if (vm->use_cpu_for_update && !shadow) {
>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> -        if (r)
>> -            return r;
>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>> -        if (unlikely(r)) {
>> -            amdgpu_bo_kunmap(parent->bo);
>> -            return r;
>> -        }
>> +        /* Defer kmapping until it's actually needed. Some
>> +         * PDBs may need no update at all
>> +         */
>>          params.func = amdgpu_vm_cpu_set_ptes;
>> +        params.ib = (void *)(long)-1;
>>      } else {
>> -        if (shadow) {
>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> -                            sched);
>> -
>> -        /* padding, etc. */
>> -        ndw = 64;
>> -
>> -        /* assume the worst case */
>> -        ndw += parent->last_entry_used * 6;
>> -
>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> -        if (shadow) {
>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> -            ndw *= 2;
>> -        } else {
>> -            shadow_addr = 0;
>> -        }
>> -
>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> -        if (r)
>> -            return r;
>> -
>> -        params.ib = &job->ibs[0];
>> +        /* Defer IB allocation until it's actually
>> +         * needed. Some PDBs may need no update at all
>> +         */
>> +        params.ib = NULL;
>>          params.func = amdgpu_vm_do_set_ptes;
>>      }
>> -
>>      /* walk over the address space and update the directory */
>>      for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>          struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>          if (bo == NULL)
>>              continue;
>> -        if (bo->shadow) {
>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> -                                &pt_shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -
>> -        pt = amdgpu_bo_gpu_offset(bo);
>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
> Getting the offset from amdgpu_bo_gpu_offset(bo) is better I think,
> although we can assume the pt bo is in a known domain.

amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
are redundant here, because we can assume that the page table BOs have
been properly created and validated by kernel code we can trust more
than user mode.

I saw amdgpu_bo_gpu_offset near the top with "perf top" when running my
memory mapping benchmark, so I think it makes sense to eliminate the
function call and the redundant sanity checks in this code path.

>
>>          if (parent->entries[pt_idx].addr == pt)
>>              continue;
>>          parent->entries[pt_idx].addr = pt;
>> +        if (!params.ib) {
>> +            if (shadow) {
>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>> +                                    &shadow->tbo.mem);
>> +                if (r)
>> +                    return r;
>> +            }
>> +
>> +            ring = container_of(vm->entity.sched,
>> +                                struct amdgpu_ring, sched);
>> +
>> +            /* padding, etc. */
>> +            ndw = 64;
>> +
>> +            /* assume the worst case */
>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> +            pd_addr = parent->bo->tbo.offset;
>> +
>> +            if (shadow) {
>> +                shadow_addr = shadow->tbo.offset;
>> +                ndw *= 2;
>> +            } else {
>> +                shadow_addr = 0;
>> +            }
>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> +            if (r)
>> +                return r;
>> +
>> +            params.ib = &job->ibs[0];
>> +        } else if (!pd_addr) {
>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> +            if (r)
>> +                return r;
>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>> +            if (unlikely(r)) {
>> +                amdgpu_bo_kunmap(parent->bo);
>> +                return r;
>> +            }
>> +        }
>> +
>>          pde = pd_addr + pt_idx * 8;
>>          if (((last_pde + 8 * count) != pde) ||
>>              ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>      if (params.func == amdgpu_vm_cpu_set_ptes)
>>          amdgpu_bo_kunmap(parent->bo);
>> -    else if (params.ib->length_dw == 0) {
>> +    else if (params.ib && params.ib->length_dw == 0) {
>>          amdgpu_job_free(job);
>> -    } else {
>> +    } else if (params.ib) {
>>          amdgpu_ring_pad_ib(ring, params.ib);
>>          amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>                           AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>          amdgpu_bo_fence(parent->bo, fence, true);
>>          dma_fence_put(vm->last_dir_update);
>> -        vm->last_dir_update = dma_fence_get(fence);
>> -        dma_fence_put(fence);
>> +        vm->last_dir_update = fence;
> Here I prefer to keep the previous code, the logic is clearer. :)

Again, this is a code path that can be called very frequently. So I'd
like to eliminate the redundant atomic operations here.

>>      }
>>      /*
>>       * Recurse into the subdirectories. This recursion is harmless because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>      for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>          struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>> -        if (!entry->bo)
>> +        if (!entry->bo || !entry->entries)
> In fact, !entry->entries would be checked at the beginning of this
> function in the next loop, but either way is OK to me here.

Doing the check here eliminates the lowest level of recursive function
calls. That's where it counts the most. Each successive level of the
recursion runs 512 times as often as the level above it. So completely
eliminating the function call overhead of the lowest level is a good
thing.

Regards,
  Felix

>
> Regards,
> David Zhou
>>              continue;
>>          r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
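
To put a number on the fan-out argument above, here is a minimal
stand-alone sketch. It assumes up to 512 entries per page-directory
level, the figure quoted in the reply; the real entry count depends on
the VM configuration, so treat the output as illustrative only.

    #include <stdio.h>

    /* With up to 512 entries per page-directory level, each level of the
     * recursion can be entered up to 512 times as often as the level
     * above it, so skipping the recursive call for leaf entries
     * (!entry->entries) removes by far the largest number of calls.
     */
    int main(void)
    {
        unsigned long long calls = 1;
        unsigned int level;

        for (level = 0; level <= 3; ++level) {
            printf("level %u: up to %llu recursive calls\n", level, calls);
            calls *= 512;   /* assumed entries per directory level */
        }
        return 0;
    }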