On 2017/07/12 12:56, Felix Kuehling wrote:
> Hi David,
>
> Responses inline ...

Yeah, I see what you mean, you mainly want to save some CPU overhead. If
others have no objection, feel free to add my RB then.

Regards,
David Zhou

> On 17-07-12 12:30 AM, zhoucm1 wrote:
>> nice improvement, just some nitpicks inline, otherwise the patch is
>> Reviewed-by: Chunming Zhou <david1.zhou at amd.com>.
>>
>> On 2017/07/12 11:56, Felix Kuehling wrote:
>>> When lots of virtual address spaces are used, there can be thousands
>>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>>> recursively. In many cases only a few or none at all need to be
>>> updated. Minimize unnecessary code execution and memory usage in
>>> those cases.
>>>
>>> This speeds up memory mapping in a synthetic KFD memory mapping
>>> benchmark by roughly a factor of two.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>>>   1 file changed, 55 insertions(+), 54 deletions(-)
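The strategy is easiest to see in isolation. Below is a minimal userspace
sketch of the deferral pattern the commit message describes; all names are
hypothetical, and expensive_setup() merely stands in for the
amdgpu_bo_kmap()/amdgpu_vm_bo_wait() or amdgpu_job_alloc_with_ib() work the
patch moves into the loop:

    /* Sketch only (hypothetical names, not the kernel code): setup runs
     * on the first stale entry, never if everything is up to date.
     */
    #include <stdbool.h>
    #include <stddef.h>

    struct update_ctx {
            bool ready;     /* becomes true once setup has run */
    };

    static int expensive_setup(struct update_ctx *ctx)
    {
            /* stands in for kmapping the BO or allocating a job + IB */
            ctx->ready = true;
            return 0;
    }

    static int update_entries(unsigned long *cached,
                              const unsigned long *want, size_t n)
    {
            struct update_ctx ctx = { .ready = false };
            size_t i;
            int r;

            for (i = 0; i < n; ++i) {
                    if (cached[i] == want[i])
                            continue;       /* common case: nothing to do */

                    if (!ctx.ready) {
                            r = expensive_setup(&ctx); /* first change only */
                            if (r)
                                    return r;
                    }
                    cached[i] = want[i];    /* stands in for the PDE write */
            }
            /* if ctx.ready is still false, neither setup nor teardown
             * ever happened */
            return 0;
    }

If every entry is already current, the loop completes without paying for any
setup, which is exactly the case the commit message is optimizing.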
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>   {
>>>       struct amdgpu_bo *shadow;
>>>       struct amdgpu_ring *ring = NULL;
>>> -     uint64_t pd_addr, shadow_addr = 0;
>>> +     uint64_t pd_addr = 0, shadow_addr = 0;
>>>       uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>>       uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>>       unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>       WARN_ON(vm->use_cpu_for_update && shadow);
>>>       if (vm->use_cpu_for_update && !shadow) {
>>> -         r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> -         if (r)
>>> -             return r;
>>> -         r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> -         if (unlikely(r)) {
>>> -             amdgpu_bo_kunmap(parent->bo);
>>> -             return r;
>>> -         }
>>> +         /* Defer kmapping until it's actually needed. Some
>>> +          * PDBs may need no update at all
>>> +          */
>>>           params.func = amdgpu_vm_cpu_set_ptes;
>>> +         params.ib = (void *)(long)-1;
>>>       } else {
>>> -         if (shadow) {
>>> -             r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> -             if (r)
>>> -                 return r;
>>> -         }
>>> -         ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> -                             sched);
>>> -
>>> -         /* padding, etc. */
>>> -         ndw = 64;
>>> -
>>> -         /* assume the worst case */
>>> -         ndw += parent->last_entry_used * 6;
>>> -
>>> -         pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> -         if (shadow) {
>>> -             shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> -             ndw *= 2;
>>> -         } else {
>>> -             shadow_addr = 0;
>>> -         }
>>> -
>>> -         r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> -         if (r)
>>> -             return r;
>>> -
>>> -         params.ib = &job->ibs[0];
>>> +         /* Defer IB allocation until it's actually
>>> +          * needed. Some PDBs may need no update at all
>>> +          */
>>> +         params.ib = NULL;
>>>           params.func = amdgpu_vm_do_set_ptes;
>>>       }
>>> -
>>>       /* walk over the address space and update the directory */
>>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>           struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>           if (bo == NULL)
>>>               continue;
>>> -         if (bo->shadow) {
>>> -             struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> -             r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> -                                 &pt_shadow->tbo.mem);
>>> -             if (r)
>>> -                 return r;
>>> -         }
>>> -
>>> -         pt = amdgpu_bo_gpu_offset(bo);
>>> -         pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> +         pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> Getting the offset from amdgpu_bo_gpu_offset(bo) is better I think,
>> although we can assume the pt bo is in a known domain.
> amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
> are redundant here, because we can assume that the page table BOs have
> been properly created and validated by kernel code we can trust more
> than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
> when running my memory mapping benchmark, so I think it makes sense to
> eliminate the function call and the redundant sanity checks in this
> code path.
>
>>>           if (parent->entries[pt_idx].addr == pt)
>>>               continue;
>>>           parent->entries[pt_idx].addr = pt;
>>> +         if (!params.ib) {
>>> +             if (shadow) {
>>> +                 r = amdgpu_ttm_bind(&shadow->tbo,
>>> +                                     &shadow->tbo.mem);
>>> +                 if (r)
>>> +                     return r;
>>> +             }
>>> +
>>> +             ring = container_of(vm->entity.sched,
>>> +                                 struct amdgpu_ring, sched);
>>> +
>>> +             /* padding, etc. */
>>> +             ndw = 64;
>>> +
>>> +             /* assume the worst case */
>>> +             ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> +             pd_addr = parent->bo->tbo.offset;
>>> +
>>> +             if (shadow) {
>>> +                 shadow_addr = shadow->tbo.offset;
>>> +                 ndw *= 2;
>>> +             } else {
>>> +                 shadow_addr = 0;
>>> +             }
>>> +             r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> +             if (r)
>>> +                 return r;
>>> +
>>> +             params.ib = &job->ibs[0];
>>> +         } else if (!pd_addr) {
>>> +             r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> +             if (r)
>>> +                 return r;
>>> +             r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> +             if (unlikely(r)) {
>>> +                 amdgpu_bo_kunmap(parent->bo);
>>> +                 return r;
>>> +             }
>>> +         }
>>> +
>>>           pde = pd_addr + pt_idx * 8;
>>>           if (((last_pde + 8 * count) != pde) ||
>>>               ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>       if (params.func == amdgpu_vm_cpu_set_ptes)
>>>           amdgpu_bo_kunmap(parent->bo);
>>> -     else if (params.ib->length_dw == 0) {
>>> +     else if (params.ib && params.ib->length_dw == 0) {
>>>           amdgpu_job_free(job);
>>> -     } else {
>>> +     } else if (params.ib) {
>>>           amdgpu_ring_pad_ib(ring, params.ib);
>>>           amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>>                            AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>           amdgpu_bo_fence(parent->bo, fence, true);
>>>           dma_fence_put(vm->last_dir_update);
>>> -         vm->last_dir_update = dma_fence_get(fence);
>>> -         dma_fence_put(fence);
>>> +         vm->last_dir_update = fence;
>> Here I prefer to keep the previous code, the logic is clearer. :)
> Again, this is a code path that can be called very frequently, so I'd
> like to eliminate the redundant atomic operations here.
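To illustrate the trade-off: each dma_fence_get()/dma_fence_put() pair is a
pair of atomic reference-count operations, and handing the caller's reference
straight to the cache avoids both. A simplified userspace sketch (illustrative
types only, not the kernel's kref-based struct dma_fence):

    #include <stdatomic.h>

    struct fence {
            atomic_int refcount;    /* freeing at zero omitted for brevity */
    };

    static struct fence *fence_get(struct fence *f)
    {
            atomic_fetch_add(&f->refcount, 1);      /* atomic RMW */
            return f;
    }

    static void fence_put(struct fence *f)
    {
            atomic_fetch_sub(&f->refcount, 1);      /* atomic RMW */
    }

    /* Before: the cache takes its own reference, then the caller drops
     * its one -- two extra atomic operations on a hot path. */
    static void cache_fence_old(struct fence **last, struct fence *fence)
    {
            fence_put(*last);
            *last = fence_get(fence);
            fence_put(fence);       /* drop the caller's reference */
    }

    /* After: ownership of the caller's reference simply moves into the
     * cache -- no get/put pair at all. */
    static void cache_fence_new(struct fence **last, struct fence *fence)
    {
            fence_put(*last);
            *last = fence;
    }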
>>>       }
>>>       /*
>>>        * Recurse into the subdirectories. This recursion is harmless because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>           struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>> -         if (!entry->bo)
>>> +         if (!entry->bo || !entry->entries)
>> In fact, !entry->entries would be checked at the beginning of this
>> function in the next loop, but either way is OK with me here.
> Doing the check here eliminates the lowest level of recursive function
> calls. That's where it counts the most: each successive level of the
> recursion runs 512 times as often as the level above it, so completely
> eliminating the function call overhead of the lowest level is a good
> thing (see the sketch at the end of this message).
>
> Regards,
>   Felix
>
>> Regards,
>> David Zhou
>>>               continue;
>>>           r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
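To put rough numbers on Felix's fan-out argument, here is a small sketch
assuming a fully populated hierarchy with 512 entries per directory level and
4 levels (real address spaces are normally much sparser):

    #include <stdio.h>

    int main(void)
    {
            const unsigned long fanout = 512;
            const int levels = 4;
            unsigned long calls_at_level = 1;
            unsigned long total = 0, pruned = 0;
            int level;

            for (level = 0; level < levels; ++level) {
                    total += calls_at_level;
                    if (level < levels - 1) /* leaf calls pruned in parent */
                            pruned += calls_at_level;
                    calls_at_level *= fanout;
            }
            printf("recursive calls, leaf included: %lu\n", total);
            printf("recursive calls, leaf pruned:   %lu\n", pruned);
            return 0;
    }

Under these assumptions the leaf level accounts for 134217728 of 134480385
total calls, so hoisting the !entry->entries check into the parent removes
well over 99% of the recursive invocations.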