When lots of virtual address spaces are used, there can be thousands
of page table BOs. amdgpu_vm_update_level iterates over all of them
recursively. In many cases only a few of them, or none at all, need
to be updated. Minimize unnecessary code execution and memory usage
in those cases. This speeds up memory mapping in a synthetic KFD
memory mapping benchmark by roughly a factor of two.

Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ff5de3a..23b899b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 {
 	struct amdgpu_bo *shadow;
 	struct amdgpu_ring *ring = NULL;
-	uint64_t pd_addr, shadow_addr = 0;
+	uint64_t pd_addr = 0, shadow_addr = 0;
 	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
 	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
 	unsigned count = 0, pt_idx, ndw = 0;
@@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	WARN_ON(vm->use_cpu_for_update && shadow);
 
 	if (vm->use_cpu_for_update && !shadow) {
-		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
-		if (r)
-			return r;
-		r = amdgpu_vm_bo_wait(adev, parent->bo);
-		if (unlikely(r)) {
-			amdgpu_bo_kunmap(parent->bo);
-			return r;
-		}
+		/* Defer kmapping until it's actually needed. Some
+		 * PDBs may need no update at all
+		 */
 		params.func = amdgpu_vm_cpu_set_ptes;
+		params.ib = (void *)(long)-1;
 	} else {
-		if (shadow) {
-			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-		ring = container_of(vm->entity.sched, struct amdgpu_ring,
-				    sched);
-
-		/* padding, etc. */
-		ndw = 64;
-
-		/* assume the worst case */
-		ndw += parent->last_entry_used * 6;
-
-		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
-
-		if (shadow) {
-			shadow_addr = amdgpu_bo_gpu_offset(shadow);
-			ndw *= 2;
-		} else {
-			shadow_addr = 0;
-		}
-
-		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-		if (r)
-			return r;
-
-		params.ib = &job->ibs[0];
+		/* Defer IB allocation until it's actually
+		 * needed. Some PDBs may need no update at all
+		 */
+		params.ib = NULL;
 		params.func = amdgpu_vm_do_set_ptes;
 	}
-
 	/* walk over the address space and update the directory */
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
@@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		if (bo == NULL)
 			continue;
 
-		if (bo->shadow) {
-			struct amdgpu_bo *pt_shadow = bo->shadow;
-
-			r = amdgpu_ttm_bind(&pt_shadow->tbo,
-					    &pt_shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-
-		pt = amdgpu_bo_gpu_offset(bo);
-		pt = amdgpu_gart_get_vm_pde(adev, pt);
+		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
 		if (parent->entries[pt_idx].addr == pt)
 			continue;
 
 		parent->entries[pt_idx].addr = pt;
 
+		if (!params.ib) {
+			if (shadow) {
+				r = amdgpu_ttm_bind(&shadow->tbo,
+						    &shadow->tbo.mem);
+				if (r)
+					return r;
+			}
+
+			ring = container_of(vm->entity.sched,
+					    struct amdgpu_ring, sched);
+
+			/* padding, etc. */
+			ndw = 64;
+
+			/* assume the worst case */
+			ndw += (parent->last_entry_used - pt_idx) * 6;
+
+			pd_addr = parent->bo->tbo.offset;
+
+			if (shadow) {
+				shadow_addr = shadow->tbo.offset;
+				ndw *= 2;
+			} else {
+				shadow_addr = 0;
+			}
+			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+			if (r)
+				return r;
+
+			params.ib = &job->ibs[0];
+		} else if (!pd_addr) {
+			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
+			if (r)
+				return r;
+			r = amdgpu_vm_bo_wait(adev, parent->bo);
+			if (unlikely(r)) {
+				amdgpu_bo_kunmap(parent->bo);
+				return r;
+			}
+		}
+
 		pde = pd_addr + pt_idx * 8;
 		if (((last_pde + 8 * count) != pde) ||
 		    ((last_pt + incr * count) != pt) ||
@@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	if (params.func == amdgpu_vm_cpu_set_ptes)
 		amdgpu_bo_kunmap(parent->bo);
-	else if (params.ib->length_dw == 0) {
+	else if (params.ib && params.ib->length_dw == 0) {
 		amdgpu_job_free(job);
-	} else {
+	} else if (params.ib) {
 		amdgpu_ring_pad_ib(ring, params.ib);
 		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
 				 AMDGPU_FENCE_OWNER_VM);
@@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		amdgpu_bo_fence(parent->bo, fence, true);
 		dma_fence_put(vm->last_dir_update);
-		vm->last_dir_update = dma_fence_get(fence);
-		dma_fence_put(fence);
+		vm->last_dir_update = fence;
 	}
 
 	/*
 	 * Recurse into the subdirectories. This recursion is harmless because
@@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
 
-		if (!entry->bo)
+		if (!entry->bo || !entry->entries)
 			continue;
 
 		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
-- 
1.9.1
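
Not part of the patch itself: for readers unfamiliar with the pattern, here
is a minimal, self-contained C sketch of the lazy-initialization idea the
patch applies, i.e. skip entries whose stored value is already correct and
defer the expensive setup (IB allocation or kmap in amdgpu) until the first
entry that actually needs writing. All names below (struct pde_entry,
prepare_update, etc.) are hypothetical stand-ins, not amdgpu code.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a page directory entry. */
struct pde_entry {
	unsigned long addr;	/* value currently stored */
	unsigned long new_addr;	/* value it should hold */
};

/* Hypothetical stand-in for the update context (job/IB or kmapping). */
struct update_ctx {
	bool prepared;
	int writes;
};

/* Expensive setup, done at most once and only if something changed.
 * Sized for the worst case of the remaining entries, like ndw above.
 */
static void prepare_update(struct update_ctx *ctx, int remaining)
{
	printf("allocating command buffer for at most %d entries\n",
	       remaining);
	ctx->prepared = true;
}

static void update_entries(struct pde_entry *entries, int count)
{
	struct update_ctx ctx = { .prepared = false, .writes = 0 };
	int i;

	for (i = 0; i < count; i++) {
		/* Skip entries that are already up to date; this is the
		 * common case the patch optimizes for.
		 */
		if (entries[i].addr == entries[i].new_addr)
			continue;

		/* First entry that needs updating: pay the deferred
		 * setup cost now.
		 */
		if (!ctx.prepared)
			prepare_update(&ctx, count - i);

		entries[i].addr = entries[i].new_addr;
		ctx.writes++;
	}

	if (!ctx.prepared)
		printf("nothing to update, no setup cost paid\n");
	else
		printf("updated %d entries\n", ctx.writes);
}

int main(void)
{
	struct pde_entry entries[4] = {
		{ .addr = 1, .new_addr = 1 },
		{ .addr = 2, .new_addr = 2 },
		{ .addr = 3, .new_addr = 7 },	/* only this one changed */
		{ .addr = 4, .new_addr = 4 },
	};

	update_entries(entries, 4);
	return 0;
}

When no entry changed, update_entries never calls prepare_update, which is
the case the commit message describes: many page directories need no update
at all, so the IB allocation or kmap is skipped entirely.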