On 12.07.2017 at 05:56, Felix Kuehling wrote:
> When lots of virtual address spaces are used, there can be thousands
> of page table BOs. amdgpu_vm_update_level iterates over all of them
> recursively. In many cases only a few or none at all need to be
> updated. Minimize unnecessary code execution and memory usage in
> those cases.
>
> This speeds up memory mapping in a synthetic KFD memory mapping
> benchmark by roughly a factor of two.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>

That optimization is unnecessary. I have a patch in the pipeline that
makes the VM BO permanently CPU mapped and also fixes a couple of bugs
in those code paths.

Give me a moment to clean that up and send it out.

Regards,
Christian.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------
>  1 file changed, 55 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ff5de3a..23b899b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>  {
>  	struct amdgpu_bo *shadow;
>  	struct amdgpu_ring *ring = NULL;
> -	uint64_t pd_addr, shadow_addr = 0;
> +	uint64_t pd_addr = 0, shadow_addr = 0;
>  	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>  	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>  	unsigned count = 0, pt_idx, ndw = 0;
> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
>  	WARN_ON(vm->use_cpu_for_update && shadow);
>  	if (vm->use_cpu_for_update && !shadow) {
> -		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> -		if (r)
> -			return r;
> -		r = amdgpu_vm_bo_wait(adev, parent->bo);
> -		if (unlikely(r)) {
> -			amdgpu_bo_kunmap(parent->bo);
> -			return r;
> -		}
> +		/* Defer kmapping until it's actually needed. Some
> +		 * PDBs may need no update at all
> +		 */
>  		params.func = amdgpu_vm_cpu_set_ptes;
> +		params.ib = (void *)(long)-1;
>  	} else {
> -		if (shadow) {
> -			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -		ring = container_of(vm->entity.sched, struct amdgpu_ring,
> -				    sched);
> -
> -		/* padding, etc. */
> -		ndw = 64;
> -
> -		/* assume the worst case */
> -		ndw += parent->last_entry_used * 6;
> -
> -		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
> -
> -		if (shadow) {
> -			shadow_addr = amdgpu_bo_gpu_offset(shadow);
> -			ndw *= 2;
> -		} else {
> -			shadow_addr = 0;
> -		}
> -
> -		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> -		if (r)
> -			return r;
> -
> -		params.ib = &job->ibs[0];
> +		/* Defer IB allocation until it's actually
> +		 * needed. Some PDBs may need no update at all
> +		 */
> +		params.ib = NULL;
>  		params.func = amdgpu_vm_do_set_ptes;
>  	}
>
> -
>  	/* walk over the address space and update the directory */
>  	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>  		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>  		if (bo == NULL)
>  			continue;
>
> -		if (bo->shadow) {
> -			struct amdgpu_bo *pt_shadow = bo->shadow;
> -
> -			r = amdgpu_ttm_bind(&pt_shadow->tbo,
> -					    &pt_shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -
> -		pt = amdgpu_bo_gpu_offset(bo);
> -		pt = amdgpu_gart_get_vm_pde(adev, pt);
> +		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>  		if (parent->entries[pt_idx].addr == pt)
>  			continue;
>
>  		parent->entries[pt_idx].addr = pt;
>
> +		if (!params.ib) {
> +			if (shadow) {
> +				r = amdgpu_ttm_bind(&shadow->tbo,
> +						    &shadow->tbo.mem);
> +				if (r)
> +					return r;
> +			}
> +
> +			ring = container_of(vm->entity.sched,
> +					    struct amdgpu_ring, sched);
> +
> +			/* padding, etc. */
> +			ndw = 64;
> +
> +			/* assume the worst case */
> +			ndw += (parent->last_entry_used - pt_idx) * 6;
> +
> +			pd_addr = parent->bo->tbo.offset;
> +
> +			if (shadow) {
> +				shadow_addr = shadow->tbo.offset;
> +				ndw *= 2;
> +			} else {
> +				shadow_addr = 0;
> +			}
> +			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> +			if (r)
> +				return r;
> +
> +			params.ib = &job->ibs[0];
> +		} else if (!pd_addr) {
> +			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> +			if (r)
> +				return r;
> +			r = amdgpu_vm_bo_wait(adev, parent->bo);
> +			if (unlikely(r)) {
> +				amdgpu_bo_kunmap(parent->bo);
> +				return r;
> +			}
> +		}
> +
>  		pde = pd_addr + pt_idx * 8;
>  		if (((last_pde + 8 * count) != pde) ||
>  		    ((last_pt + incr * count) != pt) ||
> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
>  	if (params.func == amdgpu_vm_cpu_set_ptes)
>  		amdgpu_bo_kunmap(parent->bo);
> -	else if (params.ib->length_dw == 0) {
> +	else if (params.ib && params.ib->length_dw == 0) {
>  		amdgpu_job_free(job);
> -	} else {
> +	} else if (params.ib) {
>  		amdgpu_ring_pad_ib(ring, params.ib);
>  		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>  				 AMDGPU_FENCE_OWNER_VM);
> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
>  		amdgpu_bo_fence(parent->bo, fence, true);
>  		dma_fence_put(vm->last_dir_update);
> -		vm->last_dir_update = dma_fence_get(fence);
> -		dma_fence_put(fence);
> +		vm->last_dir_update = fence;
>  	}
>  	/*
>  	 * Recurse into the subdirectories. This recursion is harmless because
> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>  	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>  		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>
> -		if (!entry->bo)
> +		if (!entry->bo || !entry->entries)
>  			continue;
>
>  		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
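
For readers following the thread: the core of Felix's change is a lazy-setup pattern inside the directory walk. Instead of kmapping the page directory or allocating a job/IB up front, the expensive setup is deferred until the loop finds the first entry whose cached address actually differs, and the worst-case buffer size is taken from the entries that remain at that point. The standalone sketch below only illustrates that control flow under stated assumptions; dir_entry, update_ctx and acquire_update_buffer are hypothetical names, not the amdgpu API.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dir_entry {
	uint64_t cached_addr;	/* address currently written in the directory */
	uint64_t current_addr;	/* address the entry should point to */
};

struct update_ctx {
	uint64_t *buffer;	/* stands in for the kmapped PD or allocated IB */
	size_t    writes;
};

/* Pretend this is the expensive step (kmap / job+IB allocation). */
static int acquire_update_buffer(struct update_ctx *ctx, size_t remaining)
{
	ctx->buffer = calloc(remaining, sizeof(*ctx->buffer));
	if (!ctx->buffer)
		return -1;
	printf("acquired update buffer sized for %zu remaining entries\n",
	       remaining);
	return 0;
}

static int update_directory(struct dir_entry *entries, size_t count)
{
	struct update_ctx ctx = { .buffer = NULL, .writes = 0 };
	size_t i;

	for (i = 0; i < count; i++) {
		if (entries[i].cached_addr == entries[i].current_addr)
			continue;	/* nothing to do for this entry */

		/* First entry that really changed: do the setup lazily,
		 * sized for the worst case of all remaining entries.
		 */
		if (!ctx.buffer && acquire_update_buffer(&ctx, count - i))
			return -1;

		ctx.buffer[ctx.writes++] = entries[i].current_addr;
		entries[i].cached_addr = entries[i].current_addr;
	}

	if (!ctx.buffer) {
		printf("no entries changed, setup skipped entirely\n");
		return 0;
	}

	printf("submitted %zu updates\n", ctx.writes);
	free(ctx.buffer);
	return 0;
}

int main(void)
{
	struct dir_entry clean[4] = { { 1, 1 }, { 2, 2 }, { 3, 3 }, { 4, 4 } };
	struct dir_entry dirty[4] = { { 1, 1 }, { 2, 9 }, { 3, 3 }, { 4, 8 } };

	update_directory(clean, 4);	/* common case: nothing to update */
	update_directory(dirty, 4);	/* setup happens at the first change */
	return 0;
}

Sizing the buffer from the remaining entries mirrors the patch's "ndw += (parent->last_entry_used - pt_idx) * 6" worst-case estimate; the common case with no changed entries never pays for the map or the allocation at all, which is where the claimed speedup comes from.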