On 17-07-12 03:50 AM, Christian König wrote:
>
> That optimization is unnecessary.
>
> I have a patch in the pipeline that makes the VM BO permanently CPU
> mapped and also fixes a couple of bugs in those code paths.

You mean just for the CPU-update path? I think deferring the SDMA IB
allocation still makes sense. In fact, that's the code path I tested.

>
> Give me a moment to clean that up and send it out.

Sure.

BTW, I found that the most expensive operation turned out to be
amdgpu_vm_move_pt_bos_in_lru. It has to recurse through all PT BOs and
does lots of redundant list-pointer manipulation on two lists (LRU and
swap), plus kref get/put. For KFD VMs I'm removing the call to it; I
think it's useless for us.

amdgpu_vm_validate_pt_bos is also expensive, but not as bad, because
it's only done if there was an eviction.

I was thinking about ways to optimize this. It should be possible to
allocate multiple PTBs and PDBs in a single BO in amdgpu_vm_alloc_pts.
That would potentially save lots of time in vm_move_pt_bos and
vm_validate. What do you think? (A rough sketch of the idea is appended
after the quoted patch below.)

Regards,
  Felix

>
> Regards,
> Christian.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>>   1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo *shadow;
>>       struct amdgpu_ring *ring = NULL;
>> -     uint64_t pd_addr, shadow_addr = 0;
>> +     uint64_t pd_addr = 0, shadow_addr = 0;
>>       uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>       uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>       unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>       WARN_ON(vm->use_cpu_for_update && shadow);
>>       if (vm->use_cpu_for_update && !shadow) {
>> -         r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> -         if (r)
>> -             return r;
>> -         r = amdgpu_vm_bo_wait(adev, parent->bo);
>> -         if (unlikely(r)) {
>> -             amdgpu_bo_kunmap(parent->bo);
>> -             return r;
>> -         }
>> +         /* Defer kmapping until it's actually needed. Some
>> +          * PDBs may need no update at all
>> +          */
>>           params.func = amdgpu_vm_cpu_set_ptes;
>> +         params.ib = (void *)(long)-1;
>>       } else {
>> -         if (shadow) {
>> -             r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> -             if (r)
>> -                 return r;
>> -         }
>> -         ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> -                             sched);
>> -
>> -         /* padding, etc. */
>> -         ndw = 64;
>> -
>> -         /* assume the worst case */
>> -         ndw += parent->last_entry_used * 6;
>> -
>> -         pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> -         if (shadow) {
>> -             shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> -             ndw *= 2;
>> -         } else {
>> -             shadow_addr = 0;
>> -         }
>> -
>> -         r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> -         if (r)
>> -             return r;
>> -
>> -         params.ib = &job->ibs[0];
>> +         /* Defer IB allocation until it's actually
>> +          * needed. Some PDBs may need no update at all
>> +          */
>> +         params.ib = NULL;
>>           params.func = amdgpu_vm_do_set_ptes;
>>       }
>> -
>>       /* walk over the address space and update the directory */
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>           if (bo == NULL)
>>               continue;
>> -         if (bo->shadow) {
>> -             struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> -             r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> -                                 &pt_shadow->tbo.mem);
>> -             if (r)
>> -                 return r;
>> -         }
>> -
>> -         pt = amdgpu_bo_gpu_offset(bo);
>> -         pt = amdgpu_gart_get_vm_pde(adev, pt);
>> +         pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>>           if (parent->entries[pt_idx].addr == pt)
>>               continue;
>>           parent->entries[pt_idx].addr = pt;
>> +         if (!params.ib) {
>> +             if (shadow) {
>> +                 r = amdgpu_ttm_bind(&shadow->tbo,
>> +                                     &shadow->tbo.mem);
>> +                 if (r)
>> +                     return r;
>> +             }
>> +
>> +             ring = container_of(vm->entity.sched,
>> +                                 struct amdgpu_ring, sched);
>> +
>> +             /* padding, etc. */
>> +             ndw = 64;
>> +
>> +             /* assume the worst case */
>> +             ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> +             pd_addr = parent->bo->tbo.offset;
>> +
>> +             if (shadow) {
>> +                 shadow_addr = shadow->tbo.offset;
>> +                 ndw *= 2;
>> +             } else {
>> +                 shadow_addr = 0;
>> +             }
>> +             r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> +             if (r)
>> +                 return r;
>> +
>> +             params.ib = &job->ibs[0];
>> +         } else if (!pd_addr) {
>> +             r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> +             if (r)
>> +                 return r;
>> +             r = amdgpu_vm_bo_wait(adev, parent->bo);
>> +             if (unlikely(r)) {
>> +                 amdgpu_bo_kunmap(parent->bo);
>> +                 return r;
>> +             }
>> +         }
>> +
>>           pde = pd_addr + pt_idx * 8;
>>           if (((last_pde + 8 * count) != pde) ||
>>               ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>       if (params.func == amdgpu_vm_cpu_set_ptes)
>>           amdgpu_bo_kunmap(parent->bo);
>> -     else if (params.ib->length_dw == 0) {
>> +     else if (params.ib && params.ib->length_dw == 0) {
>>           amdgpu_job_free(job);
>> -     } else {
>> +     } else if (params.ib) {
>>           amdgpu_ring_pad_ib(ring, params.ib);
>>           amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>                            AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>           amdgpu_bo_fence(parent->bo, fence, true);
>>           dma_fence_put(vm->last_dir_update);
>> -         vm->last_dir_update = dma_fence_get(fence);
>> -         dma_fence_put(fence);
>> +         vm->last_dir_update = fence;
>>       }
>>       /*
>>        * Recurse into the subdirectories. This recursion is harmless because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>> -         if (!entry->bo)
>> +         if (!entry->bo || !entry->entries)
>>               continue;
>>           r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
>
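
To make the sub-allocation idea a bit more concrete, here is a rough,
purely illustrative user-space sketch. None of the names below
(pt_chunk, pt_ref, pt_alloc, PT_SIZE, PTS_PER_CHUNK) are real amdgpu
structures or functions; the point is only that once several page
tables share one backing allocation, LRU moves, validation and kref
traffic happen per chunk rather than per table:

    /* Illustrative only: carve several page tables out of one backing
     * allocation so per-object bookkeeping is done once per chunk.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PT_SIZE       4096   /* size of one page table */
    #define PTS_PER_CHUNK 64     /* tables sub-allocated per chunk */

    struct pt_chunk {            /* stands in for one big BO */
        void *cpu_addr;          /* base CPU mapping of the chunk */
        uint64_t gpu_addr;       /* base GPU address of the chunk */
        unsigned used;           /* tables handed out so far */
    };

    struct pt_ref {              /* what each page-table entry keeps */
        struct pt_chunk *chunk;  /* shared backing allocation */
        unsigned index;          /* slot inside the chunk */
    };

    static int pt_alloc(struct pt_chunk *chunk, struct pt_ref *ref)
    {
        if (chunk->used >= PTS_PER_CHUNK)
            return -1;           /* caller would allocate a new chunk */
        ref->chunk = chunk;
        ref->index = chunk->used++;
        return 0;
    }

    static uint64_t pt_gpu_addr(const struct pt_ref *ref)
    {
        return ref->chunk->gpu_addr + (uint64_t)ref->index * PT_SIZE;
    }

    int main(void)
    {
        struct pt_chunk chunk = {
            .cpu_addr = malloc((size_t)PT_SIZE * PTS_PER_CHUNK),
            .gpu_addr = 0x100000, /* pretend GPU VA of the chunk */
        };
        struct pt_ref refs[4];
        unsigned i;

        for (i = 0; i < 4; ++i) {
            if (pt_alloc(&chunk, &refs[i]))
                return 1;
            printf("PT %u at GPU address 0x%llx\n", i,
                   (unsigned long long)pt_gpu_addr(&refs[i]));
        }
        /* Moving/validating the chunk now touches one object for all
         * of its tables instead of one object per table.
         */
        free(chunk.cpu_addr);
        return 0;
    }

In amdgpu terms the chunk would be the single BO allocated in
amdgpu_vm_alloc_pts, and each page-table entry would keep an offset
into it rather than its own BO.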