[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When lots of virtual address space is used, there can be thousands
of page table BOs. amdgpu_vm_update_level iterates over all of them
recursively. In many cases only a few or none at all need to be
updated. Minimize unnecessary code execution and memory usage in
those cases.

This speeds up memory mapping in a synthetic KFD memory mapping
benchmark by roughly a factor of two.

Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ff5de3a..23b899b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 {
 	struct amdgpu_bo *shadow;
 	struct amdgpu_ring *ring = NULL;
-	uint64_t pd_addr, shadow_addr = 0;
+	uint64_t pd_addr = 0, shadow_addr = 0;
 	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
 	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
 	unsigned count = 0, pt_idx, ndw = 0;
@@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	WARN_ON(vm->use_cpu_for_update && shadow);
 	if (vm->use_cpu_for_update && !shadow) {
-		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
-		if (r)
-			return r;
-		r = amdgpu_vm_bo_wait(adev, parent->bo);
-		if (unlikely(r)) {
-			amdgpu_bo_kunmap(parent->bo);
-			return r;
-		}
+		/* Defer kmapping until it's actually needed. Some
+		 * PDBs may need no update at all
+		 */
 		params.func = amdgpu_vm_cpu_set_ptes;
+		params.ib = (void *)(long)-1;
 	} else {
-		if (shadow) {
-			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-		ring = container_of(vm->entity.sched, struct amdgpu_ring,
-				    sched);
-
-		/* padding, etc. */
-		ndw = 64;
-
-		/* assume the worst case */
-		ndw += parent->last_entry_used * 6;
-
-		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
-
-		if (shadow) {
-			shadow_addr = amdgpu_bo_gpu_offset(shadow);
-			ndw *= 2;
-		} else {
-			shadow_addr = 0;
-		}
-
-		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-		if (r)
-			return r;
-
-		params.ib = &job->ibs[0];
+		/* Defer IB allocation until it's actually
+		 * needed. Some PDBs may need no update at all
+		 */
+		params.ib = NULL;
 		params.func = amdgpu_vm_do_set_ptes;
 	}
 
-
 	/* walk over the address space and update the directory */
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
@@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		if (bo == NULL)
 			continue;
 
-		if (bo->shadow) {
-			struct amdgpu_bo *pt_shadow = bo->shadow;
-
-			r = amdgpu_ttm_bind(&pt_shadow->tbo,
-					    &pt_shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-
-		pt = amdgpu_bo_gpu_offset(bo);
-		pt = amdgpu_gart_get_vm_pde(adev, pt);
+		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
 		if (parent->entries[pt_idx].addr == pt)
 			continue;
 
 		parent->entries[pt_idx].addr = pt;
 
+		if (!params.ib) {
+			if (shadow) {
+				r = amdgpu_ttm_bind(&shadow->tbo,
+						    &shadow->tbo.mem);
+				if (r)
+					return r;
+			}
+
+			ring = container_of(vm->entity.sched,
+					    struct amdgpu_ring, sched);
+
+			/* padding, etc. */
+			ndw = 64;
+
+			/* assume the worst case */
+			ndw += (parent->last_entry_used - pt_idx) * 6;
+
+			pd_addr = parent->bo->tbo.offset;
+
+			if (shadow) {
+				shadow_addr = shadow->tbo.offset;
+				ndw *= 2;
+			} else {
+				shadow_addr = 0;
+			}
+			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+			if (r)
+				return r;
+
+			params.ib = &job->ibs[0];
+		} else if (!pd_addr) {
+			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
+			if (r)
+				return r;
+			r = amdgpu_vm_bo_wait(adev, parent->bo);
+			if (unlikely(r)) {
+				amdgpu_bo_kunmap(parent->bo);
+				return r;
+			}
+		}
+
 		pde = pd_addr + pt_idx * 8;
 		if (((last_pde + 8 * count) != pde) ||
 		    ((last_pt + incr * count) != pt) ||
@@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	if (params.func == amdgpu_vm_cpu_set_ptes)
 		amdgpu_bo_kunmap(parent->bo);
-	else if (params.ib->length_dw == 0) {
+	else if (params.ib && params.ib->length_dw == 0) {
 		amdgpu_job_free(job);
-	} else {
+	} else if (params.ib) {
 		amdgpu_ring_pad_ib(ring, params.ib);
 		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
 				 AMDGPU_FENCE_OWNER_VM);
@@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 		amdgpu_bo_fence(parent->bo, fence, true);
 		dma_fence_put(vm->last_dir_update);
-		vm->last_dir_update = dma_fence_get(fence);
-		dma_fence_put(fence);
+		vm->last_dir_update = fence;
 	}
 	/*
 	 * Recurse into the subdirectories. This recursion is harmless because
@@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
 
-		if (!entry->bo)
+		if (!entry->bo || !entry->entries)
 			continue;
 
 		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
-- 
1.9.1



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux