Re: [PATCH 2/6] drm/amdgpu: Separate eviction from VM status.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 31.10.23 um 15:39 schrieb Tatsuyuki Ishi:

On Oct 31, 2023, at 22:55, Christian König <christian.koenig@xxxxxxx> wrote:

Am 31.10.23 um 14:40 schrieb Tatsuyuki Ishi:
In short, eviction never really belonged to the vm_status state machine.
I strongly disagree to that.

Even when evicted, the BO could belong to either the moved or done state.
The "evicted" state needed to handle both cases, causing greater confusion.

Additionally, there were inconsistencies in the definition of an evicted
BO. Some places are based on the `evict` parameter passed from the TTM move
callback, while the others were updated based on whether the BO got its
optimal placement. The second is more accurate for our use case. With this
refactor, the evicted state is solely determined by the second rule.
That strongly sounds like you don't understand what the evicted state it good for.

The evicted state is for page directories, page tables and per VM BOs which needs to move around before doing the next CS.

Please further explain what you try to do here.
This is mainly an attempt to address inconsistency in the definition of “eviction”. The TTM move callback sets eviction when eviction happens through ttm_bo_evict. This is however not the only way a BO might end up outside its preferred domains.

amdgpu_vm_bo_update later updates the eviction state based on whether the BO is in its preferred domains. In my understanding this includes all cases where the BO is evicted through ttm_bo_evict. Therefore we should apply this definition right from the move callback, not only after amdgpu_vm_bo_update has been called at least once.

No, that is something completely separated. The evicted state just means that we need to re-validate the BO.

One cause of this is that TTM moved the BO.

But a different cause is that TTM moved the BO, we tried to validated it but fallen back to GTT for now and called amdgpu_vm_bo_update(). amdgpu_vm_bo_update() then moves the BO into the evicted state again so that we try to move it into VRAM on the next command submission.

This is purely an optimization done to create enough pressure so that TTM can do it's work.

Christian.


Tatsuyuki.

Regards,
Christian.

Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@xxxxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    | 67 +++++++++--------------
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c |  1 +
  3 files changed, 29 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 7b9762f1cddd..dd6f72e2a1d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -174,19 +174,23 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
   * State for PDs/PTs and per VM BOs which are not at the location they should
   * be.
   */
-static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
+static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted)
  {
  	struct amdgpu_vm *vm = vm_bo->vm;
  	struct amdgpu_bo *bo = vm_bo->bo;
  -	vm_bo->moved = true;
  	spin_lock(&vm_bo->vm->status_lock);
-	if (bo->tbo.type == ttm_bo_type_kernel)
-		list_move(&vm_bo->vm_status, &vm->evicted);
-	else
-		list_move_tail(&vm_bo->vm_status, &vm->evicted);
+	if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) {
+		if (bo->tbo.type == ttm_bo_type_kernel)
+			list_move(&vm_bo->eviction_status, &vm->evicted);
+		else
+			list_move_tail(&vm_bo->eviction_status, &vm->evicted);
+	} else {
+		list_del_init(&vm_bo->eviction_status);
+	}
  	spin_unlock(&vm_bo->vm->status_lock);
  }
+
  /**
   * amdgpu_vm_bo_moved - vm_bo is moved
   *
@@ -310,6 +314,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
  	base->bo = bo;
  	base->next = NULL;
  	INIT_LIST_HEAD(&base->vm_status);
+	INIT_LIST_HEAD(&base->eviction_status);
    	if (!bo)
  		return;
@@ -336,7 +341,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
  	 * is currently evicted. add the bo to the evicted list to make sure it
  	 * is validated on next vm use to avoid fault.
  	 * */
-	amdgpu_vm_bo_evicted(base);
+	amdgpu_vm_bo_set_evicted(base, true);
  }
    /**
@@ -460,7 +465,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  	while (!list_empty(&vm->evicted)) {
  		bo_base = list_first_entry(&vm->evicted,
  					   struct amdgpu_vm_bo_base,
-					   vm_status);
+					   eviction_status);
  		spin_unlock(&vm->status_lock);
    		bo = bo_base->bo;
@@ -1034,7 +1039,7 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
  	list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
  		amdgpu_vm_bo_get_memory(bo_va, stats);
  -	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
+	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.eviction_status)
  		amdgpu_vm_bo_get_memory(bo_va, stats);
    	list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
@@ -1153,21 +1158,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
  			return r;
  	}
  -	/* If the BO is not in its preferred location add it back to
-	 * the evicted list so that it gets validated again on the
-	 * next command submission.
-	 */
-	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) {
-		uint32_t mem_type = bo->tbo.resource->mem_type;
-
-		if (!(bo->preferred_domains &
-		      amdgpu_mem_type_to_domain(mem_type)))
-			amdgpu_vm_bo_evicted(&bo_va->base);
-		else
-			amdgpu_vm_bo_idle(&bo_va->base);
-	} else {
+	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
+		amdgpu_vm_bo_idle(&bo_va->base);
+	else
  		amdgpu_vm_bo_done(&bo_va->base);
-	}
    	list_splice_init(&bo_va->invalids, &bo_va->valids);
  	bo_va->cleared = clear;
@@ -1883,6 +1877,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
    	spin_lock(&vm->status_lock);
  	list_del(&bo_va->base.vm_status);
+	list_del(&bo_va->base.eviction_status);
  	spin_unlock(&vm->status_lock);
    	list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
@@ -1959,13 +1954,18 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
  	if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
  		bo = bo->parent;
  +	/* If the BO is not in its preferred location add it back to
+	 * the evicted list so that it gets validated again on the
+	 * next command submission.
+	 */
+	uint32_t mem_type = bo->tbo.resource->mem_type;
+	bool suboptimal = !(bo->preferred_domains &
+			 amdgpu_mem_type_to_domain(mem_type));
+
  	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
  		struct amdgpu_vm *vm = bo_base->vm;
  -		if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) {
-			amdgpu_vm_bo_evicted(bo_base);
-			continue;
-		}
+		amdgpu_vm_bo_set_evicted(bo_base, suboptimal);
    		if (bo_base->moved)
  			continue;
@@ -2648,13 +2648,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
  {
  	struct amdgpu_bo_va *bo_va, *tmp;
  	u64 total_idle = 0;
-	u64 total_evicted = 0;
  	u64 total_relocated = 0;
  	u64 total_moved = 0;
  	u64 total_invalidated = 0;
  	u64 total_done = 0;
  	unsigned int total_idle_objs = 0;
-	unsigned int total_evicted_objs = 0;
  	unsigned int total_relocated_objs = 0;
  	unsigned int total_moved_objs = 0;
  	unsigned int total_invalidated_objs = 0;
@@ -2671,15 +2669,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
  	total_idle_objs = id;
  	id = 0;
  -	seq_puts(m, "\tEvicted BOs:\n");
-	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) {
-		if (!bo_va->base.bo)
-			continue;
-		total_evicted += amdgpu_bo_print_info(id++, bo_va->base.bo, m);
-	}
-	total_evicted_objs = id;
-	id = 0;
-
  	seq_puts(m, "\tRelocated BOs:\n");
  	list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) {
  		if (!bo_va->base.bo)
@@ -2718,8 +2707,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
    	seq_printf(m, "\tTotal idle size:        %12lld\tobjs:\t%d\n", total_idle,
  		   total_idle_objs);
-	seq_printf(m, "\tTotal evicted size:     %12lld\tobjs:\t%d\n", total_evicted,
-		   total_evicted_objs);
  	seq_printf(m, "\tTotal relocated size:   %12lld\tobjs:\t%d\n", total_relocated,
  		   total_relocated_objs);
  	seq_printf(m, "\tTotal moved size:       %12lld\tobjs:\t%d\n", total_moved,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 204ab13184ed..d9ab97eabda9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -156,6 +156,7 @@ struct amdgpu_vm_bo_base {
    	/* protected by spinlock */
  	struct list_head		vm_status;
+	struct list_head		eviction_status;
    	/* protected by the BO being reserved */
  	bool				moved;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 96d601e209b8..f78f4040f466 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -652,6 +652,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
    	spin_lock(&entry->vm->status_lock);
  	list_del(&entry->vm_status);
+	list_del(&entry->eviction_status);
  	spin_unlock(&entry->vm->status_lock);
  	amdgpu_bo_unref(&entry->bo);
  }




[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux