Re: [PATCH v2 2/2] drm/amdgpu: track bo memory stats at runtime

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 24.06.24 um 16:08 schrieb Yunxiang Li:
Before, every time fdinfo is queried we try to lock all the BOs in the
VM and calculate memory usage from scratch. This works okay if the
fdinfo is rarely read and the VMs don't have a ton of BOs. If either of
these conditions is not true, we get a massive performance hit.

In this new revision, we track the BOs as they change state. This way
when the fdinfo is queried we only need to take the status lock and copy
out the usage stats with minimal impact to the runtime performance.

Signed-off-by: Yunxiang Li <Yunxiang.Li@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |   2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  15 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |   3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |   4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  21 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 183 +++++++++++++++-----
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  32 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |   1 +
  8 files changed, 185 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index ec888fc6ead8..397cb0f83811 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1166,7 +1166,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
  			if (!bo)
  				continue;
- amdgpu_vm_bo_invalidate(adev, bo, false);
+			amdgpu_vm_bo_invalidate(bo, NULL, false);
  		}
  	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 8e81a83d37d8..87fc3f37a1eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -190,6 +190,13 @@ static void amdgpu_dma_buf_unmap(struct dma_buf_attachment *attach,
  	}
  }
+static void amdgpu_dma_buf_release(struct dma_buf *buf)
+{
+	struct amdgpu_bo *bo = gem_to_amdgpu_bo(buf->priv);
+	amdgpu_vm_bo_handle_shared(bo, true);
+	drm_gem_dmabuf_release(buf);
+}
+
  /**
   * amdgpu_dma_buf_begin_cpu_access - &dma_buf_ops.begin_cpu_access implementation
   * @dma_buf: Shared DMA buffer
@@ -237,7 +244,7 @@ const struct dma_buf_ops amdgpu_dmabuf_ops = {
  	.unpin = amdgpu_dma_buf_unpin,
  	.map_dma_buf = amdgpu_dma_buf_map,
  	.unmap_dma_buf = amdgpu_dma_buf_unmap,
-	.release = drm_gem_dmabuf_release,
+	.release = amdgpu_dma_buf_release,
  	.begin_cpu_access = amdgpu_dma_buf_begin_cpu_access,
  	.mmap = drm_gem_dmabuf_mmap,
  	.vmap = drm_gem_dmabuf_vmap,
@@ -265,8 +272,10 @@ struct dma_buf *amdgpu_gem_prime_export(struct drm_gem_object *gobj,
  		return ERR_PTR(-EPERM);
buf = drm_gem_prime_export(gobj, flags);
-	if (!IS_ERR(buf))
+	if (!IS_ERR(buf)) {
  		buf->ops = &amdgpu_dmabuf_ops;
+		amdgpu_vm_bo_handle_shared(bo, false);
+	}
return buf;
  }
@@ -345,7 +354,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
  	/* FIXME: This should be after the "if", but needs a fix to make sure
  	 * DMABuf imports are initialized in the right VM list.
  	 */
-	amdgpu_vm_bo_invalidate(adev, bo, false);
+	amdgpu_vm_bo_invalidate(bo, NULL, false);
  	if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
  		return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 1f22b4208729..1cd28fc1dcf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -843,7 +843,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
  int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
  			struct drm_file *filp)
  {
-	struct amdgpu_device *adev = drm_to_adev(dev);
  	struct drm_amdgpu_gem_op *args = data;
  	struct drm_gem_object *gobj;
  	struct amdgpu_vm_bo_base *base;
@@ -903,7 +902,7 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
  			robj->allowed_domains |= AMDGPU_GEM_DOMAIN_GTT;
if (robj->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
-			amdgpu_vm_bo_invalidate(adev, robj, true);
+			amdgpu_vm_bo_invalidate(robj, NULL, true);
amdgpu_bo_unreserve(robj);
  		break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index bcf25c7e85e0..469bb600bcaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -37,6 +37,7 @@
  #include <drm/amdgpu_drm.h>
  #include <drm/drm_cache.h>
  #include "amdgpu.h"
+#include "amdgpu_vm.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
@@ -1258,7 +1259,6 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
  			   bool evict,
  			   struct ttm_resource *new_mem)
  {
-	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
  	struct ttm_resource *old_mem = bo->resource;
  	struct amdgpu_bo *abo;
@@ -1266,7 +1266,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
  		return;
abo = ttm_to_amdgpu_bo(bo);
-	amdgpu_vm_bo_invalidate(adev, abo, evict);
+	amdgpu_vm_bo_invalidate(abo, new_mem, evict);
amdgpu_bo_kunmap(abo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 218919fc13ec..c65463f66cb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -135,27 +135,6 @@ struct amdgpu_bo_vm {
  	struct amdgpu_vm_bo_base        entries[];
  };
-struct amdgpu_mem_stats {
-	/* current VRAM usage */
-	uint64_t vram;
-	/* current shared VRAM usage */
-	uint64_t vram_shared;
-	/* current GTT usage */
-	uint64_t gtt;
-	/* current shared GTT usage */
-	uint64_t gtt_shared;
-	/* current system memory usage */
-	uint64_t cpu;
-	/* current shared system memory usage */
-	uint64_t cpu_shared;
-	/* sum of evicted buffers */
-	uint64_t evicted_vram;
-	/* how much userspace asked for */
-	uint64_t requested_vram;
-	/* how much userspace asked for */
-	uint64_t requested_gtt;
-};
-
  static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
  {
  	return container_of(tbo, struct amdgpu_bo, tbo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..8dd239912bb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -36,6 +36,7 @@
  #include <drm/ttm/ttm_tt.h>
  #include <drm/drm_exec.h>
  #include "amdgpu.h"
+#include "amdgpu_vm.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
  #include "amdgpu_gmc.h"
@@ -310,6 +311,111 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
  	spin_unlock(&vm->status_lock);
  }
+static void amdgpu_vm_stats_update_shared(struct amdgpu_vm_bo_base *base,
+				      bool unshare)
+{
+	struct amdgpu_vm *vm = base->vm;
+	struct amdgpu_bo *bo = base->bo;
+	struct amdgpu_mem_stats *stats;
+	uint64_t size;
+
+	if (!vm || !bo)
+		return;
+
+	spin_lock(&vm->status_lock);
+	stats = &vm->stats;
+	size = amdgpu_bo_size(bo);
+	switch (bo->tbo.resource->mem_type) {
+	case TTM_PL_VRAM:
+		stats->vram_shared += unshare ? -size : size;
+		break;
+	case TTM_PL_TT:
+		stats->gtt_shared += unshare ? -size : size;
+		break;
+	case TTM_PL_SYSTEM:
+	default:
+		stats->cpu_shared += unshare ? -size : size;
+		break;
+	}
+	spin_unlock(&vm->status_lock);
+}
+
+void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
+			    struct ttm_resource *new_mem,
+			    struct ttm_resource *old_mem)
+{
+	struct amdgpu_vm *vm = base->vm;
+	struct amdgpu_bo *bo = base->bo;
+	struct amdgpu_mem_stats *stats;
+	uint64_t size;
+	bool shared;
+
+	if (!vm || !bo || (!new_mem && !old_mem))
+		return;
+	spin_lock(&vm->status_lock);
+
+	stats = &vm->stats;
+	size = amdgpu_bo_size(bo);
+	shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
+
+	if (old_mem) {
+		switch (old_mem->mem_type) {
+		case TTM_PL_VRAM:
+			stats->vram -= size;
+			if (shared)
+				stats->vram_shared -= size;
+			break;
+		case TTM_PL_TT:
+			stats->gtt -= size;
+			if (shared)
+				stats->gtt_shared -= size;
+			break;
+		case TTM_PL_SYSTEM:
+		default:
+			stats->cpu -= size;
+			if (shared)
+				stats->cpu_shared -= size;
+			break;
+		}
+	}
+	if (new_mem) {
+		switch (new_mem->mem_type) {
+		case TTM_PL_VRAM:
+			stats->vram += size;
+			if (shared)
+				stats->vram_shared += size;
+			break;
+		case TTM_PL_TT:
+			stats->gtt += size;
+			if (shared)
+				stats->gtt_shared += size;
+			break;
+		case TTM_PL_SYSTEM:
+		default:
+			stats->cpu += size;
+			if (shared)
+				stats->cpu_shared += size;
+			break;
+		}
+	}
+	if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) {
+		if (!old_mem)
+			stats->requested_vram += size;
+		else if (old_mem->mem_type != TTM_PL_VRAM)
+			stats->evicted_vram -= size;
+		if (!new_mem)
+			stats->requested_vram -= size;
+		else if (new_mem->mem_type != TTM_PL_VRAM)
+			stats->evicted_vram += size;
+	} else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) {
+		if (!old_mem)
+			stats->requested_gtt += size;
+		if (!new_mem)
+			stats->requested_gtt -= size;
+	}
+	spin_unlock(&vm->status_lock);
+}
+
  /**
   * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
   *
@@ -332,6 +438,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
  		return;
  	base->next = bo->vm_bo;
  	bo->vm_bo = base;
+	amdgpu_vm_stats_update(base, bo->tbo.resource, NULL);
if (!amdgpu_vm_is_bo_always_valid(vm, bo))
  		return;
@@ -1088,51 +1195,11 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  	return r;
  }
-static void amdgpu_vm_bo_get_memory(struct amdgpu_bo_va *bo_va,
-				    struct amdgpu_mem_stats *stats)
-{
-	struct amdgpu_vm *vm = bo_va->base.vm;
-	struct amdgpu_bo *bo = bo_va->base.bo;
-
-	if (!bo)
-		return;
-
-	/*
-	 * For now ignore BOs which are currently locked and potentially
-	 * changing their location.
-	 */
-	if (!amdgpu_vm_is_bo_always_valid(vm, bo) &&
-	    !dma_resv_trylock(bo->tbo.base.resv))
-		return;
-
-	amdgpu_bo_get_memory(bo, stats);
-	if (!amdgpu_vm_is_bo_always_valid(vm, bo))
-		dma_resv_unlock(bo->tbo.base.resv);
-}
-
  void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
  			  struct amdgpu_mem_stats *stats)
  {
-	struct amdgpu_bo_va *bo_va, *tmp;
-
  	spin_lock(&vm->status_lock);
-	list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
-
-	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
-
-	list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
-
-	list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
-
-	list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
-
-	list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status)
-		amdgpu_vm_bo_get_memory(bo_va, stats);
+	*stats = vm->stats;
  	spin_unlock(&vm->status_lock);
  }
@@ -2043,6 +2110,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
  			if (*base != &bo_va->base)
  				continue;
+ amdgpu_vm_stats_update(*base, NULL, bo->tbo.resource);
  			*base = bo_va->base.next;
  			break;
  		}
@@ -2108,17 +2176,37 @@ bool amdgpu_vm_evictable(struct amdgpu_bo *bo)
  	return true;
  }
+/**
+ * amdgpu_vm_bo_handle_shared - called when bo gets shared/unshared
+ *
+ * @bo: amdgpu buffer object
+ * @unshare: bo got unshared
+ *
+ * For now the only thing this does is to update the per VM stats
+ */
+void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare)
+{
+
+	struct amdgpu_vm_bo_base *bo_base;
+
+	if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
+		bo = bo->parent;
+
+	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next)
+		amdgpu_vm_stats_update_shared(bo_base, unshare);
+}
+
  /**
   * amdgpu_vm_bo_invalidate - mark the bo as invalid
   *
- * @adev: amdgpu_device pointer
   * @bo: amdgpu buffer object
+ * @new_mem: the new placement of the BO move, NULL if just invalidate
   * @evicted: is the BO evicted
   *
   * Mark @bo as invalid.
   */
-void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
-			     struct amdgpu_bo *bo, bool evicted)
+void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
+			     bool evicted)
  {
  	struct amdgpu_vm_bo_base *bo_base;
@@ -2129,6 +2217,9 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
  	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
  		struct amdgpu_vm *vm = bo_base->vm;
+ if (new_mem)
+			amdgpu_vm_stats_update(bo_base, new_mem, bo->tbo.resource);
+

Ok that looks extremely ugly. Please just add a separate function and call that from the TTM move function.

  		if (evicted && amdgpu_vm_is_bo_always_valid(vm, bo)) {
  			amdgpu_vm_bo_evicted(bo_base);
  			continue;
@@ -2640,6 +2731,10 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
  		}
  	}
+ if (memchr_inv(&vm->stats, 0, sizeof(vm->stats)))

Please either drop that or compare each memory stat variable separately. Byte by byte compares are really frowned upon.

Apart from that looks rather good to me,
Christian.

+		dev_err(adev->dev, "VM memory stats is non-zero when fini\n");
+	else
+		dev_dbg(adev->dev, "VM memory stats is zero when fini\n");
  }
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 046949c4b695..13e1ec4deb05 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -42,7 +42,27 @@ struct amdgpu_bo_va;
  struct amdgpu_job;
  struct amdgpu_bo_list_entry;
  struct amdgpu_bo_vm;
-struct amdgpu_mem_stats;
+
+struct amdgpu_mem_stats {
+	/* current VRAM usage */
+	uint64_t vram;
+	/* current shared VRAM usage */
+	uint64_t vram_shared;
+	/* current GTT usage */
+	uint64_t gtt;
+	/* current shared GTT usage */
+	uint64_t gtt_shared;
+	/* current system memory usage */
+	uint64_t cpu;
+	/* current shared system memory usage */
+	uint64_t cpu_shared;
+	/* sum of evicted buffers */
+	uint64_t evicted_vram;
+	/* how much userspace asked for */
+	uint64_t requested_vram;
+	/* how much userspace asked for */
+	uint64_t requested_gtt;
+};
/*
   * GPUVM handling
@@ -336,6 +356,8 @@ struct amdgpu_vm {
  	/* Lock to protect vm_bo add/del/move on all lists of vm */
  	spinlock_t		status_lock;
+ struct amdgpu_mem_stats stats;
+
  	/* Per-VM and PT BOs who needs a validation */
  	struct list_head	evicted;
@@ -514,8 +536,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
  			struct amdgpu_bo_va *bo_va,
  			bool clear);
  bool amdgpu_vm_evictable(struct amdgpu_bo *bo);
-void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
-			     struct amdgpu_bo *bo, bool evicted);
+void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
+			    struct ttm_resource *new_mem,
+			    struct ttm_resource *old_mem);
+void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare);
+void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
+			     bool evicted);
  uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr);
  struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
  				       struct amdgpu_bo *bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index e39d6e7643bf..841a575cb18d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -586,6 +586,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
  	if (!entry->bo)
  		return;
+ amdgpu_vm_stats_update(entry, NULL, entry->bo->tbo.resource);
  	entry->bo->vm_bo = NULL;
  	shadow = amdgpu_bo_shadowed(entry->bo);
  	if (shadow) {




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux