Signed-off-by: Friedrich Vock <friedrich.vock@xxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 191 +--------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h     |   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   7 -
 4 files changed, 3 insertions(+), 201 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cac0ca64367b3..3004adc6fa679 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1404,8 +1404,6 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev);
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev);
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev);
-void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
- u64 num_vis_bytes);
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev);
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
const u32 *registers,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index e9168677ef0a6..92a0cffc1adc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -638,196 +638,19 @@ static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
return 0;
}
-/* Convert microseconds to bytes. */
-static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
-{
- if (us <= 0 || !adev->mm_stats.log2_max_MBps)
- return 0;
-
- /* Since accum_us is incremented by a million per second, just
- * multiply it by the number of MB/s to get the number of bytes.
- */
- return us << adev->mm_stats.log2_max_MBps;
-}
-
-static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
-{
- if (!adev->mm_stats.log2_max_MBps)
- return 0;
-
- return bytes >> adev->mm_stats.log2_max_MBps;
-}
-
-/* Returns how many bytes TTM can move right now. If no bytes can be moved,
- * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
- * which means it can go over the threshold once. If that happens, the driver
- * will be in debt and no other buffer migrations can be done until that debt
- * is repaid.
- *
- * This approach allows moving a buffer of any size (it's important to allow
- * that).
- *
- * The currency is simply time in microseconds and it increases as the clock
- * ticks. The accumulated microseconds (us) are converted to bytes and
- * returned.
- */
-static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
- u64 *max_bytes,
- u64 *max_vis_bytes)
-{
- s64 time_us, increment_us;
- u64 free_vram, total_vram, used_vram;
- /* Allow a maximum of 200 accumulated ms. This is basically per-IB
- * throttling.
- *
- * It means that in order to get full max MBps, at least 5 IBs per
- * second must be submitted and not more than 200ms apart from each
- * other.
- */
- const s64 us_upper_bound = 200000;
-
- if (!adev->mm_stats.log2_max_MBps) {
- *max_bytes = 0;
- *max_vis_bytes = 0;
- return;
- }
-
- total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
- used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
- free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
-
- spin_lock(&adev->mm_stats.lock);
-
- /* Increase the amount of accumulated us. */
- time_us = ktime_to_us(ktime_get());
- increment_us = time_us - adev->mm_stats.last_update_us;
- adev->mm_stats.last_update_us = time_us;
- adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
- us_upper_bound);
-
- /* This prevents the short period of low performance when the VRAM
- * usage is low and the driver is in debt or doesn't have enough
- * accumulated us to fill VRAM quickly.
- *
- * The situation can occur in these cases:
- * - a lot of VRAM is freed by userspace
- * - the presence of a big buffer causes a lot of evictions
- * (solution: split buffers into smaller ones)
- *
- * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
- * accum_us to a positive number.
- */
- if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
- s64 min_us;
-
- /* Be more aggressive on dGPUs. Try to fill a portion of free
- * VRAM now.
- */
- if (!(adev->flags & AMD_IS_APU))
- min_us = bytes_to_us(adev, free_vram / 4);
- else
- min_us = 0; /* Reset accum_us on APUs. */
-
- adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
- }
-
- /* This is set to 0 if the driver is in debt to disallow (optional)
- * buffer moves.
- */
- *max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
-
- /* Do the same for visible VRAM if half of it is free */
- if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
- u64 total_vis_vram = adev->gmc.visible_vram_size;
- u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
-
- if (used_vis_vram < total_vis_vram) {
- u64 free_vis_vram = total_vis_vram - used_vis_vram;
-
- adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
- increment_us, us_upper_bound);
-
- if (free_vis_vram >= total_vis_vram / 2)
- adev->mm_stats.accum_us_vis =
- max(bytes_to_us(adev, free_vis_vram / 2),
- adev->mm_stats.accum_us_vis);
- }
-
- *max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
- } else {
- *max_vis_bytes = 0;
- }
-
- spin_unlock(&adev->mm_stats.lock);
-}
-
-/* Report how many bytes have really been moved for the last command
- * submission. This can result in a debt that can stop buffer migrations
- * temporarily.
- */
-void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
- u64 num_vis_bytes)
-{
- spin_lock(&adev->mm_stats.lock);
- adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
- adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
- spin_unlock(&adev->mm_stats.lock);
-}
-
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
{
- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- struct amdgpu_cs_parser *p = param;
struct ttm_operation_ctx ctx = {
.interruptible = true,
.no_wait_gpu = false,
.resv = bo->tbo.base.resv
};
- uint32_t domain;
- int r;
if (bo->tbo.pin_count)
return 0;
- /* Don't move this buffer if we have depleted our allowance
- * to move it. Don't move anything if the threshold is zero.
- */
- if (p->bytes_moved < p->bytes_moved_threshold &&
- (!bo->tbo.base.dma_buf ||
- list_empty(&bo->tbo.base.dma_buf->attachments))) {
- if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
- (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
- /* And don't move a CPU_ACCESS_REQUIRED BO to limited
- * visible VRAM if we've depleted our allowance to do
- * that.
- */
- if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
- domain = bo->preferred_domains;
- else
- domain = bo->allowed_domains;
- } else {
- domain = bo->preferred_domains;
- }
- } else {
- domain = bo->allowed_domains;
- }
-
-retry:
- amdgpu_bo_placement_from_domain(bo, domain);
- r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
-
- p->bytes_moved += ctx.bytes_moved;
- if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
- amdgpu_res_cpu_visible(adev, bo->tbo.resource))
- p->bytes_moved_vis += ctx.bytes_moved;
-
- if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
- domain = bo->allowed_domains;
- goto retry;
- }
-
- return r;
+ amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+ return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
@@ -947,13 +770,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->user_pages = NULL;
}
- amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
- &p->bytes_moved_vis_threshold);
- p->bytes_moved = 0;
- p->bytes_moved_vis = 0;
-
r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
- amdgpu_cs_bo_validate, p);
+ amdgpu_cs_bo_validate, NULL);
if (r) {
DRM_ERROR("amdgpu_vm_validate() failed.\n");
goto out_free_user_pages;
@@ -973,9 +791,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(p->uf_bo);
}
- amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
- p->bytes_moved_vis);
-
for (i = 0; i < p->gang_size; ++i)
amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
p->bo_list->gws_obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
index 39c33ad100cb7..e3d04ac4764be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -67,10 +67,6 @@ struct amdgpu_cs_parser {
struct amdgpu_bo_list *bo_list;
struct amdgpu_mn *mn;
struct dma_fence *fence;
- uint64_t bytes_moved_threshold;
- uint64_t bytes_moved_vis_threshold;
- uint64_t bytes_moved;
- uint64_t bytes_moved_vis;
/* user fence */
struct amdgpu_bo *uf_bo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 331b9ed8062c7..5834a95d680d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -621,13 +621,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (unlikely(r != 0))
return r;
- if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
- amdgpu_res_cpu_visible(adev, bo->tbo.resource))
- amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved,
- ctx.bytes_moved);
- else
- amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved, 0);
-
if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
struct dma_fence *fence;
--
2.44.0
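
The accounting removed above boils down to a small time-based budget: elapsed
microseconds accumulate (capped at 200 ms), are converted into a byte allowance
by shifting with the log2 of the measured peak MB/s, and are charged back once
buffers have actually been moved, which is what puts the driver "in debt". The
standalone C sketch below illustrates that bookkeeping in isolation. It is a
simplified illustration derived from the deleted helpers, not the kernel code
itself, and the names mm_budget, budget_bytes and budget_charge are invented
for the example.

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for the mm_stats fields the removed code used:
 * a time credit in microseconds and log2 of the peak throughput in MB/s.
 */
struct mm_budget {
        int64_t accum_us;
        unsigned int log2_max_MBps;
};

/* The credit is in microseconds, a million per second, so multiplying
 * by MB/s gives bytes: roughly 1 MB/s is one byte per microsecond. */
static uint64_t us_to_bytes(const struct mm_budget *b, int64_t us)
{
        if (us <= 0 || !b->log2_max_MBps)
                return 0;
        return (uint64_t)us << b->log2_max_MBps;
}

static int64_t bytes_to_us(const struct mm_budget *b, uint64_t bytes)
{
        if (!b->log2_max_MBps)
                return 0;
        return (int64_t)(bytes >> b->log2_max_MBps);
}

/* Credit the time since the last submission, capped at 200 ms, and
 * return how many bytes may be moved for this submission. */
static uint64_t budget_bytes(struct mm_budget *b, int64_t elapsed_us)
{
        const int64_t us_upper_bound = 200000;

        b->accum_us += elapsed_us;
        if (b->accum_us > us_upper_bound)
                b->accum_us = us_upper_bound;
        return us_to_bytes(b, b->accum_us);
}

/* Charge what was actually moved; a negative balance is the "debt"
 * that blocks further optional moves until elapsed time repays it. */
static void budget_charge(struct mm_budget *b, uint64_t moved_bytes)
{
        b->accum_us -= bytes_to_us(b, moved_bytes);
}

int main(void)
{
        struct mm_budget b = { .accum_us = 0, .log2_max_MBps = 10 }; /* ~1 GiB/s */
        uint64_t budget = budget_bytes(&b, 50000); /* 50 ms since the last IB */

        printf("budget: %llu bytes\n", (unsigned long long)budget);
        budget_charge(&b, budget / 2); /* half the allowance was spent */
        printf("remaining credit: %lld us\n", (long long)b.accum_us);
        return 0;
}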